gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56
  57 /* Loop Vectorization Pass.
  58
  59    This pass tries to vectorize loops.
  60
  61    For example, the vectorizer transforms the following simple loop:
  62
  63         short a[N]; short b[N]; short c[N]; int i;
  64
  65         for (i=0; i<N; i++){
  66           a[i] = b[i] + c[i];
  67         }
  68
  69    as if it was manually vectorized by rewriting the source code into:
  70
  71         typedef int __attribute__((mode(V8HI))) v8hi;
  72         short a[N];  short b[N]; short c[N];   int i;
  73         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  74         v8hi va, vb, vc;
  75
  76         for (i=0; i<N/8; i++){
  77           vb = pb[i];
  78           vc = pc[i];
  79           va = vb + vc;
  80           pa[i] = va;
  81         }
  82
  83         The main entry to this pass is vectorize_loops(), in which
  84    the vectorizer applies a set of analyses on a given set of loops,
  85    followed by the actual vectorization transformation for the loops that
  86    had successfully passed the analysis phase.
  87         Throughout this pass we make a distinction between two types of
  88    data: scalars (which are represented by SSA_NAMES), and memory references
  89    ("data-refs").  These two types of data require different handling both
  90    during analysis and transformation. The types of data-refs that the
  91    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  92    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  93    accesses are required to have a simple (consecutive) access pattern.
  94
  95    Analysis phase:
  96    ===============
  97         The driver for the analysis phase is vect_analyze_loop().
  98    It applies a set of analyses, some of which rely on the scalar evolution
  99    analyzer (scev) developed by Sebastian Pop.
 100
 101         During the analysis phase the vectorizer records some information
 102    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 103    loop, as well as general information about the loop as a whole, which is
 104    recorded in a "loop_vec_info" struct attached to each loop.
 105
 106    Transformation phase:
 107    =====================
 108         The loop transformation phase scans all the stmts in the loop, and
 109    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 110    the loop that needs to be vectorized.  It inserts the vector code sequence
 111    just before the scalar stmt S, and records a pointer to the vector code
 112    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 113    attached to S).  This pointer will be used for the vectorization of following
 114    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 115    otherwise, we rely on dead code elimination for removing it.
 116
 117         For example, say stmt S1 was vectorized into stmt VS1:
 118
 119    VS1: vb = px[i];
 120    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 121    S2:  a = b;
 122
 123    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 124    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 125    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 126    resulting sequence would be:
 127
 128    VS1: vb = px[i];
 129    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 130    VS2: va = vb;
 131    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 132
 133         Operands that are not SSA_NAMEs, are data-refs that appear in
 134    load/store operations (like 'x[i]' in S1), and are handled differently.
 135
 136    Target modeling:
 137    =================
 138         Currently the only target specific information that is used is the
 139    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 140    Targets that can support different sizes of vectors, for now will need
 141    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 142    flexibility will be added in the future.
 143
 144         Since we only vectorize operations which vector form can be
 145    expressed using existing tree codes, to verify that an operation is
 146    supported, the vectorizer checks the relevant optab at the relevant
 147    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 148    the value found is CODE_FOR_nothing, then there's no target support, and
 149    we can't vectorize the stmt.
 150
 151    For additional information on this project see:
 152    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 153 */
 154
 155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 156
 157 /* Function vect_determine_vectorization_factor
 158
 159    Determine the vectorization factor (VF).  VF is the number of data elements
 160    that are operated upon in parallel in a single iteration of the vectorized
 161    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 162    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 163    elements can fit in a single vector register.
 164
 165    We currently support vectorization of loops in which all types operated upon
 166    are of the same size.  Therefore this function currently sets VF according to
 167    the size of the types operated upon, and fails if there are multiple sizes
 168    in the loop.
 169
 170    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 171    original loop:
 172         for (i=0; i<N; i++){
 173           a[i] = b[i] + c[i];
 174         }
 175
 176    vectorized loop:
 177         for (i=0; i<N; i+=VF){
 178           a[i:VF] = b[i:VF] + c[i:VF];
 179         }
 180 */
 181
 182 static bool
 183 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 184 {
 185   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 186   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 187   unsigned nbbs = loop->num_nodes;
 188   poly_uint64 vectorization_factor = 1;
 189   tree scalar_type = NULL_TREE;
 190   gphi *phi;
 191   tree vectype;
 192   stmt_vec_info stmt_info;
 193   unsigned i;
 194   HOST_WIDE_INT dummy;
 195   gimple *stmt, *pattern_stmt = NULL;
 196   gimple_seq pattern_def_seq = NULL;
 197   gimple_stmt_iterator pattern_def_si = gsi_none ();
 198   bool analyze_pattern_stmt = false;
 199   bool bool_result;
 200   auto_vec<stmt_vec_info> mask_producers;
 201
 202   if (dump_enabled_p ())
 203     dump_printf_loc (MSG_NOTE, vect_location,
 204                      "=== vect_determine_vectorization_factor ===\n");
 205
 206   for (i = 0; i < nbbs; i++)
 207     {
 208       basic_block bb = bbs[i];
 209
 210       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 211            gsi_next (&si))
 212         {
 213           phi = si.phi ();
 214           stmt_info = vinfo_for_stmt (phi);
 215           if (dump_enabled_p ())
 216             {
 217               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 218               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 219             }
 220
 221           gcc_assert (stmt_info);
 222
 223           if (STMT_VINFO_RELEVANT_P (stmt_info)
 224               || STMT_VINFO_LIVE_P (stmt_info))
 225             {
 226               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 227               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 228
 229               if (dump_enabled_p ())
 230                 {
 231                   dump_printf_loc (MSG_NOTE, vect_location,
 232                                    "get vectype for scalar type:  ");
 233                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 234                   dump_printf (MSG_NOTE, "\n");
 235                 }
 236
 237               vectype = get_vectype_for_scalar_type (scalar_type);
 238               if (!vectype)
 239                 {
 240                   if (dump_enabled_p ())
 241                     {
 242                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 243                                        "not vectorized: unsupported "
 244                                        "data-type ");
 245                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 246                                          scalar_type);
 247                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 248                     }
 249                   return false;
 250                 }
 251               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 252
 253               if (dump_enabled_p ())
 254                 {
 255                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 256                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 257                   dump_printf (MSG_NOTE, "\n");
 258                 }
 259
 260               if (dump_enabled_p ())
 261                 dump_printf_loc (MSG_NOTE, vect_location,
 262                                  "nunits = " HOST_WIDE_INT_PRINT_DEC "\n",
 263                                  TYPE_VECTOR_SUBPARTS (vectype));
 264
 265               vect_update_max_nunits (&vectorization_factor, vectype);
 266             }
 267         }
 268
 269       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 270            !gsi_end_p (si) || analyze_pattern_stmt;)
 271         {
 272           tree vf_vectype;
 273
 274           if (analyze_pattern_stmt)
 275             stmt = pattern_stmt;
 276           else
 277             stmt = gsi_stmt (si);
 278
 279           stmt_info = vinfo_for_stmt (stmt);
 280
 281           if (dump_enabled_p ())
 282             {
 283               dump_printf_loc (MSG_NOTE, vect_location,
 284                                "==> examining statement: ");
 285               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 286             }
 287
 288           gcc_assert (stmt_info);
 289
 290           /* Skip stmts which do not need to be vectorized.  */
 291           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 292                && !STMT_VINFO_LIVE_P (stmt_info))
 293               || gimple_clobber_p (stmt))
 294             {
 295               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 296                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 297                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 298                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 299                 {
 300                   stmt = pattern_stmt;
 301                   stmt_info = vinfo_for_stmt (pattern_stmt);
 302                   if (dump_enabled_p ())
 303                     {
 304                       dump_printf_loc (MSG_NOTE, vect_location,
 305                                        "==> examining pattern statement: ");
 306                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 307                     }
 308                 }
 309               else
 310                 {
 311                   if (dump_enabled_p ())
 312                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 313                   gsi_next (&si);
 314                   continue;
 315                 }
 316             }
 317           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 318                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 319                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 320                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 321             analyze_pattern_stmt = true;
 322
 323           /* If a pattern statement has def stmts, analyze them too.  */
 324           if (is_pattern_stmt_p (stmt_info))
 325             {
 326               if (pattern_def_seq == NULL)
 327                 {
 328                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 329                   pattern_def_si = gsi_start (pattern_def_seq);
 330                 }
 331               else if (!gsi_end_p (pattern_def_si))
 332                 gsi_next (&pattern_def_si);
 333               if (pattern_def_seq != NULL)
 334                 {
 335                   gimple *pattern_def_stmt = NULL;
 336                   stmt_vec_info pattern_def_stmt_info = NULL;
 337
 338                   while (!gsi_end_p (pattern_def_si))
 339                     {
 340                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 341                       pattern_def_stmt_info
 342                         = vinfo_for_stmt (pattern_def_stmt);
 343                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 344                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 345                         break;
 346                       gsi_next (&pattern_def_si);
 347                     }
 348
 349                   if (!gsi_end_p (pattern_def_si))
 350                     {
 351                       if (dump_enabled_p ())
 352                         {
 353                           dump_printf_loc (MSG_NOTE, vect_location,
 354                                            "==> examining pattern def stmt: ");
 355                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 356                                             pattern_def_stmt, 0);
 357                         }
 358
 359                       stmt = pattern_def_stmt;
 360                       stmt_info = pattern_def_stmt_info;
 361                     }
 362                   else
 363                     {
 364                       pattern_def_si = gsi_none ();
 365                       analyze_pattern_stmt = false;
 366                     }
 367                 }
 368               else
 369                 analyze_pattern_stmt = false;
 370             }
 371
 372           if (gimple_get_lhs (stmt) == NULL_TREE
 373               /* MASK_STORE has no lhs, but is ok.  */
 374               && (!is_gimple_call (stmt)
 375                   || !gimple_call_internal_p (stmt)
 376                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 377             {
 378               if (is_gimple_call (stmt))
 379                 {
 380                   /* Ignore calls with no lhs.  These must be calls to
 381                      #pragma omp simd functions, and what vectorization factor
 382                      it really needs can't be determined until
 383                      vectorizable_simd_clone_call.  */
 384                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 385                     {
 386                       pattern_def_seq = NULL;
 387                       gsi_next (&si);
 388                     }
 389                   continue;
 390                 }
 391               if (dump_enabled_p ())
 392                 {
 393                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 394                                    "not vectorized: irregular stmt.");
 395                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 396                                     0);
 397                 }
 398               return false;
 399             }
 400
 401           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 402             {
 403               if (dump_enabled_p ())
 404                 {
 405                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 406                                    "not vectorized: vector stmt in loop:");
 407                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 408                 }
 409               return false;
 410             }
 411
 412           bool_result = false;
 413
 414           if (STMT_VINFO_VECTYPE (stmt_info))
 415             {
 416               /* The only case when a vectype had been already set is for stmts
 417                  that contain a dataref, or for "pattern-stmts" (stmts
 418                  generated by the vectorizer to represent/replace a certain
 419                  idiom).  */
 420               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 421                           || is_pattern_stmt_p (stmt_info)
 422                           || !gsi_end_p (pattern_def_si));
 423               vectype = STMT_VINFO_VECTYPE (stmt_info);
 424             }
 425           else
 426             {
 427               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 428               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 429                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 430               else
 431                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 432
 433               /* Bool ops don't participate in vectorization factor
 434                  computation.  For comparison use compared types to
 435                  compute a factor.  */
 436               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
 437                   && is_gimple_assign (stmt)
 438                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 439                 {
 440                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 441                       || STMT_VINFO_LIVE_P (stmt_info))
 442                     mask_producers.safe_push (stmt_info);
 443                   bool_result = true;
 444
 445                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 446                       == tcc_comparison
 447                       && !VECT_SCALAR_BOOLEAN_TYPE_P
 448                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 449                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 450                   else
 451                     {
 452                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 453                         {
 454                           pattern_def_seq = NULL;
 455                           gsi_next (&si);
 456                         }
 457                       continue;
 458                     }
 459                 }
 460
 461               if (dump_enabled_p ())
 462                 {
 463                   dump_printf_loc (MSG_NOTE, vect_location,
 464                                    "get vectype for scalar type:  ");
 465                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 466                   dump_printf (MSG_NOTE, "\n");
 467                 }
 468               vectype = get_vectype_for_scalar_type (scalar_type);
 469               if (!vectype)
 470                 {
 471                   if (dump_enabled_p ())
 472                     {
 473                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 474                                        "not vectorized: unsupported "
 475                                        "data-type ");
 476                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 477                                          scalar_type);
 478                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 479                     }
 480                   return false;
 481                 }
 482
 483               if (!bool_result)
 484                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 485
 486               if (dump_enabled_p ())
 487                 {
 488                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 489                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 490                   dump_printf (MSG_NOTE, "\n");
 491                 }
 492             }
 493
 494           /* Don't try to compute VF out scalar types if we stmt
 495              produces boolean vector.  Use result vectype instead.  */
 496           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 497             vf_vectype = vectype;
 498           else
 499             {
 500               /* The vectorization factor is according to the smallest
 501                  scalar type (or the largest vector size, but we only
 502                  support one vector size per loop).  */
 503               if (!bool_result)
 504                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 505                                                              &dummy);
 506               if (dump_enabled_p ())
 507                 {
 508                   dump_printf_loc (MSG_NOTE, vect_location,
 509                                    "get vectype for scalar type:  ");
 510                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 511                   dump_printf (MSG_NOTE, "\n");
 512                 }
 513               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 514             }
 515           if (!vf_vectype)
 516             {
 517               if (dump_enabled_p ())
 518                 {
 519                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 520                                    "not vectorized: unsupported data-type ");
 521                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 522                                      scalar_type);
 523                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 524                 }
 525               return false;
 526             }
 527
 528           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 529                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 530             {
 531               if (dump_enabled_p ())
 532                 {
 533                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 534                                    "not vectorized: different sized vector "
 535                                    "types in statement, ");
 536                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 537                                      vectype);
 538                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 539                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 540                                      vf_vectype);
 541                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 542                 }
 543               return false;
 544             }
 545
 546           if (dump_enabled_p ())
 547             {
 548               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 549               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 550               dump_printf (MSG_NOTE, "\n");
 551             }
 552
 553           if (dump_enabled_p ())
 554             dump_printf_loc (MSG_NOTE, vect_location,
 555                              "nunits = " HOST_WIDE_INT_PRINT_DEC "\n",
 556                              TYPE_VECTOR_SUBPARTS (vf_vectype));
 557
 558           vect_update_max_nunits (&vectorization_factor, vf_vectype);
 559
 560           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 561             {
 562               pattern_def_seq = NULL;
 563               gsi_next (&si);
 564             }
 565         }
 566     }
 567
 568   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 569   if (dump_enabled_p ())
 570     {
 571       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 572       dump_dec (MSG_NOTE, vectorization_factor);
 573       dump_printf (MSG_NOTE, "\n");
 574     }
 575
 576   if (known_le (vectorization_factor, 1U))
 577     {
 578       if (dump_enabled_p ())
 579         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 580                          "not vectorized: unsupported data-type\n");
 581       return false;
 582     }
 583   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 584
 585   for (i = 0; i < mask_producers.length (); i++)
 586     {
 587       tree mask_type = NULL;
 588
 589       stmt = STMT_VINFO_STMT (mask_producers[i]);
 590
 591       if (is_gimple_assign (stmt)
 592           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 593           && !VECT_SCALAR_BOOLEAN_TYPE_P
 594                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 595         {
 596           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 597           mask_type = get_mask_type_for_scalar_type (scalar_type);
 598
 599           if (!mask_type)
 600             {
 601               if (dump_enabled_p ())
 602                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 603                                  "not vectorized: unsupported mask\n");
 604               return false;
 605             }
 606         }
 607       else
 608         {
 609           tree rhs;
 610           ssa_op_iter iter;
 611           gimple *def_stmt;
 612           enum vect_def_type dt;
 613
 614           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 615             {
 616               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 617                                        &def_stmt, &dt, &vectype))
 618                 {
 619                   if (dump_enabled_p ())
 620                     {
 621                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 622                                        "not vectorized: can't compute mask type "
 623                                        "for statement, ");
 624                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 625                                         0);
 626                     }
 627                   return false;
 628                 }
 629
 630               /* No vectype probably means external definition.
 631                  Allow it in case there is another operand which
 632                  allows to determine mask type.  */
 633               if (!vectype)
 634                 continue;
 635
 636               if (!mask_type)
 637                 mask_type = vectype;
 638               else if (TYPE_VECTOR_SUBPARTS (mask_type)
 639                        != TYPE_VECTOR_SUBPARTS (vectype))
 640                 {
 641                   if (dump_enabled_p ())
 642                     {
 643                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 644                                        "not vectorized: different sized masks "
 645                                        "types in statement, ");
 646                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 647                                          mask_type);
 648                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 649                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 650                                          vectype);
 651                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 652                     }
 653                   return false;
 654                 }
 655               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 656                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 657                 {
 658                   if (dump_enabled_p ())
 659                     {
 660                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 661                                        "not vectorized: mixed mask and "
 662                                        "nonmask vector types in statement, ");
 663                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 664                                          mask_type);
 665                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 666                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 667                                          vectype);
 668                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 669                     }
 670                   return false;
 671                 }
 672             }
 673
 674           /* We may compare boolean value loaded as vector of integers.
 675              Fix mask_type in such case.  */
 676           if (mask_type
 677               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 678               && gimple_code (stmt) == GIMPLE_ASSIGN
 679               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 680             mask_type = build_same_sized_truth_vector_type (mask_type);
 681         }
 682
 683       /* No mask_type should mean loop invariant predicate.
 684          This is probably a subject for optimization in
 685          if-conversion.  */
 686       if (!mask_type)
 687         {
 688           if (dump_enabled_p ())
 689             {
 690               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 691                                "not vectorized: can't compute mask type "
 692                                "for statement, ");
 693               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 694                                 0);
 695             }
 696           return false;
 697         }
 698
 699       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 700     }
 701
 702   return true;
 703 }
 704
 705
 706 /* Function vect_is_simple_iv_evolution.
 707
 708    FORNOW: A simple evolution of an induction variables in the loop is
 709    considered a polynomial evolution.  */
 710
 711 static bool
 712 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 713                              tree * step)
 714 {
 715   tree init_expr;
 716   tree step_expr;
 717   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 718   basic_block bb;
 719
 720   /* When there is no evolution in this loop, the evolution function
 721      is not "simple".  */
 722   if (evolution_part == NULL_TREE)
 723     return false;
 724
 725   /* When the evolution is a polynomial of degree >= 2
 726      the evolution function is not "simple".  */
 727   if (tree_is_chrec (evolution_part))
 728     return false;
 729
 730   step_expr = evolution_part;
 731   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 732
 733   if (dump_enabled_p ())
 734     {
 735       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 736       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 737       dump_printf (MSG_NOTE, ",  init: ");
 738       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 739       dump_printf (MSG_NOTE, "\n");
 740     }
 741
 742   *init = init_expr;
 743   *step = step_expr;
 744
 745   if (TREE_CODE (step_expr) != INTEGER_CST
 746       && (TREE_CODE (step_expr) != SSA_NAME
 747           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 748               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 749           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 750               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 751                   || !flag_associative_math)))
 752       && (TREE_CODE (step_expr) != REAL_CST
 753           || !flag_associative_math))
 754     {
 755       if (dump_enabled_p ())
 756         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 757                          "step unknown.\n");
 758       return false;
 759     }
 760
 761   return true;
 762 }
 763
 764 /* Function vect_analyze_scalar_cycles_1.
 765
 766    Examine the cross iteration def-use cycles of scalar variables
 767    in LOOP.  LOOP_VINFO represents the loop that is now being
 768    considered for vectorization (can be LOOP, or an outer-loop
 769    enclosing LOOP).  */
 770
 771 static void
 772 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 773 {
 774   basic_block bb = loop->header;
 775   tree init, step;
 776   auto_vec<gimple *, 64> worklist;
 777   gphi_iterator gsi;
 778   bool double_reduc;
 779
 780   if (dump_enabled_p ())
 781     dump_printf_loc (MSG_NOTE, vect_location,
 782                      "=== vect_analyze_scalar_cycles ===\n");
 783
 784   /* First - identify all inductions.  Reduction detection assumes that all the
 785      inductions have been identified, therefore, this order must not be
 786      changed.  */
 787   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 788     {
 789       gphi *phi = gsi.phi ();
 790       tree access_fn = NULL;
 791       tree def = PHI_RESULT (phi);
 792       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 793
 794       if (dump_enabled_p ())
 795         {
 796           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 797           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 798         }
 799
 800       /* Skip virtual phi's.  The data dependences that are associated with
 801          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 802       if (virtual_operand_p (def))
 803         continue;
 804
 805       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 806
 807       /* Analyze the evolution function.  */
 808       access_fn = analyze_scalar_evolution (loop, def);
 809       if (access_fn)
 810         {
 811           STRIP_NOPS (access_fn);
 812           if (dump_enabled_p ())
 813             {
 814               dump_printf_loc (MSG_NOTE, vect_location,
 815                                "Access function of PHI: ");
 816               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 817               dump_printf (MSG_NOTE, "\n");
 818             }
 819           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 820             = initial_condition_in_loop_num (access_fn, loop->num);
 821           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 822             = evolution_part_in_loop_num (access_fn, loop->num);
 823         }
 824
 825       if (!access_fn
 826           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 827           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 828               && TREE_CODE (step) != INTEGER_CST))
 829         {
 830           worklist.safe_push (phi);
 831           continue;
 832         }
 833
 834       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 835                   != NULL_TREE);
 836       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 837
 838       if (dump_enabled_p ())
 839         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 840       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 841     }
 842
 843
 844   /* Second - identify all reductions and nested cycles.  */
 845   while (worklist.length () > 0)
 846     {
 847       gimple *phi = worklist.pop ();
 848       tree def = PHI_RESULT (phi);
 849       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 850       gimple *reduc_stmt;
 851
 852       if (dump_enabled_p ())
 853         {
 854           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 855           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 856         }
 857
 858       gcc_assert (!virtual_operand_p (def)
 859                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 860
 861       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 862                                                 &double_reduc, false);
 863       if (reduc_stmt)
 864         {
 865           if (double_reduc)
 866             {
 867               if (dump_enabled_p ())
 868                 dump_printf_loc (MSG_NOTE, vect_location,
 869                                  "Detected double reduction.\n");
 870
 871               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 872               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 873                                                     vect_double_reduction_def;
 874             }
 875           else
 876             {
 877               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 878                 {
 879                   if (dump_enabled_p ())
 880                     dump_printf_loc (MSG_NOTE, vect_location,
 881                                      "Detected vectorizable nested cycle.\n");
 882
 883                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 884                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 885                                                              vect_nested_cycle;
 886                 }
 887               else
 888                 {
 889                   if (dump_enabled_p ())
 890                     dump_printf_loc (MSG_NOTE, vect_location,
 891                                      "Detected reduction.\n");
 892
 893                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 894                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 895                                                            vect_reduction_def;
 896                   /* Store the reduction cycles for possible vectorization in
 897                      loop-aware SLP if it was not detected as reduction
 898                      chain.  */
 899                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 900                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 901                 }
 902             }
 903         }
 904       else
 905         if (dump_enabled_p ())
 906           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 907                            "Unknown def-use cycle pattern.\n");
 908     }
 909 }
 910
 911
 912 /* Function vect_analyze_scalar_cycles.
 913
 914    Examine the cross iteration def-use cycles of scalar variables, by
 915    analyzing the loop-header PHIs of scalar variables.  Classify each
 916    cycle as one of the following: invariant, induction, reduction, unknown.
 917    We do that for the loop represented by LOOP_VINFO, and also to its
 918    inner-loop, if exists.
 919    Examples for scalar cycles:
 920
 921    Example1: reduction:
 922
 923               loop1:
 924               for (i=0; i<N; i++)
 925                  sum += a[i];
 926
 927    Example2: induction:
 928
 929               loop2:
 930               for (i=0; i<N; i++)
 931                  a[i] = i;  */
 932
 933 static void
 934 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 935 {
 936   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 937
 938   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 939
 940   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 941      Reductions in such inner-loop therefore have different properties than
 942      the reductions in the nest that gets vectorized:
 943      1. When vectorized, they are executed in the same order as in the original
 944         scalar loop, so we can't change the order of computation when
 945         vectorizing them.
 946      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 947         current checks are too strict.  */
 948
 949   if (loop->inner)
 950     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 951 }
 952
 953 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 954
 955 static void
 956 vect_fixup_reduc_chain (gimple *stmt)
 957 {
 958   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 959   gimple *stmtp;
 960   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 961               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 962   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 963   do
 964     {
 965       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 966       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 967       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 968       if (stmt)
 969         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 970           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 971     }
 972   while (stmt);
 973   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 974 }
 975
 976 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 977
 978 static void
 979 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 980 {
 981   gimple *first;
 982   unsigned i;
 983
 984   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 985     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 986       {
 987         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 988         while (next)
 989           {
 990             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 991               break;
 992             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 993           }
 994         /* If not all stmt in the chain are patterns try to handle
 995            the chain without patterns.  */
 996         if (! next)
 997           {
 998             vect_fixup_reduc_chain (first);
 999             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1000               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1001           }
1002       }
1003 }
1004
1005 /* Function vect_get_loop_niters.
1006
1007    Determine how many iterations the loop is executed and place it
1008    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1009    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1010    niter information holds in ASSUMPTIONS.
1011
1012    Return the loop exit condition.  */
1013
1014
1015 static gcond *
1016 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1017                       tree *number_of_iterations, tree *number_of_iterationsm1)
1018 {
1019   edge exit = single_exit (loop);
1020   struct tree_niter_desc niter_desc;
1021   tree niter_assumptions, niter, may_be_zero;
1022   gcond *cond = get_loop_exit_condition (loop);
1023
1024   *assumptions = boolean_true_node;
1025   *number_of_iterationsm1 = chrec_dont_know;
1026   *number_of_iterations = chrec_dont_know;
1027   if (dump_enabled_p ())
1028     dump_printf_loc (MSG_NOTE, vect_location,
1029                      "=== get_loop_niters ===\n");
1030
1031   if (!exit)
1032     return cond;
1033
1034   niter = chrec_dont_know;
1035   may_be_zero = NULL_TREE;
1036   niter_assumptions = boolean_true_node;
1037   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1038       || chrec_contains_undetermined (niter_desc.niter))
1039     return cond;
1040
1041   niter_assumptions = niter_desc.assumptions;
1042   may_be_zero = niter_desc.may_be_zero;
1043   niter = niter_desc.niter;
1044
1045   if (may_be_zero && integer_zerop (may_be_zero))
1046     may_be_zero = NULL_TREE;
1047
1048   if (may_be_zero)
1049     {
1050       if (COMPARISON_CLASS_P (may_be_zero))
1051         {
1052           /* Try to combine may_be_zero with assumptions, this can simplify
1053              computation of niter expression.  */
1054           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1055             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1056                                              niter_assumptions,
1057                                              fold_build1 (TRUTH_NOT_EXPR,
1058                                                           boolean_type_node,
1059                                                           may_be_zero));
1060           else
1061             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1062                                  build_int_cst (TREE_TYPE (niter), 0), niter);
1063
1064           may_be_zero = NULL_TREE;
1065         }
1066       else if (integer_nonzerop (may_be_zero))
1067         {
1068           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1069           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1070           return cond;
1071         }
1072       else
1073         return cond;
1074     }
1075
1076   *assumptions = niter_assumptions;
1077   *number_of_iterationsm1 = niter;
1078
1079   /* We want the number of loop header executions which is the number
1080      of latch executions plus one.
1081      ???  For UINT_MAX latch executions this number overflows to zero
1082      for loops like do { n++; } while (n != 0);  */
1083   if (niter && !chrec_contains_undetermined (niter))
1084     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1085                           build_int_cst (TREE_TYPE (niter), 1));
1086   *number_of_iterations = niter;
1087
1088   return cond;
1089 }
1090
1091 /* Function bb_in_loop_p
1092
1093    Used as predicate for dfs order traversal of the loop bbs.  */
1094
1095 static bool
1096 bb_in_loop_p (const_basic_block bb, const void *data)
1097 {
1098   const struct loop *const loop = (const struct loop *)data;
1099   if (flow_bb_inside_loop_p (loop, bb))
1100     return true;
1101   return false;
1102 }
1103
1104
1105 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1106    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1107
1108 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1109   : vec_info (vec_info::loop, init_cost (loop_in)),
1110     loop (loop_in),
1111     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1112     num_itersm1 (NULL_TREE),
1113     num_iters (NULL_TREE),
1114     num_iters_unchanged (NULL_TREE),
1115     num_iters_assumptions (NULL_TREE),
1116     th (0),
1117     versioning_threshold (0),
1118     vectorization_factor (0),
1119     max_vectorization_factor (0),
1120     unaligned_dr (NULL),
1121     peeling_for_alignment (0),
1122     ptr_mask (0),
1123     slp_unrolling_factor (1),
1124     single_scalar_iteration_cost (0),
1125     vectorizable (false),
1126     peeling_for_gaps (false),
1127     peeling_for_niter (false),
1128     operands_swapped (false),
1129     no_data_dependencies (false),
1130     has_mask_store (false),
1131     scalar_loop (NULL),
1132     orig_loop_info (NULL)
1133 {
1134   /* Create/Update stmt_info for all stmts in the loop.  */
1135   basic_block *body = get_loop_body (loop);
1136   for (unsigned int i = 0; i < loop->num_nodes; i++)
1137     {
1138       basic_block bb = body[i];
1139       gimple_stmt_iterator si;
1140
1141       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1142         {
1143           gimple *phi = gsi_stmt (si);
1144           gimple_set_uid (phi, 0);
1145           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1146         }
1147
1148       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1149         {
1150           gimple *stmt = gsi_stmt (si);
1151           gimple_set_uid (stmt, 0);
1152           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1153         }
1154     }
1155   free (body);
1156
1157   /* CHECKME: We want to visit all BBs before their successors (except for
1158      latch blocks, for which this assertion wouldn't hold).  In the simple
1159      case of the loop forms we allow, a dfs order of the BBs would the same
1160      as reversed postorder traversal, so we are safe.  */
1161
1162   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1163                                           bbs, loop->num_nodes, loop);
1164   gcc_assert (nbbs == loop->num_nodes);
1165 }
1166
1167
1168 /* Free all memory used by the _loop_vec_info, as well as all the
1169    stmt_vec_info structs of all the stmts in the loop.  */
1170
1171 _loop_vec_info::~_loop_vec_info ()
1172 {
1173   int nbbs;
1174   gimple_stmt_iterator si;
1175   int j;
1176
1177   nbbs = loop->num_nodes;
1178   for (j = 0; j < nbbs; j++)
1179     {
1180       basic_block bb = bbs[j];
1181       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1182         free_stmt_vec_info (gsi_stmt (si));
1183
1184       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1185         {
1186           gimple *stmt = gsi_stmt (si);
1187
1188           /* We may have broken canonical form by moving a constant
1189              into RHS1 of a commutative op.  Fix such occurrences.  */
1190           if (operands_swapped && is_gimple_assign (stmt))
1191             {
1192               enum tree_code code = gimple_assign_rhs_code (stmt);
1193
1194               if ((code == PLUS_EXPR
1195                    || code == POINTER_PLUS_EXPR
1196                    || code == MULT_EXPR)
1197                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1198                 swap_ssa_operands (stmt,
1199                                    gimple_assign_rhs1_ptr (stmt),
1200                                    gimple_assign_rhs2_ptr (stmt));
1201               else if (code == COND_EXPR
1202                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1203                 {
1204                   tree cond_expr = gimple_assign_rhs1 (stmt);
1205                   enum tree_code cond_code = TREE_CODE (cond_expr);
1206
1207                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1208                     {
1209                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1210                                                                   0));
1211                       cond_code = invert_tree_comparison (cond_code,
1212                                                           honor_nans);
1213                       if (cond_code != ERROR_MARK)
1214                         {
1215                           TREE_SET_CODE (cond_expr, cond_code);
1216                           swap_ssa_operands (stmt,
1217                                              gimple_assign_rhs2_ptr (stmt),
1218                                              gimple_assign_rhs3_ptr (stmt));
1219                         }
1220                     }
1221                 }
1222             }
1223
1224           /* Free stmt_vec_info.  */
1225           free_stmt_vec_info (stmt);
1226           gsi_next (&si);
1227         }
1228     }
1229
1230   free (bbs);
1231
1232   loop->aux = NULL;
1233 }
1234
1235
1236 /* Calculate the cost of one scalar iteration of the loop.  */
1237 static void
1238 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1239 {
1240   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1241   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1242   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1243   int innerloop_iters, i;
1244
1245   /* Count statements in scalar loop.  Using this as scalar cost for a single
1246      iteration for now.
1247
1248      TODO: Add outer loop support.
1249
1250      TODO: Consider assigning different costs to different scalar
1251      statements.  */
1252
1253   /* FORNOW.  */
1254   innerloop_iters = 1;
1255   if (loop->inner)
1256     innerloop_iters = 50; /* FIXME */
1257
1258   for (i = 0; i < nbbs; i++)
1259     {
1260       gimple_stmt_iterator si;
1261       basic_block bb = bbs[i];
1262
1263       if (bb->loop_father == loop->inner)
1264         factor = innerloop_iters;
1265       else
1266         factor = 1;
1267
1268       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1269         {
1270           gimple *stmt = gsi_stmt (si);
1271           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1272
1273           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1274             continue;
1275
1276           /* Skip stmts that are not vectorized inside the loop.  */
1277           if (stmt_info
1278               && !STMT_VINFO_RELEVANT_P (stmt_info)
1279               && (!STMT_VINFO_LIVE_P (stmt_info)
1280                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1281               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1282             continue;
1283
1284           vect_cost_for_stmt kind;
1285           if (STMT_VINFO_DATA_REF (stmt_info))
1286             {
1287               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1288                kind = scalar_load;
1289              else
1290                kind = scalar_store;
1291             }
1292           else
1293             kind = scalar_stmt;
1294
1295           scalar_single_iter_cost
1296             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1297                                  factor, kind, stmt_info, 0, vect_prologue);
1298         }
1299     }
1300   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1301     = scalar_single_iter_cost;
1302 }
1303
1304
1305 /* Function vect_analyze_loop_form_1.
1306
1307    Verify that certain CFG restrictions hold, including:
1308    - the loop has a pre-header
1309    - the loop has a single entry and exit
1310    - the loop exit condition is simple enough
1311    - the number of iterations can be analyzed, i.e, a countable loop.  The
1312      niter could be analyzed under some assumptions.  */
1313
1314 bool
1315 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1316                           tree *assumptions, tree *number_of_iterationsm1,
1317                           tree *number_of_iterations, gcond **inner_loop_cond)
1318 {
1319   if (dump_enabled_p ())
1320     dump_printf_loc (MSG_NOTE, vect_location,
1321                      "=== vect_analyze_loop_form ===\n");
1322
1323   /* Different restrictions apply when we are considering an inner-most loop,
1324      vs. an outer (nested) loop.
1325      (FORNOW. May want to relax some of these restrictions in the future).  */
1326
1327   if (!loop->inner)
1328     {
1329       /* Inner-most loop.  We currently require that the number of BBs is
1330          exactly 2 (the header and latch).  Vectorizable inner-most loops
1331          look like this:
1332
1333                         (pre-header)
1334                            |
1335                           header <--------+
1336                            | |            |
1337                            | +--> latch --+
1338                            |
1339                         (exit-bb)  */
1340
1341       if (loop->num_nodes != 2)
1342         {
1343           if (dump_enabled_p ())
1344             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1345                              "not vectorized: control flow in loop.\n");
1346           return false;
1347         }
1348
1349       if (empty_block_p (loop->header))
1350         {
1351           if (dump_enabled_p ())
1352             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1353                              "not vectorized: empty loop.\n");
1354           return false;
1355         }
1356     }
1357   else
1358     {
1359       struct loop *innerloop = loop->inner;
1360       edge entryedge;
1361
1362       /* Nested loop. We currently require that the loop is doubly-nested,
1363          contains a single inner loop, and the number of BBs is exactly 5.
1364          Vectorizable outer-loops look like this:
1365
1366                         (pre-header)
1367                            |
1368                           header <---+
1369                            |         |
1370                           inner-loop |
1371                            |         |
1372                           tail ------+
1373                            |
1374                         (exit-bb)
1375
1376          The inner-loop has the properties expected of inner-most loops
1377          as described above.  */
1378
1379       if ((loop->inner)->inner || (loop->inner)->next)
1380         {
1381           if (dump_enabled_p ())
1382             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1383                              "not vectorized: multiple nested loops.\n");
1384           return false;
1385         }
1386
1387       if (loop->num_nodes != 5)
1388         {
1389           if (dump_enabled_p ())
1390             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1391                              "not vectorized: control flow in loop.\n");
1392           return false;
1393         }
1394
1395       entryedge = loop_preheader_edge (innerloop);
1396       if (entryedge->src != loop->header
1397           || !single_exit (innerloop)
1398           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1399         {
1400           if (dump_enabled_p ())
1401             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1402                              "not vectorized: unsupported outerloop form.\n");
1403           return false;
1404         }
1405
1406       /* Analyze the inner-loop.  */
1407       tree inner_niterm1, inner_niter, inner_assumptions;
1408       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1409                                       &inner_assumptions, &inner_niterm1,
1410                                       &inner_niter, NULL)
1411           /* Don't support analyzing niter under assumptions for inner
1412              loop.  */
1413           || !integer_onep (inner_assumptions))
1414         {
1415           if (dump_enabled_p ())
1416             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1417                              "not vectorized: Bad inner loop.\n");
1418           return false;
1419         }
1420
1421       if (!expr_invariant_in_loop_p (loop, inner_niter))
1422         {
1423           if (dump_enabled_p ())
1424             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1425                              "not vectorized: inner-loop count not"
1426                              " invariant.\n");
1427           return false;
1428         }
1429
1430       if (dump_enabled_p ())
1431         dump_printf_loc (MSG_NOTE, vect_location,
1432                          "Considering outer-loop vectorization.\n");
1433     }
1434
1435   if (!single_exit (loop)
1436       || EDGE_COUNT (loop->header->preds) != 2)
1437     {
1438       if (dump_enabled_p ())
1439         {
1440           if (!single_exit (loop))
1441             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1442                              "not vectorized: multiple exits.\n");
1443           else if (EDGE_COUNT (loop->header->preds) != 2)
1444             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1445                              "not vectorized: too many incoming edges.\n");
1446         }
1447       return false;
1448     }
1449
1450   /* We assume that the loop exit condition is at the end of the loop. i.e,
1451      that the loop is represented as a do-while (with a proper if-guard
1452      before the loop if needed), where the loop header contains all the
1453      executable statements, and the latch is empty.  */
1454   if (!empty_block_p (loop->latch)
1455       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1456     {
1457       if (dump_enabled_p ())
1458         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1459                          "not vectorized: latch block not empty.\n");
1460       return false;
1461     }
1462
1463   /* Make sure the exit is not abnormal.  */
1464   edge e = single_exit (loop);
1465   if (e->flags & EDGE_ABNORMAL)
1466     {
1467       if (dump_enabled_p ())
1468         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1469                          "not vectorized: abnormal loop exit edge.\n");
1470       return false;
1471     }
1472
1473   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1474                                      number_of_iterationsm1);
1475   if (!*loop_cond)
1476     {
1477       if (dump_enabled_p ())
1478         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1479                          "not vectorized: complicated exit condition.\n");
1480       return false;
1481     }
1482
1483   if (integer_zerop (*assumptions)
1484       || !*number_of_iterations
1485       || chrec_contains_undetermined (*number_of_iterations))
1486     {
1487       if (dump_enabled_p ())
1488         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1489                          "not vectorized: number of iterations cannot be "
1490                          "computed.\n");
1491       return false;
1492     }
1493
1494   if (integer_zerop (*number_of_iterations))
1495     {
1496       if (dump_enabled_p ())
1497         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1498                          "not vectorized: number of iterations = 0.\n");
1499       return false;
1500     }
1501
1502   return true;
1503 }
1504
1505 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1506
1507 loop_vec_info
1508 vect_analyze_loop_form (struct loop *loop)
1509 {
1510   tree assumptions, number_of_iterations, number_of_iterationsm1;
1511   gcond *loop_cond, *inner_loop_cond = NULL;
1512
1513   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1514                                   &assumptions, &number_of_iterationsm1,
1515                                   &number_of_iterations, &inner_loop_cond))
1516     return NULL;
1517
1518   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1519   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1520   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1521   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1522   if (!integer_onep (assumptions))
1523     {
1524       /* We consider to vectorize this loop by versioning it under
1525          some assumptions.  In order to do this, we need to clear
1526          existing information computed by scev and niter analyzer.  */
1527       scev_reset_htab ();
1528       free_numbers_of_iterations_estimates (loop);
1529       /* Also set flag for this loop so that following scev and niter
1530          analysis are done under the assumptions.  */
1531       loop_constraint_set (loop, LOOP_C_FINITE);
1532       /* Also record the assumptions for versioning.  */
1533       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1534     }
1535
1536   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1537     {
1538       if (dump_enabled_p ())
1539         {
1540           dump_printf_loc (MSG_NOTE, vect_location,
1541                            "Symbolic number of iterations is ");
1542           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1543           dump_printf (MSG_NOTE, "\n");
1544         }
1545     }
1546
1547   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1548   if (inner_loop_cond)
1549     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1550       = loop_exit_ctrl_vec_info_type;
1551
1552   gcc_assert (!loop->aux);
1553   loop->aux = loop_vinfo;
1554   return loop_vinfo;
1555 }
1556
1557
1558
1559 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1560    statements update the vectorization factor.  */
1561
1562 static void
1563 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1564 {
1565   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1566   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1567   int nbbs = loop->num_nodes;
1568   poly_uint64 vectorization_factor;
1569   int i;
1570
1571   if (dump_enabled_p ())
1572     dump_printf_loc (MSG_NOTE, vect_location,
1573                      "=== vect_update_vf_for_slp ===\n");
1574
1575   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1576   gcc_assert (known_ne (vectorization_factor, 0U));
1577
1578   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1579      vectorization factor of the loop is the unrolling factor required by
1580      the SLP instances.  If that unrolling factor is 1, we say, that we
1581      perform pure SLP on loop - cross iteration parallelism is not
1582      exploited.  */
1583   bool only_slp_in_loop = true;
1584   for (i = 0; i < nbbs; i++)
1585     {
1586       basic_block bb = bbs[i];
1587       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1588            gsi_next (&si))
1589         {
1590           gimple *stmt = gsi_stmt (si);
1591           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1592           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1593               && STMT_VINFO_RELATED_STMT (stmt_info))
1594             {
1595               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1596               stmt_info = vinfo_for_stmt (stmt);
1597             }
1598           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1599                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1600               && !PURE_SLP_STMT (stmt_info))
1601             /* STMT needs both SLP and loop-based vectorization.  */
1602             only_slp_in_loop = false;
1603         }
1604     }
1605
1606   if (only_slp_in_loop)
1607     {
1608       dump_printf_loc (MSG_NOTE, vect_location,
1609                        "Loop contains only SLP stmts\n");
1610       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1611     }
1612   else
1613     {
1614       dump_printf_loc (MSG_NOTE, vect_location,
1615                        "Loop contains SLP and non-SLP stmts\n");
1616       /* Both the vectorization factor and unroll factor have the form
1617          current_vector_size * X for some rational X, so they must have
1618          a common multiple.  */
1619       vectorization_factor
1620         = force_common_multiple (vectorization_factor,
1621                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1622     }
1623
1624   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1625   if (dump_enabled_p ())
1626     {
1627       dump_printf_loc (MSG_NOTE, vect_location,
1628                        "Updating vectorization factor to ");
1629       dump_dec (MSG_NOTE, vectorization_factor);
1630       dump_printf (MSG_NOTE, ".\n");
1631     }
1632 }
1633
1634 /* Function vect_analyze_loop_operations.
1635
1636    Scan the loop stmts and make sure they are all vectorizable.  */
1637
1638 static bool
1639 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1640 {
1641   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1642   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1643   int nbbs = loop->num_nodes;
1644   int i;
1645   stmt_vec_info stmt_info;
1646   bool need_to_vectorize = false;
1647   bool ok;
1648
1649   if (dump_enabled_p ())
1650     dump_printf_loc (MSG_NOTE, vect_location,
1651                      "=== vect_analyze_loop_operations ===\n");
1652
1653   for (i = 0; i < nbbs; i++)
1654     {
1655       basic_block bb = bbs[i];
1656
1657       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1658            gsi_next (&si))
1659         {
1660           gphi *phi = si.phi ();
1661           ok = true;
1662
1663           stmt_info = vinfo_for_stmt (phi);
1664           if (dump_enabled_p ())
1665             {
1666               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1667               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1668             }
1669           if (virtual_operand_p (gimple_phi_result (phi)))
1670             continue;
1671
1672           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1673              (i.e., a phi in the tail of the outer-loop).  */
1674           if (! is_loop_header_bb_p (bb))
1675             {
1676               /* FORNOW: we currently don't support the case that these phis
1677                  are not used in the outerloop (unless it is double reduction,
1678                  i.e., this phi is vect_reduction_def), cause this case
1679                  requires to actually do something here.  */
1680               if (STMT_VINFO_LIVE_P (stmt_info)
1681                   && STMT_VINFO_DEF_TYPE (stmt_info)
1682                      != vect_double_reduction_def)
1683                 {
1684                   if (dump_enabled_p ())
1685                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686                                      "Unsupported loop-closed phi in "
1687                                      "outer-loop.\n");
1688                   return false;
1689                 }
1690
1691               /* If PHI is used in the outer loop, we check that its operand
1692                  is defined in the inner loop.  */
1693               if (STMT_VINFO_RELEVANT_P (stmt_info))
1694                 {
1695                   tree phi_op;
1696                   gimple *op_def_stmt;
1697
1698                   if (gimple_phi_num_args (phi) != 1)
1699                     return false;
1700
1701                   phi_op = PHI_ARG_DEF (phi, 0);
1702                   if (TREE_CODE (phi_op) != SSA_NAME)
1703                     return false;
1704
1705                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1706                   if (gimple_nop_p (op_def_stmt)
1707                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1708                       || !vinfo_for_stmt (op_def_stmt))
1709                     return false;
1710
1711                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1712                         != vect_used_in_outer
1713                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1714                            != vect_used_in_outer_by_reduction)
1715                     return false;
1716                 }
1717
1718               continue;
1719             }
1720
1721           gcc_assert (stmt_info);
1722
1723           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1724                || STMT_VINFO_LIVE_P (stmt_info))
1725               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1726             {
1727               /* A scalar-dependence cycle that we don't support.  */
1728               if (dump_enabled_p ())
1729                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730                                  "not vectorized: scalar dependence cycle.\n");
1731               return false;
1732             }
1733
1734           if (STMT_VINFO_RELEVANT_P (stmt_info))
1735             {
1736               need_to_vectorize = true;
1737               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1738                   && ! PURE_SLP_STMT (stmt_info))
1739                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1740               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1741                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1742                        && ! PURE_SLP_STMT (stmt_info))
1743                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1744             }
1745
1746           if (ok && STMT_VINFO_LIVE_P (stmt_info))
1747             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1748
1749           if (!ok)
1750             {
1751               if (dump_enabled_p ())
1752                 {
1753                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1754                                    "not vectorized: relevant phi not "
1755                                    "supported: ");
1756                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1757                 }
1758               return false;
1759             }
1760         }
1761
1762       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1763            gsi_next (&si))
1764         {
1765           gimple *stmt = gsi_stmt (si);
1766           if (!gimple_clobber_p (stmt)
1767               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1768             return false;
1769         }
1770     } /* bbs */
1771
1772   /* All operations in the loop are either irrelevant (deal with loop
1773      control, or dead), or only used outside the loop and can be moved
1774      out of the loop (e.g. invariants, inductions).  The loop can be
1775      optimized away by scalar optimizations.  We're better off not
1776      touching this loop.  */
1777   if (!need_to_vectorize)
1778     {
1779       if (dump_enabled_p ())
1780         dump_printf_loc (MSG_NOTE, vect_location,
1781                          "All the computation can be taken out of the loop.\n");
1782       if (dump_enabled_p ())
1783         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1784                          "not vectorized: redundant loop. no profit to "
1785                          "vectorize.\n");
1786       return false;
1787     }
1788
1789   return true;
1790 }
1791
1792
1793 /* Function vect_analyze_loop_2.
1794
1795    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1796    for it.  The different analyses will record information in the
1797    loop_vec_info struct.  */
1798 static bool
1799 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1800 {
1801   bool ok;
1802   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1803   poly_uint64 min_vf = 2;
1804   unsigned int n_stmts = 0;
1805
1806   /* The first group of checks is independent of the vector size.  */
1807   fatal = true;
1808
1809   /* Find all data references in the loop (which correspond to vdefs/vuses)
1810      and analyze their evolution in the loop.  */
1811
1812   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1813
1814   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1815   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1816     {
1817       if (dump_enabled_p ())
1818         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1819                          "not vectorized: loop nest containing two "
1820                          "or more consecutive inner loops cannot be "
1821                          "vectorized\n");
1822       return false;
1823     }
1824
1825   for (unsigned i = 0; i < loop->num_nodes; i++)
1826     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1827          !gsi_end_p (gsi); gsi_next (&gsi))
1828       {
1829         gimple *stmt = gsi_stmt (gsi);
1830         if (is_gimple_debug (stmt))
1831           continue;
1832         ++n_stmts;
1833         if (!find_data_references_in_stmt (loop, stmt,
1834                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1835           {
1836             if (is_gimple_call (stmt) && loop->safelen)
1837               {
1838                 tree fndecl = gimple_call_fndecl (stmt), op;
1839                 if (fndecl != NULL_TREE)
1840                   {
1841                     cgraph_node *node = cgraph_node::get (fndecl);
1842                     if (node != NULL && node->simd_clones != NULL)
1843                       {
1844                         unsigned int j, n = gimple_call_num_args (stmt);
1845                         for (j = 0; j < n; j++)
1846                           {
1847                             op = gimple_call_arg (stmt, j);
1848                             if (DECL_P (op)
1849                                 || (REFERENCE_CLASS_P (op)
1850                                     && get_base_address (op)))
1851                               break;
1852                           }
1853                         op = gimple_call_lhs (stmt);
1854                         /* Ignore #pragma omp declare simd functions
1855                            if they don't have data references in the
1856                            call stmt itself.  */
1857                         if (j == n
1858                             && !(op
1859                                  && (DECL_P (op)
1860                                      || (REFERENCE_CLASS_P (op)
1861                                          && get_base_address (op)))))
1862                           continue;
1863                       }
1864                   }
1865               }
1866             if (dump_enabled_p ())
1867               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1868                                "not vectorized: loop contains function "
1869                                "calls or data references that cannot "
1870                                "be analyzed\n");
1871             return false;
1872           }
1873       }
1874
1875   /* Analyze the data references and also adjust the minimal
1876      vectorization factor according to the loads and stores.  */
1877
1878   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1879   if (!ok)
1880     {
1881       if (dump_enabled_p ())
1882         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1883                          "bad data references.\n");
1884       return false;
1885     }
1886
1887   /* Classify all cross-iteration scalar data-flow cycles.
1888      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1889   vect_analyze_scalar_cycles (loop_vinfo);
1890
1891   vect_pattern_recog (loop_vinfo);
1892
1893   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1894
1895   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1896      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1897
1898   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1899   if (!ok)
1900     {
1901       if (dump_enabled_p ())
1902         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1903                          "bad data access.\n");
1904       return false;
1905     }
1906
1907   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1908
1909   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1910   if (!ok)
1911     {
1912       if (dump_enabled_p ())
1913         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1914                          "unexpected pattern.\n");
1915       return false;
1916     }
1917
1918   /* While the rest of the analysis below depends on it in some way.  */
1919   fatal = false;
1920
1921   /* Analyze data dependences between the data-refs in the loop
1922      and adjust the maximum vectorization factor according to
1923      the dependences.
1924      FORNOW: fail at the first data dependence that we encounter.  */
1925
1926   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1927   if (!ok
1928       || (max_vf != MAX_VECTORIZATION_FACTOR
1929           && maybe_lt (max_vf, min_vf)))
1930     {
1931       if (dump_enabled_p ())
1932             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1933                              "bad data dependence.\n");
1934       return false;
1935     }
1936   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1937
1938   ok = vect_determine_vectorization_factor (loop_vinfo);
1939   if (!ok)
1940     {
1941       if (dump_enabled_p ())
1942         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1943                          "can't determine vectorization factor.\n");
1944       return false;
1945     }
1946   if (max_vf != MAX_VECTORIZATION_FACTOR
1947       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1948     {
1949       if (dump_enabled_p ())
1950         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1951                          "bad data dependence.\n");
1952       return false;
1953     }
1954
1955   /* Compute the scalar iteration cost.  */
1956   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1957
1958   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1959   HOST_WIDE_INT estimated_niter;
1960   unsigned th;
1961   int min_scalar_loop_bound;
1962
1963   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1964   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1965   if (!ok)
1966     return false;
1967
1968   /* If there are any SLP instances mark them as pure_slp.  */
1969   bool slp = vect_make_slp_decision (loop_vinfo);
1970   if (slp)
1971     {
1972       /* Find stmts that need to be both vectorized and SLPed.  */
1973       vect_detect_hybrid_slp (loop_vinfo);
1974
1975       /* Update the vectorization factor based on the SLP decision.  */
1976       vect_update_vf_for_slp (loop_vinfo);
1977     }
1978
1979   /* This is the point where we can re-start analysis with SLP forced off.  */
1980 start_over:
1981
1982   /* Now the vectorization factor is final.  */
1983   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1984   gcc_assert (known_ne (vectorization_factor, 0U));
1985   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1986
1987   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1988     {
1989       dump_printf_loc (MSG_NOTE, vect_location,
1990                        "vectorization_factor = ");
1991       dump_dec (MSG_NOTE, vectorization_factor);
1992       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1993                    LOOP_VINFO_INT_NITERS (loop_vinfo));
1994     }
1995
1996   HOST_WIDE_INT max_niter
1997     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1998   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1999        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < assumed_vf))
2000       || (max_niter != -1
2001           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf))
2002     {
2003       if (dump_enabled_p ())
2004         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2005                          "not vectorized: iteration count smaller than "
2006                          "vectorization factor.\n");
2007       return false;
2008     }
2009
2010   /* Analyze the alignment of the data-refs in the loop.
2011      Fail if a data reference is found that cannot be vectorized.  */
2012
2013   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2014   if (!ok)
2015     {
2016       if (dump_enabled_p ())
2017         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018                          "bad data alignment.\n");
2019       return false;
2020     }
2021
2022   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2023      It is important to call pruning after vect_analyze_data_ref_accesses,
2024      since we use grouping information gathered by interleaving analysis.  */
2025   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2026   if (!ok)
2027     return false;
2028
2029   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2030      vectorization.  */
2031   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2032     {
2033     /* This pass will decide on using loop versioning and/or loop peeling in
2034        order to enhance the alignment of data references in the loop.  */
2035     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2036     if (!ok)
2037       {
2038         if (dump_enabled_p ())
2039           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2040                            "bad data alignment.\n");
2041         return false;
2042       }
2043     }
2044
2045   if (slp)
2046     {
2047       /* Analyze operations in the SLP instances.  Note this may
2048          remove unsupported SLP instances which makes the above
2049          SLP kind detection invalid.  */
2050       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2051       vect_slp_analyze_operations (loop_vinfo);
2052       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2053         goto again;
2054     }
2055
2056   /* Scan all the remaining operations in the loop that are not subject
2057      to SLP and make sure they are vectorizable.  */
2058   ok = vect_analyze_loop_operations (loop_vinfo);
2059   if (!ok)
2060     {
2061       if (dump_enabled_p ())
2062         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2063                          "bad operation or unsupported loop bound.\n");
2064       return false;
2065     }
2066
2067   /* If epilog loop is required because of data accesses with gaps,
2068      one additional iteration needs to be peeled.  Check if there is
2069      enough iterations for vectorization.  */
2070   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2071       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2072     {
2073       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2074       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2075
2076       if (known_lt (wi::to_widest (scalar_niters), vf))
2077         {
2078           if (dump_enabled_p ())
2079             dump_printf_loc (MSG_NOTE, vect_location,
2080                              "loop has no enough iterations to support"
2081                              " peeling for gaps.\n");
2082           return false;
2083         }
2084     }
2085
2086   /* Analyze cost.  Decide if worth while to vectorize.  */
2087   int min_profitable_estimate, min_profitable_iters;
2088   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2089                                       &min_profitable_estimate);
2090
2091   if (min_profitable_iters < 0)
2092     {
2093       if (dump_enabled_p ())
2094         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2095                          "not vectorized: vectorization not profitable.\n");
2096       if (dump_enabled_p ())
2097         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098                          "not vectorized: vector version will never be "
2099                          "profitable.\n");
2100       goto again;
2101     }
2102
2103   min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2104                            * assumed_vf);
2105
2106   /* Use the cost model only if it is more conservative than user specified
2107      threshold.  */
2108   th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2109
2110   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2111
2112   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2113       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2114     {
2115       if (dump_enabled_p ())
2116         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2117                          "not vectorized: vectorization not profitable.\n");
2118       if (dump_enabled_p ())
2119         dump_printf_loc (MSG_NOTE, vect_location,
2120                          "not vectorized: iteration count smaller than user "
2121                          "specified loop bound parameter or minimum profitable "
2122                          "iterations (whichever is more conservative).\n");
2123       goto again;
2124     }
2125
2126   estimated_niter
2127     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2128   if (estimated_niter == -1)
2129     estimated_niter = max_niter;
2130   if (estimated_niter != -1
2131       && ((unsigned HOST_WIDE_INT) estimated_niter
2132           < MAX (th, (unsigned) min_profitable_estimate)))
2133     {
2134       if (dump_enabled_p ())
2135         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2136                          "not vectorized: estimated iteration count too "
2137                          "small.\n");
2138       if (dump_enabled_p ())
2139         dump_printf_loc (MSG_NOTE, vect_location,
2140                          "not vectorized: estimated iteration count smaller "
2141                          "than specified loop bound parameter or minimum "
2142                          "profitable iterations (whichever is more "
2143                          "conservative).\n");
2144       goto again;
2145     }
2146
2147   /* Decide whether we need to create an epilogue loop to handle
2148      remaining scalar iterations.  */
2149   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2150
2151   unsigned HOST_WIDE_INT const_vf;
2152   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2153       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2154     {
2155       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2156                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2157                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2158         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2159     }
2160   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2161            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2162            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2163                 < (unsigned) exact_log2 (const_vf))
2164                /* In case of versioning, check if the maximum number of
2165                   iterations is greater than th.  If they are identical,
2166                   the epilogue is unnecessary.  */
2167                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2168                    || ((unsigned HOST_WIDE_INT) max_niter
2169                        > (th / const_vf) * const_vf))))
2170     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2171
2172   /* If an epilogue loop is required make sure we can create one.  */
2173   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2174       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2175     {
2176       if (dump_enabled_p ())
2177         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2178       if (!vect_can_advance_ivs_p (loop_vinfo)
2179           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2180                                            single_exit (LOOP_VINFO_LOOP
2181                                                          (loop_vinfo))))
2182         {
2183           if (dump_enabled_p ())
2184             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2185                              "not vectorized: can't create required "
2186                              "epilog loop\n");
2187           goto again;
2188         }
2189     }
2190
2191   /* During peeling, we need to check if number of loop iterations is
2192      enough for both peeled prolog loop and vector loop.  This check
2193      can be merged along with threshold check of loop versioning, so
2194      increase threshold for this case if necessary.  */
2195   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2196     {
2197       poly_uint64 niters_th;
2198
2199       /* Niters for peeled prolog loop.  */
2200       if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2201         {
2202           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2203           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2204
2205           niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2206         }
2207       else
2208         niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2209
2210       /* Niters for at least one iteration of vectorized loop.  */
2211       niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2212       /* One additional iteration because of peeling for gap.  */
2213       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2214         niters_th += 1;
2215       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2216     }
2217
2218   gcc_assert (known_eq (vectorization_factor,
2219                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2220
2221   /* Ok to vectorize!  */
2222   return true;
2223
2224 again:
2225   /* Try again with SLP forced off but if we didn't do any SLP there is
2226      no point in re-trying.  */
2227   if (!slp)
2228     return false;
2229
2230   /* If there are reduction chains re-trying will fail anyway.  */
2231   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2232     return false;
2233
2234   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2235      via interleaving or lane instructions.  */
2236   slp_instance instance;
2237   slp_tree node;
2238   unsigned i, j;
2239   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2240     {
2241       stmt_vec_info vinfo;
2242       vinfo = vinfo_for_stmt
2243           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2244       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2245         continue;
2246       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2247       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2248       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2249       if (! vect_store_lanes_supported (vectype, size)
2250           && ! vect_grouped_store_supported (vectype, size))
2251         return false;
2252       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2253         {
2254           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2255           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2256           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2257           size = STMT_VINFO_GROUP_SIZE (vinfo);
2258           vectype = STMT_VINFO_VECTYPE (vinfo);
2259           if (! vect_load_lanes_supported (vectype, size)
2260               && ! vect_grouped_load_supported (vectype, single_element_p,
2261                                                 size))
2262             return false;
2263         }
2264     }
2265
2266   if (dump_enabled_p ())
2267     dump_printf_loc (MSG_NOTE, vect_location,
2268                      "re-trying with SLP disabled\n");
2269
2270   /* Roll back state appropriately.  No SLP this time.  */
2271   slp = false;
2272   /* Restore vectorization factor as it were without SLP.  */
2273   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2274   /* Free the SLP instances.  */
2275   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2276     vect_free_slp_instance (instance);
2277   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2278   /* Reset SLP type to loop_vect on all stmts.  */
2279   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2280     {
2281       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2282       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2283            !gsi_end_p (si); gsi_next (&si))
2284         {
2285           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2286           STMT_SLP_TYPE (stmt_info) = loop_vect;
2287         }
2288       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2289            !gsi_end_p (si); gsi_next (&si))
2290         {
2291           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2292           STMT_SLP_TYPE (stmt_info) = loop_vect;
2293           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2294             {
2295               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2296               STMT_SLP_TYPE (stmt_info) = loop_vect;
2297               for (gimple_stmt_iterator pi
2298                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2299                    !gsi_end_p (pi); gsi_next (&pi))
2300                 {
2301                   gimple *pstmt = gsi_stmt (pi);
2302                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2303                 }
2304             }
2305         }
2306     }
2307   /* Free optimized alias test DDRS.  */
2308   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2309   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2310   /* Reset target cost data.  */
2311   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2312   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2313     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2314   /* Reset assorted flags.  */
2315   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2316   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2317   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2318   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2319
2320   goto start_over;
2321 }
2322
2323 /* Function vect_analyze_loop.
2324
2325    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2326    for it.  The different analyses will record information in the
2327    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2328    be vectorized.  */
2329 loop_vec_info
2330 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2331 {
2332   loop_vec_info loop_vinfo;
2333   unsigned int vector_sizes;
2334
2335   /* Autodetect first vector size we try.  */
2336   current_vector_size = 0;
2337   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2338
2339   if (dump_enabled_p ())
2340     dump_printf_loc (MSG_NOTE, vect_location,
2341                      "===== analyze_loop_nest =====\n");
2342
2343   if (loop_outer (loop)
2344       && loop_vec_info_for_loop (loop_outer (loop))
2345       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2346     {
2347       if (dump_enabled_p ())
2348         dump_printf_loc (MSG_NOTE, vect_location,
2349                          "outer-loop already vectorized.\n");
2350       return NULL;
2351     }
2352
2353   while (1)
2354     {
2355       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2356       loop_vinfo = vect_analyze_loop_form (loop);
2357       if (!loop_vinfo)
2358         {
2359           if (dump_enabled_p ())
2360             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2361                              "bad loop form.\n");
2362           return NULL;
2363         }
2364
2365       bool fatal = false;
2366
2367       if (orig_loop_vinfo)
2368         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2369
2370       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2371         {
2372           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2373
2374           return loop_vinfo;
2375         }
2376
2377       delete loop_vinfo;
2378
2379       vector_sizes &= ~current_vector_size;
2380       if (fatal
2381           || vector_sizes == 0
2382           || current_vector_size == 0)
2383         return NULL;
2384
2385       /* Try the next biggest vector size.  */
2386       current_vector_size = 1 << floor_log2 (vector_sizes);
2387       if (dump_enabled_p ())
2388         dump_printf_loc (MSG_NOTE, vect_location,
2389                          "***** Re-trying analysis with "
2390                          "vector size %d\n", current_vector_size);
2391     }
2392 }
2393
2394
2395 /* Function reduction_fn_for_scalar_code
2396
2397    Input:
2398    CODE - tree_code of a reduction operations.
2399
2400    Output:
2401    REDUC_FN - the corresponding internal function to be used to reduce the
2402       vector of partial results into a single scalar result, or IFN_LAST
2403       if the operation is a supported reduction operation, but does not have
2404       such an internal function.
2405
2406    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2407
2408 static bool
2409 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2410 {
2411   switch (code)
2412     {
2413       case MAX_EXPR:
2414         *reduc_fn = IFN_REDUC_MAX;
2415         return true;
2416
2417       case MIN_EXPR:
2418         *reduc_fn = IFN_REDUC_MIN;
2419         return true;
2420
2421       case PLUS_EXPR:
2422         *reduc_fn = IFN_REDUC_PLUS;
2423         return true;
2424
2425       case MULT_EXPR:
2426       case MINUS_EXPR:
2427       case BIT_IOR_EXPR:
2428       case BIT_XOR_EXPR:
2429       case BIT_AND_EXPR:
2430         *reduc_fn = IFN_LAST;
2431         return true;
2432
2433       default:
2434        return false;
2435     }
2436 }
2437
2438
2439 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2440    STMT is printed with a message MSG. */
2441
2442 static void
2443 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2444 {
2445   dump_printf_loc (msg_type, vect_location, "%s", msg);
2446   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2447 }
2448
2449
2450 /* Detect SLP reduction of the form:
2451
2452    #a1 = phi <a5, a0>
2453    a2 = operation (a1)
2454    a3 = operation (a2)
2455    a4 = operation (a3)
2456    a5 = operation (a4)
2457
2458    #a = phi <a5>
2459
2460    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2461    FIRST_STMT is the first reduction stmt in the chain
2462    (a2 = operation (a1)).
2463
2464    Return TRUE if a reduction chain was detected.  */
2465
2466 static bool
2467 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2468                        gimple *first_stmt)
2469 {
2470   struct loop *loop = (gimple_bb (phi))->loop_father;
2471   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2472   enum tree_code code;
2473   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2474   stmt_vec_info use_stmt_info, current_stmt_info;
2475   tree lhs;
2476   imm_use_iterator imm_iter;
2477   use_operand_p use_p;
2478   int nloop_uses, size = 0, n_out_of_loop_uses;
2479   bool found = false;
2480
2481   if (loop != vect_loop)
2482     return false;
2483
2484   lhs = PHI_RESULT (phi);
2485   code = gimple_assign_rhs_code (first_stmt);
2486   while (1)
2487     {
2488       nloop_uses = 0;
2489       n_out_of_loop_uses = 0;
2490       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2491         {
2492           gimple *use_stmt = USE_STMT (use_p);
2493           if (is_gimple_debug (use_stmt))
2494             continue;
2495
2496           /* Check if we got back to the reduction phi.  */
2497           if (use_stmt == phi)
2498             {
2499               loop_use_stmt = use_stmt;
2500               found = true;
2501               break;
2502             }
2503
2504           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2505             {
2506               loop_use_stmt = use_stmt;
2507               nloop_uses++;
2508             }
2509            else
2510              n_out_of_loop_uses++;
2511
2512            /* There are can be either a single use in the loop or two uses in
2513               phi nodes.  */
2514            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2515              return false;
2516         }
2517
2518       if (found)
2519         break;
2520
2521       /* We reached a statement with no loop uses.  */
2522       if (nloop_uses == 0)
2523         return false;
2524
2525       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2526       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2527         return false;
2528
2529       if (!is_gimple_assign (loop_use_stmt)
2530           || code != gimple_assign_rhs_code (loop_use_stmt)
2531           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2532         return false;
2533
2534       /* Insert USE_STMT into reduction chain.  */
2535       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2536       if (current_stmt)
2537         {
2538           current_stmt_info = vinfo_for_stmt (current_stmt);
2539           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2540           GROUP_FIRST_ELEMENT (use_stmt_info)
2541             = GROUP_FIRST_ELEMENT (current_stmt_info);
2542         }
2543       else
2544         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2545
2546       lhs = gimple_assign_lhs (loop_use_stmt);
2547       current_stmt = loop_use_stmt;
2548       size++;
2549    }
2550
2551   if (!found || loop_use_stmt != phi || size < 2)
2552     return false;
2553
2554   /* Swap the operands, if needed, to make the reduction operand be the second
2555      operand.  */
2556   lhs = PHI_RESULT (phi);
2557   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2558   while (next_stmt)
2559     {
2560       if (gimple_assign_rhs2 (next_stmt) == lhs)
2561         {
2562           tree op = gimple_assign_rhs1 (next_stmt);
2563           gimple *def_stmt = NULL;
2564
2565           if (TREE_CODE (op) == SSA_NAME)
2566             def_stmt = SSA_NAME_DEF_STMT (op);
2567
2568           /* Check that the other def is either defined in the loop
2569              ("vect_internal_def"), or it's an induction (defined by a
2570              loop-header phi-node).  */
2571           if (def_stmt
2572               && gimple_bb (def_stmt)
2573               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2574               && (is_gimple_assign (def_stmt)
2575                   || is_gimple_call (def_stmt)
2576                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2577                            == vect_induction_def
2578                   || (gimple_code (def_stmt) == GIMPLE_PHI
2579                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2580                                   == vect_internal_def
2581                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2582             {
2583               lhs = gimple_assign_lhs (next_stmt);
2584               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2585               continue;
2586             }
2587
2588           return false;
2589         }
2590       else
2591         {
2592           tree op = gimple_assign_rhs2 (next_stmt);
2593           gimple *def_stmt = NULL;
2594
2595           if (TREE_CODE (op) == SSA_NAME)
2596             def_stmt = SSA_NAME_DEF_STMT (op);
2597
2598           /* Check that the other def is either defined in the loop
2599             ("vect_internal_def"), or it's an induction (defined by a
2600             loop-header phi-node).  */
2601           if (def_stmt
2602               && gimple_bb (def_stmt)
2603               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2604               && (is_gimple_assign (def_stmt)
2605                   || is_gimple_call (def_stmt)
2606                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2607                               == vect_induction_def
2608                   || (gimple_code (def_stmt) == GIMPLE_PHI
2609                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2610                                   == vect_internal_def
2611                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2612             {
2613               if (dump_enabled_p ())
2614                 {
2615                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2616                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2617                 }
2618
2619               swap_ssa_operands (next_stmt,
2620                                  gimple_assign_rhs1_ptr (next_stmt),
2621                                  gimple_assign_rhs2_ptr (next_stmt));
2622               update_stmt (next_stmt);
2623
2624               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2625                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2626             }
2627           else
2628             return false;
2629         }
2630
2631       lhs = gimple_assign_lhs (next_stmt);
2632       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2633     }
2634
2635   /* Save the chain for further analysis in SLP detection.  */
2636   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2637   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2638   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2639
2640   return true;
2641 }
2642
2643
2644 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2645    reduction operation CODE has a handled computation expression.  */
2646
2647 bool
2648 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2649                       enum tree_code code)
2650 {
2651   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2652   auto_bitmap visited;
2653   tree lookfor = PHI_RESULT (phi);
2654   ssa_op_iter curri;
2655   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2656   while (USE_FROM_PTR (curr) != loop_arg)
2657     curr = op_iter_next_use (&curri);
2658   curri.i = curri.numops;
2659   do
2660     {
2661       path.safe_push (std::make_pair (curri, curr));
2662       tree use = USE_FROM_PTR (curr);
2663       if (use == lookfor)
2664         break;
2665       gimple *def = SSA_NAME_DEF_STMT (use);
2666       if (gimple_nop_p (def)
2667           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2668         {
2669 pop:
2670           do
2671             {
2672               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2673               curri = x.first;
2674               curr = x.second;
2675               do
2676                 curr = op_iter_next_use (&curri);
2677               /* Skip already visited or non-SSA operands (from iterating
2678                  over PHI args).  */
2679               while (curr != NULL_USE_OPERAND_P
2680                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2681                          || ! bitmap_set_bit (visited,
2682                                               SSA_NAME_VERSION
2683                                                 (USE_FROM_PTR (curr)))));
2684             }
2685           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2686           if (curr == NULL_USE_OPERAND_P)
2687             break;
2688         }
2689       else
2690         {
2691           if (gimple_code (def) == GIMPLE_PHI)
2692             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2693           else
2694             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2695           while (curr != NULL_USE_OPERAND_P
2696                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2697                      || ! bitmap_set_bit (visited,
2698                                           SSA_NAME_VERSION
2699                                             (USE_FROM_PTR (curr)))))
2700             curr = op_iter_next_use (&curri);
2701           if (curr == NULL_USE_OPERAND_P)
2702             goto pop;
2703         }
2704     }
2705   while (1);
2706   if (dump_file && (dump_flags & TDF_DETAILS))
2707     {
2708       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2709       unsigned i;
2710       std::pair<ssa_op_iter, use_operand_p> *x;
2711       FOR_EACH_VEC_ELT (path, i, x)
2712         {
2713           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2714           dump_printf (MSG_NOTE, " ");
2715         }
2716       dump_printf (MSG_NOTE, "\n");
2717     }
2718
2719   /* Check whether the reduction path detected is valid.  */
2720   bool fail = path.length () == 0;
2721   bool neg = false;
2722   for (unsigned i = 1; i < path.length (); ++i)
2723     {
2724       gimple *use_stmt = USE_STMT (path[i].second);
2725       tree op = USE_FROM_PTR (path[i].second);
2726       if (! has_single_use (op)
2727           || ! is_gimple_assign (use_stmt))
2728         {
2729           fail = true;
2730           break;
2731         }
2732       if (gimple_assign_rhs_code (use_stmt) != code)
2733         {
2734           if (code == PLUS_EXPR
2735               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2736             {
2737               /* Track whether we negate the reduction value each iteration.  */
2738               if (gimple_assign_rhs2 (use_stmt) == op)
2739                 neg = ! neg;
2740             }
2741           else
2742             {
2743               fail = true;
2744               break;
2745             }
2746         }
2747     }
2748   return ! fail && ! neg;
2749 }
2750
2751
2752 /* Function vect_is_simple_reduction
2753
2754    (1) Detect a cross-iteration def-use cycle that represents a simple
2755    reduction computation.  We look for the following pattern:
2756
2757    loop_header:
2758      a1 = phi < a0, a2 >
2759      a3 = ...
2760      a2 = operation (a3, a1)
2761
2762    or
2763
2764    a3 = ...
2765    loop_header:
2766      a1 = phi < a0, a2 >
2767      a2 = operation (a3, a1)
2768
2769    such that:
2770    1. operation is commutative and associative and it is safe to
2771       change the order of the computation
2772    2. no uses for a2 in the loop (a2 is used out of the loop)
2773    3. no uses of a1 in the loop besides the reduction operation
2774    4. no uses of a1 outside the loop.
2775
2776    Conditions 1,4 are tested here.
2777    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2778
2779    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2780    nested cycles.
2781
2782    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2783    reductions:
2784
2785      a1 = phi < a0, a2 >
2786      inner loop (def of a3)
2787      a2 = phi < a3 >
2788
2789    (4) Detect condition expressions, ie:
2790      for (int i = 0; i < N; i++)
2791        if (a[i] < val)
2792         ret_val = a[i];
2793
2794 */
2795
2796 static gimple *
2797 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2798                           bool *double_reduc,
2799                           bool need_wrapping_integral_overflow,
2800                           enum vect_reduction_type *v_reduc_type)
2801 {
2802   struct loop *loop = (gimple_bb (phi))->loop_father;
2803   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2804   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2805   enum tree_code orig_code, code;
2806   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2807   tree type;
2808   int nloop_uses;
2809   tree name;
2810   imm_use_iterator imm_iter;
2811   use_operand_p use_p;
2812   bool phi_def;
2813
2814   *double_reduc = false;
2815   *v_reduc_type = TREE_CODE_REDUCTION;
2816
2817   tree phi_name = PHI_RESULT (phi);
2818   /* ???  If there are no uses of the PHI result the inner loop reduction
2819      won't be detected as possibly double-reduction by vectorizable_reduction
2820      because that tries to walk the PHI arg from the preheader edge which
2821      can be constant.  See PR60382.  */
2822   if (has_zero_uses (phi_name))
2823     return NULL;
2824   nloop_uses = 0;
2825   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2826     {
2827       gimple *use_stmt = USE_STMT (use_p);
2828       if (is_gimple_debug (use_stmt))
2829         continue;
2830
2831       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2832         {
2833           if (dump_enabled_p ())
2834             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2835                              "intermediate value used outside loop.\n");
2836
2837           return NULL;
2838         }
2839
2840       nloop_uses++;
2841       if (nloop_uses > 1)
2842         {
2843           if (dump_enabled_p ())
2844             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2845                              "reduction value used in loop.\n");
2846           return NULL;
2847         }
2848
2849       phi_use_stmt = use_stmt;
2850     }
2851
2852   edge latch_e = loop_latch_edge (loop);
2853   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2854   if (TREE_CODE (loop_arg) != SSA_NAME)
2855     {
2856       if (dump_enabled_p ())
2857         {
2858           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2859                            "reduction: not ssa_name: ");
2860           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2861           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2862         }
2863       return NULL;
2864     }
2865
2866   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2867   if (is_gimple_assign (def_stmt))
2868     {
2869       name = gimple_assign_lhs (def_stmt);
2870       phi_def = false;
2871     }
2872   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2873     {
2874       name = PHI_RESULT (def_stmt);
2875       phi_def = true;
2876     }
2877   else
2878     {
2879       if (dump_enabled_p ())
2880         {
2881           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2882                            "reduction: unhandled reduction operation: ");
2883           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2884         }
2885       return NULL;
2886     }
2887
2888   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2889     return NULL;
2890
2891   nloop_uses = 0;
2892   auto_vec<gphi *, 3> lcphis;
2893   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2894     {
2895       gimple *use_stmt = USE_STMT (use_p);
2896       if (is_gimple_debug (use_stmt))
2897         continue;
2898       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2899         nloop_uses++;
2900       else
2901         /* We can have more than one loop-closed PHI.  */
2902         lcphis.safe_push (as_a <gphi *> (use_stmt));
2903       if (nloop_uses > 1)
2904         {
2905           if (dump_enabled_p ())
2906             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2907                              "reduction used in loop.\n");
2908           return NULL;
2909         }
2910     }
2911
2912   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2913      defined in the inner loop.  */
2914   if (phi_def)
2915     {
2916       op1 = PHI_ARG_DEF (def_stmt, 0);
2917
2918       if (gimple_phi_num_args (def_stmt) != 1
2919           || TREE_CODE (op1) != SSA_NAME)
2920         {
2921           if (dump_enabled_p ())
2922             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2923                              "unsupported phi node definition.\n");
2924
2925           return NULL;
2926         }
2927
2928       def1 = SSA_NAME_DEF_STMT (op1);
2929       if (gimple_bb (def1)
2930           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2931           && loop->inner
2932           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2933           && is_gimple_assign (def1)
2934           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2935         {
2936           if (dump_enabled_p ())
2937             report_vect_op (MSG_NOTE, def_stmt,
2938                             "detected double reduction: ");
2939
2940           *double_reduc = true;
2941           return def_stmt;
2942         }
2943
2944       return NULL;
2945     }
2946
2947   /* If we are vectorizing an inner reduction we are executing that
2948      in the original order only in case we are not dealing with a
2949      double reduction.  */
2950   bool check_reduction = true;
2951   if (flow_loop_nested_p (vect_loop, loop))
2952     {
2953       gphi *lcphi;
2954       unsigned i;
2955       check_reduction = false;
2956       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2957         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2958           {
2959             gimple *use_stmt = USE_STMT (use_p);
2960             if (is_gimple_debug (use_stmt))
2961               continue;
2962             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2963               check_reduction = true;
2964           }
2965     }
2966
2967   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2968   code = orig_code = gimple_assign_rhs_code (def_stmt);
2969
2970   /* We can handle "res -= x[i]", which is non-associative by
2971      simply rewriting this into "res += -x[i]".  Avoid changing
2972      gimple instruction for the first simple tests and only do this
2973      if we're allowed to change code at all.  */
2974   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2975     code = PLUS_EXPR;
2976
2977   if (code == COND_EXPR)
2978     {
2979       if (! nested_in_vect_loop)
2980         *v_reduc_type = COND_REDUCTION;
2981
2982       op3 = gimple_assign_rhs1 (def_stmt);
2983       if (COMPARISON_CLASS_P (op3))
2984         {
2985           op4 = TREE_OPERAND (op3, 1);
2986           op3 = TREE_OPERAND (op3, 0);
2987         }
2988       if (op3 == phi_name || op4 == phi_name)
2989         {
2990           if (dump_enabled_p ())
2991             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2992                             "reduction: condition depends on previous"
2993                             " iteration: ");
2994           return NULL;
2995         }
2996
2997       op1 = gimple_assign_rhs2 (def_stmt);
2998       op2 = gimple_assign_rhs3 (def_stmt);
2999     }
3000   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3001     {
3002       if (dump_enabled_p ())
3003         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3004                         "reduction: not commutative/associative: ");
3005       return NULL;
3006     }
3007   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3008     {
3009       op1 = gimple_assign_rhs1 (def_stmt);
3010       op2 = gimple_assign_rhs2 (def_stmt);
3011     }
3012   else
3013     {
3014       if (dump_enabled_p ())
3015         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3016                         "reduction: not handled operation: ");
3017       return NULL;
3018     }
3019
3020   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3021     {
3022       if (dump_enabled_p ())
3023         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3024                         "reduction: both uses not ssa_names: ");
3025
3026       return NULL;
3027     }
3028
3029   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3030   if ((TREE_CODE (op1) == SSA_NAME
3031        && !types_compatible_p (type,TREE_TYPE (op1)))
3032       || (TREE_CODE (op2) == SSA_NAME
3033           && !types_compatible_p (type, TREE_TYPE (op2)))
3034       || (op3 && TREE_CODE (op3) == SSA_NAME
3035           && !types_compatible_p (type, TREE_TYPE (op3)))
3036       || (op4 && TREE_CODE (op4) == SSA_NAME
3037           && !types_compatible_p (type, TREE_TYPE (op4))))
3038     {
3039       if (dump_enabled_p ())
3040         {
3041           dump_printf_loc (MSG_NOTE, vect_location,
3042                            "reduction: multiple types: operation type: ");
3043           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3044           dump_printf (MSG_NOTE, ", operands types: ");
3045           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3046                              TREE_TYPE (op1));
3047           dump_printf (MSG_NOTE, ",");
3048           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3049                              TREE_TYPE (op2));
3050           if (op3)
3051             {
3052               dump_printf (MSG_NOTE, ",");
3053               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3054                                  TREE_TYPE (op3));
3055             }
3056
3057           if (op4)
3058             {
3059               dump_printf (MSG_NOTE, ",");
3060               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3061                                  TREE_TYPE (op4));
3062             }
3063           dump_printf (MSG_NOTE, "\n");
3064         }
3065
3066       return NULL;
3067     }
3068
3069   /* Check that it's ok to change the order of the computation.
3070      Generally, when vectorizing a reduction we change the order of the
3071      computation.  This may change the behavior of the program in some
3072      cases, so we need to check that this is ok.  One exception is when
3073      vectorizing an outer-loop: the inner-loop is executed sequentially,
3074      and therefore vectorizing reductions in the inner-loop during
3075      outer-loop vectorization is safe.  */
3076
3077   if (*v_reduc_type != COND_REDUCTION
3078       && check_reduction)
3079     {
3080       /* CHECKME: check for !flag_finite_math_only too?  */
3081       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
3082         {
3083           /* Changing the order of operations changes the semantics.  */
3084           if (dump_enabled_p ())
3085             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3086                         "reduction: unsafe fp math optimization: ");
3087           return NULL;
3088         }
3089       else if (INTEGRAL_TYPE_P (type))
3090         {
3091           if (!operation_no_trapping_overflow (type, code))
3092             {
3093               /* Changing the order of operations changes the semantics.  */
3094               if (dump_enabled_p ())
3095                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3096                                 "reduction: unsafe int math optimization"
3097                                 " (overflow traps): ");
3098               return NULL;
3099             }
3100           if (need_wrapping_integral_overflow
3101               && !TYPE_OVERFLOW_WRAPS (type)
3102               && operation_can_overflow (code))
3103             {
3104               /* Changing the order of operations changes the semantics.  */
3105               if (dump_enabled_p ())
3106                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3107                                 "reduction: unsafe int math optimization"
3108                                 " (overflow doesn't wrap): ");
3109               return NULL;
3110             }
3111         }
3112       else if (SAT_FIXED_POINT_TYPE_P (type))
3113         {
3114           /* Changing the order of operations changes the semantics.  */
3115           if (dump_enabled_p ())
3116           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3117                           "reduction: unsafe fixed-point math optimization: ");
3118           return NULL;
3119         }
3120     }
3121
3122   /* Reduction is safe. We're dealing with one of the following:
3123      1) integer arithmetic and no trapv
3124      2) floating point arithmetic, and special flags permit this optimization
3125      3) nested cycle (i.e., outer loop vectorization).  */
3126   if (TREE_CODE (op1) == SSA_NAME)
3127     def1 = SSA_NAME_DEF_STMT (op1);
3128
3129   if (TREE_CODE (op2) == SSA_NAME)
3130     def2 = SSA_NAME_DEF_STMT (op2);
3131
3132   if (code != COND_EXPR
3133       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3134     {
3135       if (dump_enabled_p ())
3136         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3137       return NULL;
3138     }
3139
3140   /* Check that one def is the reduction def, defined by PHI,
3141      the other def is either defined in the loop ("vect_internal_def"),
3142      or it's an induction (defined by a loop-header phi-node).  */
3143
3144   if (def2 && def2 == phi
3145       && (code == COND_EXPR
3146           || !def1 || gimple_nop_p (def1)
3147           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3148           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3149               && (is_gimple_assign (def1)
3150                   || is_gimple_call (def1)
3151                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3152                       == vect_induction_def
3153                   || (gimple_code (def1) == GIMPLE_PHI
3154                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3155                           == vect_internal_def
3156                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3157     {
3158       if (dump_enabled_p ())
3159         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3160       return def_stmt;
3161     }
3162
3163   if (def1 && def1 == phi
3164       && (code == COND_EXPR
3165           || !def2 || gimple_nop_p (def2)
3166           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3167           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3168               && (is_gimple_assign (def2)
3169                   || is_gimple_call (def2)
3170                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3171                        == vect_induction_def
3172                   || (gimple_code (def2) == GIMPLE_PHI
3173                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3174                            == vect_internal_def
3175                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3176     {
3177       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3178         {
3179           /* Check if we can swap operands (just for simplicity - so that
3180              the rest of the code can assume that the reduction variable
3181              is always the last (second) argument).  */
3182           if (code == COND_EXPR)
3183             {
3184               /* Swap cond_expr by inverting the condition.  */
3185               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3186               enum tree_code invert_code = ERROR_MARK;
3187               enum tree_code cond_code = TREE_CODE (cond_expr);
3188
3189               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3190                 {
3191                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3192                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3193                 }
3194               if (invert_code != ERROR_MARK)
3195                 {
3196                   TREE_SET_CODE (cond_expr, invert_code);
3197                   swap_ssa_operands (def_stmt,
3198                                      gimple_assign_rhs2_ptr (def_stmt),
3199                                      gimple_assign_rhs3_ptr (def_stmt));
3200                 }
3201               else
3202                 {
3203                   if (dump_enabled_p ())
3204                     report_vect_op (MSG_NOTE, def_stmt,
3205                                     "detected reduction: cannot swap operands "
3206                                     "for cond_expr");
3207                   return NULL;
3208                 }
3209             }
3210           else
3211             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3212                                gimple_assign_rhs2_ptr (def_stmt));
3213
3214           if (dump_enabled_p ())
3215             report_vect_op (MSG_NOTE, def_stmt,
3216                             "detected reduction: need to swap operands: ");
3217
3218           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3219             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3220         }
3221       else
3222         {
3223           if (dump_enabled_p ())
3224             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3225         }
3226
3227       return def_stmt;
3228     }
3229
3230   /* Try to find SLP reduction chain.  */
3231   if (! nested_in_vect_loop
3232       && code != COND_EXPR
3233       && orig_code != MINUS_EXPR
3234       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3235     {
3236       if (dump_enabled_p ())
3237         report_vect_op (MSG_NOTE, def_stmt,
3238                         "reduction: detected reduction chain: ");
3239
3240       return def_stmt;
3241     }
3242
3243   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3244   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3245   while (first)
3246     {
3247       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3248       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3249       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3250       first = next;
3251     }
3252
3253   /* Look for the expression computing loop_arg from loop PHI result.  */
3254   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3255                             code))
3256     return def_stmt;
3257
3258   if (dump_enabled_p ())
3259     {
3260       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3261                       "reduction: unknown pattern: ");
3262     }
3263
3264   return NULL;
3265 }
3266
3267 /* Wrapper around vect_is_simple_reduction, which will modify code
3268    in-place if it enables detection of more reductions.  Arguments
3269    as there.  */
3270
3271 gimple *
3272 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3273                              bool *double_reduc,
3274                              bool need_wrapping_integral_overflow)
3275 {
3276   enum vect_reduction_type v_reduc_type;
3277   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3278                                           need_wrapping_integral_overflow,
3279                                           &v_reduc_type);
3280   if (def)
3281     {
3282       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3283       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3284       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3285       reduc_def_info = vinfo_for_stmt (def);
3286       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3287     }
3288   return def;
3289 }
3290
3291 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3292 int
3293 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3294                              int *peel_iters_epilogue,
3295                              stmt_vector_for_cost *scalar_cost_vec,
3296                              stmt_vector_for_cost *prologue_cost_vec,
3297                              stmt_vector_for_cost *epilogue_cost_vec)
3298 {
3299   int retval = 0;
3300   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3301
3302   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3303     {
3304       *peel_iters_epilogue = assumed_vf / 2;
3305       if (dump_enabled_p ())
3306         dump_printf_loc (MSG_NOTE, vect_location,
3307                          "cost model: epilogue peel iters set to vf/2 "
3308                          "because loop iterations are unknown .\n");
3309
3310       /* If peeled iterations are known but number of scalar loop
3311          iterations are unknown, count a taken branch per peeled loop.  */
3312       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3313                                  NULL, 0, vect_prologue);
3314       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3315                                  NULL, 0, vect_epilogue);
3316     }
3317   else
3318     {
3319       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3320       peel_iters_prologue = niters < peel_iters_prologue ?
3321                             niters : peel_iters_prologue;
3322       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3323       /* If we need to peel for gaps, but no peeling is required, we have to
3324          peel VF iterations.  */
3325       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3326         *peel_iters_epilogue = assumed_vf;
3327     }
3328
3329   stmt_info_for_cost *si;
3330   int j;
3331   if (peel_iters_prologue)
3332     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3333         {
3334           stmt_vec_info stmt_info
3335             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3336           retval += record_stmt_cost (prologue_cost_vec,
3337                                       si->count * peel_iters_prologue,
3338                                       si->kind, stmt_info, si->misalign,
3339                                       vect_prologue);
3340         }
3341   if (*peel_iters_epilogue)
3342     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3343         {
3344           stmt_vec_info stmt_info
3345             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3346           retval += record_stmt_cost (epilogue_cost_vec,
3347                                       si->count * *peel_iters_epilogue,
3348                                       si->kind, stmt_info, si->misalign,
3349                                       vect_epilogue);
3350         }
3351
3352   return retval;
3353 }
3354
3355 /* Function vect_estimate_min_profitable_iters
3356
3357    Return the number of iterations required for the vector version of the
3358    loop to be profitable relative to the cost of the scalar version of the
3359    loop.
3360
3361    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3362    of iterations for vectorization.  -1 value means loop vectorization
3363    is not profitable.  This returned value may be used for dynamic
3364    profitability check.
3365
3366    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3367    for static check against estimated number of iterations.  */
3368
3369 static void
3370 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3371                                     int *ret_min_profitable_niters,
3372                                     int *ret_min_profitable_estimate)
3373 {
3374   int min_profitable_iters;
3375   int min_profitable_estimate;
3376   int peel_iters_prologue;
3377   int peel_iters_epilogue;
3378   unsigned vec_inside_cost = 0;
3379   int vec_outside_cost = 0;
3380   unsigned vec_prologue_cost = 0;
3381   unsigned vec_epilogue_cost = 0;
3382   int scalar_single_iter_cost = 0;
3383   int scalar_outside_cost = 0;
3384   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3385   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3386   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3387
3388   /* Cost model disabled.  */
3389   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3390     {
3391       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3392       *ret_min_profitable_niters = 0;
3393       *ret_min_profitable_estimate = 0;
3394       return;
3395     }
3396
3397   /* Requires loop versioning tests to handle misalignment.  */
3398   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3399     {
3400       /*  FIXME: Make cost depend on complexity of individual check.  */
3401       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3402       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3403                             vect_prologue);
3404       dump_printf (MSG_NOTE,
3405                    "cost model: Adding cost of checks for loop "
3406                    "versioning to treat misalignment.\n");
3407     }
3408
3409   /* Requires loop versioning with alias checks.  */
3410   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3411     {
3412       /*  FIXME: Make cost depend on complexity of individual check.  */
3413       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3414       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3415                             vect_prologue);
3416       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3417       if (len)
3418         /* Count LEN - 1 ANDs and LEN comparisons.  */
3419         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3420                               NULL, 0, vect_prologue);
3421       dump_printf (MSG_NOTE,
3422                    "cost model: Adding cost of checks for loop "
3423                    "versioning aliasing.\n");
3424     }
3425
3426   /* Requires loop versioning with niter checks.  */
3427   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3428     {
3429       /*  FIXME: Make cost depend on complexity of individual check.  */
3430       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3431                             vect_prologue);
3432       dump_printf (MSG_NOTE,
3433                    "cost model: Adding cost of checks for loop "
3434                    "versioning niters.\n");
3435     }
3436
3437   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3438     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3439                           vect_prologue);
3440
3441   /* Count statements in scalar loop.  Using this as scalar cost for a single
3442      iteration for now.
3443
3444      TODO: Add outer loop support.
3445
3446      TODO: Consider assigning different costs to different scalar
3447      statements.  */
3448
3449   scalar_single_iter_cost
3450     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3451
3452   /* Add additional cost for the peeled instructions in prologue and epilogue
3453      loop.
3454
3455      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3456      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3457
3458      TODO: Build an expression that represents peel_iters for prologue and
3459      epilogue to be used in a run-time test.  */
3460
3461   if (npeel  < 0)
3462     {
3463       peel_iters_prologue = assumed_vf / 2;
3464       dump_printf (MSG_NOTE, "cost model: "
3465                    "prologue peel iters set to vf/2.\n");
3466
3467       /* If peeling for alignment is unknown, loop bound of main loop becomes
3468          unknown.  */
3469       peel_iters_epilogue = assumed_vf / 2;
3470       dump_printf (MSG_NOTE, "cost model: "
3471                    "epilogue peel iters set to vf/2 because "
3472                    "peeling for alignment is unknown.\n");
3473
3474       /* If peeled iterations are unknown, count a taken branch and a not taken
3475          branch per peeled loop. Even if scalar loop iterations are known,
3476          vector iterations are not known since peeled prologue iterations are
3477          not known. Hence guards remain the same.  */
3478       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3479                             NULL, 0, vect_prologue);
3480       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3481                             NULL, 0, vect_prologue);
3482       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3483                             NULL, 0, vect_epilogue);
3484       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3485                             NULL, 0, vect_epilogue);
3486       stmt_info_for_cost *si;
3487       int j;
3488       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3489         {
3490           struct _stmt_vec_info *stmt_info
3491             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3492           (void) add_stmt_cost (target_cost_data,
3493                                 si->count * peel_iters_prologue,
3494                                 si->kind, stmt_info, si->misalign,
3495                                 vect_prologue);
3496           (void) add_stmt_cost (target_cost_data,
3497                                 si->count * peel_iters_epilogue,
3498                                 si->kind, stmt_info, si->misalign,
3499                                 vect_epilogue);
3500         }
3501     }
3502   else
3503     {
3504       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3505       stmt_info_for_cost *si;
3506       int j;
3507       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3508
3509       prologue_cost_vec.create (2);
3510       epilogue_cost_vec.create (2);
3511       peel_iters_prologue = npeel;
3512
3513       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3514                                           &peel_iters_epilogue,
3515                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3516                                             (loop_vinfo),
3517                                           &prologue_cost_vec,
3518                                           &epilogue_cost_vec);
3519
3520       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3521         {
3522           struct _stmt_vec_info *stmt_info
3523             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3524           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3525                                 si->misalign, vect_prologue);
3526         }
3527
3528       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3529         {
3530           struct _stmt_vec_info *stmt_info
3531             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3532           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3533                                 si->misalign, vect_epilogue);
3534         }
3535
3536       prologue_cost_vec.release ();
3537       epilogue_cost_vec.release ();
3538     }
3539
3540   /* FORNOW: The scalar outside cost is incremented in one of the
3541      following ways:
3542
3543      1. The vectorizer checks for alignment and aliasing and generates
3544      a condition that allows dynamic vectorization.  A cost model
3545      check is ANDED with the versioning condition.  Hence scalar code
3546      path now has the added cost of the versioning check.
3547
3548        if (cost > th & versioning_check)
3549          jmp to vector code
3550
3551      Hence run-time scalar is incremented by not-taken branch cost.
3552
3553      2. The vectorizer then checks if a prologue is required.  If the
3554      cost model check was not done before during versioning, it has to
3555      be done before the prologue check.
3556
3557        if (cost <= th)
3558          prologue = scalar_iters
3559        if (prologue == 0)
3560          jmp to vector code
3561        else
3562          execute prologue
3563        if (prologue == num_iters)
3564          go to exit
3565
3566      Hence the run-time scalar cost is incremented by a taken branch,
3567      plus a not-taken branch, plus a taken branch cost.
3568
3569      3. The vectorizer then checks if an epilogue is required.  If the
3570      cost model check was not done before during prologue check, it
3571      has to be done with the epilogue check.
3572
3573        if (prologue == 0)
3574          jmp to vector code
3575        else
3576          execute prologue
3577        if (prologue == num_iters)
3578          go to exit
3579        vector code:
3580          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3581            jmp to epilogue
3582
3583      Hence the run-time scalar cost should be incremented by 2 taken
3584      branches.
3585
3586      TODO: The back end may reorder the BBS's differently and reverse
3587      conditions/branch directions.  Change the estimates below to
3588      something more reasonable.  */
3589
3590   /* If the number of iterations is known and we do not do versioning, we can
3591      decide whether to vectorize at compile time.  Hence the scalar version
3592      do not carry cost model guard costs.  */
3593   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3594       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3595     {
3596       /* Cost model check occurs at versioning.  */
3597       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3598         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3599       else
3600         {
3601           /* Cost model check occurs at prologue generation.  */
3602           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3603             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3604               + vect_get_stmt_cost (cond_branch_not_taken);
3605           /* Cost model check occurs at epilogue generation.  */
3606           else
3607             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3608         }
3609     }
3610
3611   /* Complete the target-specific cost calculations.  */
3612   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3613                &vec_inside_cost, &vec_epilogue_cost);
3614
3615   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3616
3617   if (dump_enabled_p ())
3618     {
3619       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3620       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3621                    vec_inside_cost);
3622       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3623                    vec_prologue_cost);
3624       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3625                    vec_epilogue_cost);
3626       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3627                    scalar_single_iter_cost);
3628       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3629                    scalar_outside_cost);
3630       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3631                    vec_outside_cost);
3632       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3633                    peel_iters_prologue);
3634       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3635                    peel_iters_epilogue);
3636     }
3637
3638   /* Calculate number of iterations required to make the vector version
3639      profitable, relative to the loop bodies only.  The following condition
3640      must hold true:
3641      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3642      where
3643      SIC = scalar iteration cost, VIC = vector iteration cost,
3644      VOC = vector outside cost, VF = vectorization factor,
3645      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3646      SOC = scalar outside cost for run time cost model check.  */
3647
3648   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3649     {
3650       if (vec_outside_cost <= 0)
3651         min_profitable_iters = 0;
3652       else
3653         {
3654           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3655                                   * assumed_vf
3656                                   - vec_inside_cost * peel_iters_prologue
3657                                   - vec_inside_cost * peel_iters_epilogue)
3658                                  / ((scalar_single_iter_cost * assumed_vf)
3659                                     - vec_inside_cost);
3660
3661           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3662               <= (((int) vec_inside_cost * min_profitable_iters)
3663                   + (((int) vec_outside_cost - scalar_outside_cost)
3664                      * assumed_vf)))
3665             min_profitable_iters++;
3666         }
3667     }
3668   /* vector version will never be profitable.  */
3669   else
3670     {
3671       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3672         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3673                     "did not happen for a simd loop");
3674
3675       if (dump_enabled_p ())
3676         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3677                          "cost model: the vector iteration cost = %d "
3678                          "divided by the scalar iteration cost = %d "
3679                          "is greater or equal to the vectorization factor = %d"
3680                          ".\n",
3681                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3682       *ret_min_profitable_niters = -1;
3683       *ret_min_profitable_estimate = -1;
3684       return;
3685     }
3686
3687   dump_printf (MSG_NOTE,
3688                "  Calculated minimum iters for profitability: %d\n",
3689                min_profitable_iters);
3690
3691   /* We want the vectorized loop to execute at least once.  */
3692   if (min_profitable_iters < (assumed_vf + peel_iters_prologue))
3693     min_profitable_iters = assumed_vf + peel_iters_prologue;
3694
3695   if (dump_enabled_p ())
3696     dump_printf_loc (MSG_NOTE, vect_location,
3697                      "  Runtime profitability threshold = %d\n",
3698                      min_profitable_iters);
3699
3700   *ret_min_profitable_niters = min_profitable_iters;
3701
3702   /* Calculate number of iterations required to make the vector version
3703      profitable, relative to the loop bodies only.
3704
3705      Non-vectorized variant is SIC * niters and it must win over vector
3706      variant on the expected loop trip count.  The following condition must hold true:
3707      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3708
3709   if (vec_outside_cost <= 0)
3710     min_profitable_estimate = 0;
3711   else
3712     {
3713       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3714                                  * assumed_vf
3715                                  - vec_inside_cost * peel_iters_prologue
3716                                  - vec_inside_cost * peel_iters_epilogue)
3717                                  / ((scalar_single_iter_cost * assumed_vf)
3718                                    - vec_inside_cost);
3719     }
3720   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3721   if (dump_enabled_p ())
3722     dump_printf_loc (MSG_NOTE, vect_location,
3723                      "  Static estimate profitability threshold = %d\n",
3724                      min_profitable_estimate);
3725
3726   *ret_min_profitable_estimate = min_profitable_estimate;
3727 }
3728
3729 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3730    vector elements (not bits) for a vector with NELT elements.  */
3731 static void
3732 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3733                               vec_perm_builder *sel)
3734 {
3735   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3736      by vec_perm_indices.  */
3737   sel->new_vector (nelt, 1, 3);
3738   for (unsigned int i = 0; i < 3; i++)
3739     sel->quick_push (i + offset);
3740 }
3741
3742 /* Checks whether the target supports whole-vector shifts for vectors of mode
3743    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3744    it supports vec_perm_const with masks for all necessary shift amounts.  */
3745 static bool
3746 have_whole_vector_shift (machine_mode mode)
3747 {
3748   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3749     return true;
3750
3751   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3752   vec_perm_builder sel;
3753   vec_perm_indices indices;
3754   for (i = nelt/2; i >= 1; i/=2)
3755     {
3756       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3757       indices.new_vector (sel, 2, nelt);
3758       if (!can_vec_perm_const_p (mode, indices, false))
3759         return false;
3760     }
3761   return true;
3762 }
3763
3764 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3765    functions. Design better to avoid maintenance issues.  */
3766
3767 /* Function vect_model_reduction_cost.
3768
3769    Models cost for a reduction operation, including the vector ops
3770    generated within the strip-mine loop, the initial definition before
3771    the loop, and the epilogue code that must be generated.  */
3772
3773 static void
3774 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3775                            int ncopies)
3776 {
3777   int prologue_cost = 0, epilogue_cost = 0;
3778   enum tree_code code;
3779   optab optab;
3780   tree vectype;
3781   gimple *orig_stmt;
3782   machine_mode mode;
3783   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3784   struct loop *loop = NULL;
3785   void *target_cost_data;
3786
3787   if (loop_vinfo)
3788     {
3789       loop = LOOP_VINFO_LOOP (loop_vinfo);
3790       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3791     }
3792   else
3793     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3794
3795   /* Condition reductions generate two reductions in the loop.  */
3796   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3797     ncopies *= 2;
3798
3799   /* Cost of reduction op inside loop.  */
3800   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3801                                         stmt_info, 0, vect_body);
3802
3803   vectype = STMT_VINFO_VECTYPE (stmt_info);
3804   mode = TYPE_MODE (vectype);
3805   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3806
3807   if (!orig_stmt)
3808     orig_stmt = STMT_VINFO_STMT (stmt_info);
3809
3810   code = gimple_assign_rhs_code (orig_stmt);
3811
3812   /* Add in cost for initial definition.
3813      For cond reduction we have four vectors: initial index, step, initial
3814      result of the data reduction, initial value of the index reduction.  */
3815   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3816                        == COND_REDUCTION ? 4 : 1;
3817   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3818                                   scalar_to_vec, stmt_info, 0,
3819                                   vect_prologue);
3820
3821   /* Determine cost of epilogue code.
3822
3823      We have a reduction operator that will reduce the vector in one statement.
3824      Also requires scalar extract.  */
3825
3826   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3827     {
3828       if (reduc_fn != IFN_LAST)
3829         {
3830           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3831             {
3832               /* An EQ stmt and an COND_EXPR stmt.  */
3833               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3834                                               vector_stmt, stmt_info, 0,
3835                                               vect_epilogue);
3836               /* Reduction of the max index and a reduction of the found
3837                  values.  */
3838               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3839                                               vec_to_scalar, stmt_info, 0,
3840                                               vect_epilogue);
3841               /* A broadcast of the max value.  */
3842               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3843                                               scalar_to_vec, stmt_info, 0,
3844                                               vect_epilogue);
3845             }
3846           else
3847             {
3848               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3849                                               stmt_info, 0, vect_epilogue);
3850               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3851                                               vec_to_scalar, stmt_info, 0,
3852                                               vect_epilogue);
3853             }
3854         }
3855       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3856         {
3857           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3858           /* Extraction of scalar elements.  */
3859           epilogue_cost += add_stmt_cost (target_cost_data,
3860                                           2 * estimated_nunits,
3861                                           vec_to_scalar, stmt_info, 0,
3862                                           vect_epilogue);
3863           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3864           epilogue_cost += add_stmt_cost (target_cost_data,
3865                                           2 * estimated_nunits - 3,
3866                                           scalar_stmt, stmt_info, 0,
3867                                           vect_epilogue);
3868         }
3869       else
3870         {
3871           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3872           tree bitsize =
3873             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3874           int element_bitsize = tree_to_uhwi (bitsize);
3875           int nelements = vec_size_in_bits / element_bitsize;
3876
3877           if (code == COND_EXPR)
3878             code = MAX_EXPR;
3879
3880           optab = optab_for_tree_code (code, vectype, optab_default);
3881
3882           /* We have a whole vector shift available.  */
3883           if (optab != unknown_optab
3884               && VECTOR_MODE_P (mode)
3885               && optab_handler (optab, mode) != CODE_FOR_nothing
3886               && have_whole_vector_shift (mode))
3887             {
3888               /* Final reduction via vector shifts and the reduction operator.
3889                  Also requires scalar extract.  */
3890               epilogue_cost += add_stmt_cost (target_cost_data,
3891                                               exact_log2 (nelements) * 2,
3892                                               vector_stmt, stmt_info, 0,
3893                                               vect_epilogue);
3894               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3895                                               vec_to_scalar, stmt_info, 0,
3896                                               vect_epilogue);
3897             }
3898           else
3899             /* Use extracts and reduction op for final reduction.  For N
3900                elements, we have N extracts and N-1 reduction ops.  */
3901             epilogue_cost += add_stmt_cost (target_cost_data,
3902                                             nelements + nelements - 1,
3903                                             vector_stmt, stmt_info, 0,
3904                                             vect_epilogue);
3905         }
3906     }
3907
3908   if (dump_enabled_p ())
3909     dump_printf (MSG_NOTE,
3910                  "vect_model_reduction_cost: inside_cost = %d, "
3911                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3912                  prologue_cost, epilogue_cost);
3913 }
3914
3915
3916 /* Function vect_model_induction_cost.
3917
3918    Models cost for induction operations.  */
3919
3920 static void
3921 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3922 {
3923   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3924   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3925   unsigned inside_cost, prologue_cost;
3926
3927   if (PURE_SLP_STMT (stmt_info))
3928     return;
3929
3930   /* loop cost for vec_loop.  */
3931   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3932                                stmt_info, 0, vect_body);
3933
3934   /* prologue cost for vec_init and vec_step.  */
3935   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3936                                  stmt_info, 0, vect_prologue);
3937
3938   if (dump_enabled_p ())
3939     dump_printf_loc (MSG_NOTE, vect_location,
3940                      "vect_model_induction_cost: inside_cost = %d, "
3941                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3942 }
3943
3944
3945
3946 /* Function get_initial_def_for_reduction
3947
3948    Input:
3949    STMT - a stmt that performs a reduction operation in the loop.
3950    INIT_VAL - the initial value of the reduction variable
3951
3952    Output:
3953    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3954         of the reduction (used for adjusting the epilog - see below).
3955    Return a vector variable, initialized according to the operation that STMT
3956         performs. This vector will be used as the initial value of the
3957         vector of partial results.
3958
3959    Option1 (adjust in epilog): Initialize the vector as follows:
3960      add/bit or/xor:    [0,0,...,0,0]
3961      mult/bit and:      [1,1,...,1,1]
3962      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3963    and when necessary (e.g. add/mult case) let the caller know
3964    that it needs to adjust the result by init_val.
3965
3966    Option2: Initialize the vector as follows:
3967      add/bit or/xor:    [init_val,0,0,...,0]
3968      mult/bit and:      [init_val,1,1,...,1]
3969      min/max/cond_expr: [init_val,init_val,...,init_val]
3970    and no adjustments are needed.
3971
3972    For example, for the following code:
3973
3974    s = init_val;
3975    for (i=0;i<n;i++)
3976      s = s + a[i];
3977
3978    STMT is 's = s + a[i]', and the reduction variable is 's'.
3979    For a vector of 4 units, we want to return either [0,0,0,init_val],
3980    or [0,0,0,0] and let the caller know that it needs to adjust
3981    the result at the end by 'init_val'.
3982
3983    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3984    initialization vector is simpler (same element in all entries), if
3985    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3986
3987    A cost model should help decide between these two schemes.  */
3988
3989 tree
3990 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3991                                tree *adjustment_def)
3992 {
3993   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3994   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3995   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3996   tree scalar_type = TREE_TYPE (init_val);
3997   tree vectype = get_vectype_for_scalar_type (scalar_type);
3998   enum tree_code code = gimple_assign_rhs_code (stmt);
3999   tree def_for_init;
4000   tree init_def;
4001   bool nested_in_vect_loop = false;
4002   REAL_VALUE_TYPE real_init_val = dconst0;
4003   int int_init_val = 0;
4004   gimple *def_stmt = NULL;
4005   gimple_seq stmts = NULL;
4006
4007   gcc_assert (vectype);
4008
4009   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4010               || SCALAR_FLOAT_TYPE_P (scalar_type));
4011
4012   if (nested_in_vect_loop_p (loop, stmt))
4013     nested_in_vect_loop = true;
4014   else
4015     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4016
4017   /* In case of double reduction we only create a vector variable to be put
4018      in the reduction phi node.  The actual statement creation is done in
4019      vect_create_epilog_for_reduction.  */
4020   if (adjustment_def && nested_in_vect_loop
4021       && TREE_CODE (init_val) == SSA_NAME
4022       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4023       && gimple_code (def_stmt) == GIMPLE_PHI
4024       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4025       && vinfo_for_stmt (def_stmt)
4026       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4027           == vect_double_reduction_def)
4028     {
4029       *adjustment_def = NULL;
4030       return vect_create_destination_var (init_val, vectype);
4031     }
4032
4033   /* In case of a nested reduction do not use an adjustment def as
4034      that case is not supported by the epilogue generation correctly
4035      if ncopies is not one.  */
4036   if (adjustment_def && nested_in_vect_loop)
4037     {
4038       *adjustment_def = NULL;
4039       return vect_get_vec_def_for_operand (init_val, stmt);
4040     }
4041
4042   switch (code)
4043     {
4044     case WIDEN_SUM_EXPR:
4045     case DOT_PROD_EXPR:
4046     case SAD_EXPR:
4047     case PLUS_EXPR:
4048     case MINUS_EXPR:
4049     case BIT_IOR_EXPR:
4050     case BIT_XOR_EXPR:
4051     case MULT_EXPR:
4052     case BIT_AND_EXPR:
4053       {
4054         /* ADJUSTMENT_DEF is NULL when called from
4055            vect_create_epilog_for_reduction to vectorize double reduction.  */
4056         if (adjustment_def)
4057           *adjustment_def = init_val;
4058
4059         if (code == MULT_EXPR)
4060           {
4061             real_init_val = dconst1;
4062             int_init_val = 1;
4063           }
4064
4065         if (code == BIT_AND_EXPR)
4066           int_init_val = -1;
4067
4068         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4069           def_for_init = build_real (scalar_type, real_init_val);
4070         else
4071           def_for_init = build_int_cst (scalar_type, int_init_val);
4072
4073         if (adjustment_def)
4074           /* Option1: the first element is '0' or '1' as well.  */
4075           init_def = gimple_build_vector_from_val (&stmts, vectype,
4076                                                    def_for_init);
4077         else
4078           {
4079             /* Option2: the first element is INIT_VAL.  */
4080             tree_vector_builder elts (vectype, 1, 2);
4081             elts.quick_push (init_val);
4082             elts.quick_push (def_for_init);
4083             init_def = gimple_build_vector (&stmts, &elts);
4084           }
4085       }
4086       break;
4087
4088     case MIN_EXPR:
4089     case MAX_EXPR:
4090     case COND_EXPR:
4091       {
4092         if (adjustment_def)
4093           {
4094             *adjustment_def = NULL_TREE;
4095             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4096               {
4097                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4098                 break;
4099               }
4100           }
4101         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4102         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4103       }
4104       break;
4105
4106     default:
4107       gcc_unreachable ();
4108     }
4109
4110   if (stmts)
4111     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4112   return init_def;
4113 }
4114
4115 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4116    NUMBER_OF_VECTORS is the number of vector defs to create.  */
4117
4118 static void
4119 get_initial_defs_for_reduction (slp_tree slp_node,
4120                                 vec<tree> *vec_oprnds,
4121                                 unsigned int number_of_vectors,
4122                                 enum tree_code code, bool reduc_chain)
4123 {
4124   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4125   gimple *stmt = stmts[0];
4126   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4127   unsigned nunits;
4128   unsigned j, number_of_places_left_in_vector;
4129   tree vector_type, scalar_type;
4130   tree vop;
4131   int group_size = stmts.length ();
4132   unsigned int vec_num, i;
4133   unsigned number_of_copies = 1;
4134   vec<tree> voprnds;
4135   voprnds.create (number_of_vectors);
4136   tree neutral_op = NULL;
4137   struct loop *loop;
4138
4139   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4140   scalar_type = TREE_TYPE (vector_type);
4141   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4142
4143   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4144
4145   loop = (gimple_bb (stmt))->loop_father;
4146   gcc_assert (loop);
4147   edge pe = loop_preheader_edge (loop);
4148
4149   /* op is the reduction operand of the first stmt already.  */
4150   /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4151      we need either neutral operands or the original operands.  See
4152      get_initial_def_for_reduction() for details.  */
4153   switch (code)
4154     {
4155     case WIDEN_SUM_EXPR:
4156     case DOT_PROD_EXPR:
4157     case SAD_EXPR:
4158     case PLUS_EXPR:
4159     case MINUS_EXPR:
4160     case BIT_IOR_EXPR:
4161     case BIT_XOR_EXPR:
4162       neutral_op = build_zero_cst (scalar_type);
4163       break;
4164
4165     case MULT_EXPR:
4166       neutral_op = build_one_cst (scalar_type);
4167       break;
4168
4169     case BIT_AND_EXPR:
4170       neutral_op = build_all_ones_cst (scalar_type);
4171       break;
4172
4173     /* For MIN/MAX we don't have an easy neutral operand but
4174        the initial values can be used fine here.  Only for
4175        a reduction chain we have to force a neutral element.  */
4176     case MAX_EXPR:
4177     case MIN_EXPR:
4178       if (! reduc_chain)
4179         neutral_op = NULL;
4180       else
4181         neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4182       break;
4183
4184     default:
4185       gcc_assert (! reduc_chain);
4186       neutral_op = NULL;
4187     }
4188
4189   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4190      created vectors. It is greater than 1 if unrolling is performed.
4191
4192      For example, we have two scalar operands, s1 and s2 (e.g., group of
4193      strided accesses of size two), while NUNITS is four (i.e., four scalars
4194      of this type can be packed in a vector).  The output vector will contain
4195      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4196      will be 2).
4197
4198      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4199      containing the operands.
4200
4201      For example, NUNITS is four as before, and the group size is 8
4202      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4203      {s5, s6, s7, s8}.  */
4204
4205   number_of_copies = nunits * number_of_vectors / group_size;
4206
4207   number_of_places_left_in_vector = nunits;
4208   tree_vector_builder elts (vector_type, nunits, 1);
4209   elts.quick_grow (nunits);
4210   for (j = 0; j < number_of_copies; j++)
4211     {
4212       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4213         {
4214           tree op;
4215           /* Get the def before the loop.  In reduction chain we have only
4216              one initial value.  */
4217           if ((j != (number_of_copies - 1)
4218                || (reduc_chain && i != 0))
4219               && neutral_op)
4220             op = neutral_op;
4221           else
4222             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4223
4224           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4225           number_of_places_left_in_vector--;
4226           elts[number_of_places_left_in_vector] = op;
4227
4228           if (number_of_places_left_in_vector == 0)
4229             {
4230               gimple_seq ctor_seq = NULL;
4231               tree init = gimple_build_vector (&ctor_seq, &elts);
4232               if (ctor_seq != NULL)
4233                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4234               voprnds.quick_push (init);
4235
4236               number_of_places_left_in_vector = nunits;
4237               elts.new_vector (vector_type, nunits, 1);
4238               elts.quick_grow (nunits);
4239             }
4240         }
4241     }
4242
4243   /* Since the vectors are created in the reverse order, we should invert
4244      them.  */
4245   vec_num = voprnds.length ();
4246   for (j = vec_num; j != 0; j--)
4247     {
4248       vop = voprnds[j - 1];
4249       vec_oprnds->quick_push (vop);
4250     }
4251
4252   voprnds.release ();
4253
4254   /* In case that VF is greater than the unrolling factor needed for the SLP
4255      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4256      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4257      to replicate the vectors.  */
4258   tree neutral_vec = NULL;
4259   while (number_of_vectors > vec_oprnds->length ())
4260     {
4261       if (neutral_op)
4262         {
4263           if (!neutral_vec)
4264             {
4265               gimple_seq ctor_seq = NULL;
4266               neutral_vec = gimple_build_vector_from_val
4267                 (&ctor_seq, vector_type, neutral_op);
4268               if (ctor_seq != NULL)
4269                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4270             }
4271           vec_oprnds->quick_push (neutral_vec);
4272         }
4273       else
4274         {
4275           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4276             vec_oprnds->quick_push (vop);
4277         }
4278     }
4279 }
4280
4281
4282 /* Function vect_create_epilog_for_reduction
4283
4284    Create code at the loop-epilog to finalize the result of a reduction
4285    computation.
4286
4287    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4288      reduction statements.
4289    STMT is the scalar reduction stmt that is being vectorized.
4290    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4291      number of elements that we can fit in a vectype (nunits).  In this case
4292      we have to generate more than one vector stmt - i.e - we need to "unroll"
4293      the vector stmt by a factor VF/nunits.  For more details see documentation
4294      in vectorizable_operation.
4295    REDUC_FN is the internal function for the epilog reduction.
4296    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4297      computation.
4298    REDUC_INDEX is the index of the operand in the right hand side of the
4299      statement that is defined by REDUCTION_PHI.
4300    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4301    SLP_NODE is an SLP node containing a group of reduction statements. The
4302      first one in this group is STMT.
4303    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4304      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4305      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4306      any value of the IV in the loop.
4307    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4308
4309    This function:
4310    1. Creates the reduction def-use cycles: sets the arguments for
4311       REDUCTION_PHIS:
4312       The loop-entry argument is the vectorized initial-value of the reduction.
4313       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4314       sums.
4315    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4316       by calling the function specified by REDUC_FN if available, or by
4317       other means (whole-vector shifts or a scalar loop).
4318       The function also creates a new phi node at the loop exit to preserve
4319       loop-closed form, as illustrated below.
4320
4321      The flow at the entry to this function:
4322
4323         loop:
4324           vec_def = phi <null, null>            # REDUCTION_PHI
4325           VECT_DEF = vector_stmt                # vectorized form of STMT
4326           s_loop = scalar_stmt                  # (scalar) STMT
4327         loop_exit:
4328           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4329           use <s_out0>
4330           use <s_out0>
4331
4332      The above is transformed by this function into:
4333
4334         loop:
4335           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4336           VECT_DEF = vector_stmt                # vectorized form of STMT
4337           s_loop = scalar_stmt                  # (scalar) STMT
4338         loop_exit:
4339           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4340           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4341           v_out2 = reduce <v_out1>
4342           s_out3 = extract_field <v_out2, 0>
4343           s_out4 = adjust_result <s_out3>
4344           use <s_out4>
4345           use <s_out4>
4346 */
4347
4348 static void
4349 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4350                                   gimple *reduc_def_stmt,
4351                                   int ncopies, internal_fn reduc_fn,
4352                                   vec<gimple *> reduction_phis,
4353                                   bool double_reduc,
4354                                   slp_tree slp_node,
4355                                   slp_instance slp_node_instance,
4356                                   tree induc_val, enum tree_code induc_code)
4357 {
4358   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4359   stmt_vec_info prev_phi_info;
4360   tree vectype;
4361   machine_mode mode;
4362   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4363   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4364   basic_block exit_bb;
4365   tree scalar_dest;
4366   tree scalar_type;
4367   gimple *new_phi = NULL, *phi;
4368   gimple_stmt_iterator exit_gsi;
4369   tree vec_dest;
4370   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4371   gimple *epilog_stmt = NULL;
4372   enum tree_code code = gimple_assign_rhs_code (stmt);
4373   gimple *exit_phi;
4374   tree bitsize;
4375   tree adjustment_def = NULL;
4376   tree vec_initial_def = NULL;
4377   tree expr, def, initial_def = NULL;
4378   tree orig_name, scalar_result;
4379   imm_use_iterator imm_iter, phi_imm_iter;
4380   use_operand_p use_p, phi_use_p;
4381   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4382   bool nested_in_vect_loop = false;
4383   auto_vec<gimple *> new_phis;
4384   auto_vec<gimple *> inner_phis;
4385   enum vect_def_type dt = vect_unknown_def_type;
4386   int j, i;
4387   auto_vec<tree> scalar_results;
4388   unsigned int group_size = 1, k, ratio;
4389   auto_vec<tree> vec_initial_defs;
4390   auto_vec<gimple *> phis;
4391   bool slp_reduc = false;
4392   tree new_phi_result;
4393   gimple *inner_phi = NULL;
4394   tree induction_index = NULL_TREE;
4395
4396   if (slp_node)
4397     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4398
4399   if (nested_in_vect_loop_p (loop, stmt))
4400     {
4401       outer_loop = loop;
4402       loop = loop->inner;
4403       nested_in_vect_loop = true;
4404       gcc_assert (!slp_node);
4405     }
4406
4407   vectype = STMT_VINFO_VECTYPE (stmt_info);
4408   gcc_assert (vectype);
4409   mode = TYPE_MODE (vectype);
4410
4411   /* 1. Create the reduction def-use cycle:
4412      Set the arguments of REDUCTION_PHIS, i.e., transform
4413
4414         loop:
4415           vec_def = phi <null, null>            # REDUCTION_PHI
4416           VECT_DEF = vector_stmt                # vectorized form of STMT
4417           ...
4418
4419      into:
4420
4421         loop:
4422           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4423           VECT_DEF = vector_stmt                # vectorized form of STMT
4424           ...
4425
4426      (in case of SLP, do it for all the phis). */
4427
4428   /* Get the loop-entry arguments.  */
4429   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4430   if (slp_node)
4431     {
4432       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4433       vec_initial_defs.reserve (vec_num);
4434       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4435                                       &vec_initial_defs, vec_num, code,
4436                                       GROUP_FIRST_ELEMENT (stmt_info));
4437     }
4438   else
4439     {
4440       /* Get at the scalar def before the loop, that defines the initial value
4441          of the reduction variable.  */
4442       gimple *def_stmt;
4443       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4444                                            loop_preheader_edge (loop));
4445       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4446          and we can't use zero for induc_val, use initial_def.  Similarly
4447          for REDUC_MIN and initial_def larger than the base.  */
4448       if (TREE_CODE (initial_def) == INTEGER_CST
4449           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4450               == INTEGER_INDUC_COND_REDUCTION)
4451           && !integer_zerop (induc_val)
4452           && ((induc_code == MAX_EXPR
4453                && tree_int_cst_lt (initial_def, induc_val))
4454               || (induc_code == MIN_EXPR
4455                   && tree_int_cst_lt (induc_val, initial_def))))
4456         induc_val = initial_def;
4457       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4458       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4459                                                        &adjustment_def);
4460       vec_initial_defs.create (1);
4461       vec_initial_defs.quick_push (vec_initial_def);
4462     }
4463
4464   /* Set phi nodes arguments.  */
4465   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4466     {
4467       tree vec_init_def = vec_initial_defs[i];
4468       tree def = vect_defs[i];
4469       for (j = 0; j < ncopies; j++)
4470         {
4471           if (j != 0)
4472             {
4473               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4474               if (nested_in_vect_loop)
4475                 vec_init_def
4476                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4477                                                     vec_init_def);
4478             }
4479
4480           /* Set the loop-entry arg of the reduction-phi.  */
4481
4482           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4483               == INTEGER_INDUC_COND_REDUCTION)
4484             {
4485               /* Initialise the reduction phi to zero.  This prevents initial
4486                  values of non-zero interferring with the reduction op.  */
4487               gcc_assert (ncopies == 1);
4488               gcc_assert (i == 0);
4489
4490               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4491               tree induc_val_vec
4492                 = build_vector_from_val (vec_init_def_type, induc_val);
4493
4494               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4495                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4496             }
4497           else
4498             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4499                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4500
4501           /* Set the loop-latch arg for the reduction-phi.  */
4502           if (j > 0)
4503             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4504
4505           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4506                        UNKNOWN_LOCATION);
4507
4508           if (dump_enabled_p ())
4509             {
4510               dump_printf_loc (MSG_NOTE, vect_location,
4511                                "transform reduction: created def-use cycle: ");
4512               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4513               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4514             }
4515         }
4516     }
4517
4518   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4519      which is updated with the current index of the loop for every match of
4520      the original loop's cond_expr (VEC_STMT).  This results in a vector
4521      containing the last time the condition passed for that vector lane.
4522      The first match will be a 1 to allow 0 to be used for non-matching
4523      indexes.  If there are no matches at all then the vector will be all
4524      zeroes.  */
4525   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4526     {
4527       tree indx_before_incr, indx_after_incr;
4528       int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4529       int k;
4530
4531       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4532       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4533
4534       int scalar_precision
4535         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4536       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4537       tree cr_index_vector_type = build_vector_type
4538         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4539
4540       /* First we create a simple vector induction variable which starts
4541          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4542          vector size (STEP).  */
4543
4544       /* Create a {1,2,3,...} vector.  */
4545       tree_vector_builder vtemp (cr_index_vector_type, 1, 3);
4546       for (k = 0; k < 3; ++k)
4547         vtemp.quick_push (build_int_cst (cr_index_scalar_type, k + 1));
4548       tree series_vect = vtemp.build ();
4549
4550       /* Create a vector of the step value.  */
4551       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4552       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4553
4554       /* Create an induction variable.  */
4555       gimple_stmt_iterator incr_gsi;
4556       bool insert_after;
4557       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4558       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4559                  insert_after, &indx_before_incr, &indx_after_incr);
4560
4561       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4562          filled with zeros (VEC_ZERO).  */
4563
4564       /* Create a vector of 0s.  */
4565       tree zero = build_zero_cst (cr_index_scalar_type);
4566       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4567
4568       /* Create a vector phi node.  */
4569       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4570       new_phi = create_phi_node (new_phi_tree, loop->header);
4571       set_vinfo_for_stmt (new_phi,
4572                           new_stmt_vec_info (new_phi, loop_vinfo));
4573       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4574                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4575
4576       /* Now take the condition from the loops original cond_expr
4577          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4578          every match uses values from the induction variable
4579          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4580          (NEW_PHI_TREE).
4581          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4582          the new cond_expr (INDEX_COND_EXPR).  */
4583
4584       /* Duplicate the condition from vec_stmt.  */
4585       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4586
4587       /* Create a conditional, where the condition is taken from vec_stmt
4588          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4589          else is the phi (NEW_PHI_TREE).  */
4590       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4591                                      ccompare, indx_before_incr,
4592                                      new_phi_tree);
4593       induction_index = make_ssa_name (cr_index_vector_type);
4594       gimple *index_condition = gimple_build_assign (induction_index,
4595                                                      index_cond_expr);
4596       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4597       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4598                                                         loop_vinfo);
4599       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4600       set_vinfo_for_stmt (index_condition, index_vec_info);
4601
4602       /* Update the phi with the vec cond.  */
4603       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4604                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4605     }
4606
4607   /* 2. Create epilog code.
4608         The reduction epilog code operates across the elements of the vector
4609         of partial results computed by the vectorized loop.
4610         The reduction epilog code consists of:
4611
4612         step 1: compute the scalar result in a vector (v_out2)
4613         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4614         step 3: adjust the scalar result (s_out3) if needed.
4615
4616         Step 1 can be accomplished using one the following three schemes:
4617           (scheme 1) using reduc_fn, if available.
4618           (scheme 2) using whole-vector shifts, if available.
4619           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4620                      combined.
4621
4622           The overall epilog code looks like this:
4623
4624           s_out0 = phi <s_loop>         # original EXIT_PHI
4625           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4626           v_out2 = reduce <v_out1>              # step 1
4627           s_out3 = extract_field <v_out2, 0>    # step 2
4628           s_out4 = adjust_result <s_out3>       # step 3
4629
4630           (step 3 is optional, and steps 1 and 2 may be combined).
4631           Lastly, the uses of s_out0 are replaced by s_out4.  */
4632
4633
4634   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4635          v_out1 = phi <VECT_DEF>
4636          Store them in NEW_PHIS.  */
4637
4638   exit_bb = single_exit (loop)->dest;
4639   prev_phi_info = NULL;
4640   new_phis.create (vect_defs.length ());
4641   FOR_EACH_VEC_ELT (vect_defs, i, def)
4642     {
4643       for (j = 0; j < ncopies; j++)
4644         {
4645           tree new_def = copy_ssa_name (def);
4646           phi = create_phi_node (new_def, exit_bb);
4647           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4648           if (j == 0)
4649             new_phis.quick_push (phi);
4650           else
4651             {
4652               def = vect_get_vec_def_for_stmt_copy (dt, def);
4653               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4654             }
4655
4656           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4657           prev_phi_info = vinfo_for_stmt (phi);
4658         }
4659     }
4660
4661   /* The epilogue is created for the outer-loop, i.e., for the loop being
4662      vectorized.  Create exit phis for the outer loop.  */
4663   if (double_reduc)
4664     {
4665       loop = outer_loop;
4666       exit_bb = single_exit (loop)->dest;
4667       inner_phis.create (vect_defs.length ());
4668       FOR_EACH_VEC_ELT (new_phis, i, phi)
4669         {
4670           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4671           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4672           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4673                            PHI_RESULT (phi));
4674           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4675                                                             loop_vinfo));
4676           inner_phis.quick_push (phi);
4677           new_phis[i] = outer_phi;
4678           prev_phi_info = vinfo_for_stmt (outer_phi);
4679           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4680             {
4681               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4682               new_result = copy_ssa_name (PHI_RESULT (phi));
4683               outer_phi = create_phi_node (new_result, exit_bb);
4684               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4685                                PHI_RESULT (phi));
4686               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4687                                                                 loop_vinfo));
4688               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4689               prev_phi_info = vinfo_for_stmt (outer_phi);
4690             }
4691         }
4692     }
4693
4694   exit_gsi = gsi_after_labels (exit_bb);
4695
4696   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4697          (i.e. when reduc_fn is not available) and in the final adjustment
4698          code (if needed).  Also get the original scalar reduction variable as
4699          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4700          represents a reduction pattern), the tree-code and scalar-def are
4701          taken from the original stmt that the pattern-stmt (STMT) replaces.
4702          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4703          are taken from STMT.  */
4704
4705   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4706   if (!orig_stmt)
4707     {
4708       /* Regular reduction  */
4709       orig_stmt = stmt;
4710     }
4711   else
4712     {
4713       /* Reduction pattern  */
4714       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4715       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4716       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4717     }
4718
4719   code = gimple_assign_rhs_code (orig_stmt);
4720   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4721      partial results are added and not subtracted.  */
4722   if (code == MINUS_EXPR)
4723     code = PLUS_EXPR;
4724
4725   scalar_dest = gimple_assign_lhs (orig_stmt);
4726   scalar_type = TREE_TYPE (scalar_dest);
4727   scalar_results.create (group_size);
4728   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4729   bitsize = TYPE_SIZE (scalar_type);
4730
4731   /* In case this is a reduction in an inner-loop while vectorizing an outer
4732      loop - we don't need to extract a single scalar result at the end of the
4733      inner-loop (unless it is double reduction, i.e., the use of reduction is
4734      outside the outer-loop).  The final vector of partial results will be used
4735      in the vectorized outer-loop, or reduced to a scalar result at the end of
4736      the outer-loop.  */
4737   if (nested_in_vect_loop && !double_reduc)
4738     goto vect_finalize_reduction;
4739
4740   /* SLP reduction without reduction chain, e.g.,
4741      # a1 = phi <a2, a0>
4742      # b1 = phi <b2, b0>
4743      a2 = operation (a1)
4744      b2 = operation (b1)  */
4745   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4746
4747   /* In case of reduction chain, e.g.,
4748      # a1 = phi <a3, a0>
4749      a2 = operation (a1)
4750      a3 = operation (a2),
4751
4752      we may end up with more than one vector result.  Here we reduce them to
4753      one vector.  */
4754   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4755     {
4756       tree first_vect = PHI_RESULT (new_phis[0]);
4757       gassign *new_vec_stmt = NULL;
4758       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4759       for (k = 1; k < new_phis.length (); k++)
4760         {
4761           gimple *next_phi = new_phis[k];
4762           tree second_vect = PHI_RESULT (next_phi);
4763           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4764           new_vec_stmt = gimple_build_assign (tem, code,
4765                                               first_vect, second_vect);
4766           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4767           first_vect = tem;
4768         }
4769
4770       new_phi_result = first_vect;
4771       if (new_vec_stmt)
4772         {
4773           new_phis.truncate (0);
4774           new_phis.safe_push (new_vec_stmt);
4775         }
4776     }
4777   /* Likewise if we couldn't use a single defuse cycle.  */
4778   else if (ncopies > 1)
4779     {
4780       gcc_assert (new_phis.length () == 1);
4781       tree first_vect = PHI_RESULT (new_phis[0]);
4782       gassign *new_vec_stmt = NULL;
4783       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4784       gimple *next_phi = new_phis[0];
4785       for (int k = 1; k < ncopies; ++k)
4786         {
4787           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4788           tree second_vect = PHI_RESULT (next_phi);
4789           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4790           new_vec_stmt = gimple_build_assign (tem, code,
4791                                               first_vect, second_vect);
4792           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4793           first_vect = tem;
4794         }
4795       new_phi_result = first_vect;
4796       new_phis.truncate (0);
4797       new_phis.safe_push (new_vec_stmt);
4798     }
4799   else
4800     new_phi_result = PHI_RESULT (new_phis[0]);
4801
4802   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4803       && reduc_fn != IFN_LAST)
4804     {
4805       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4806          various data values where the condition matched and another vector
4807          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4808          need to extract the last matching index (which will be the index with
4809          highest value) and use this to index into the data vector.
4810          For the case where there were no matches, the data vector will contain
4811          all default values and the index vector will be all zeros.  */
4812
4813       /* Get various versions of the type of the vector of indexes.  */
4814       tree index_vec_type = TREE_TYPE (induction_index);
4815       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4816       tree index_scalar_type = TREE_TYPE (index_vec_type);
4817       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4818         (index_vec_type);
4819
4820       /* Get an unsigned integer version of the type of the data vector.  */
4821       int scalar_precision
4822         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4823       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4824       tree vectype_unsigned = build_vector_type
4825         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4826
4827       /* First we need to create a vector (ZERO_VEC) of zeros and another
4828          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4829          can create using a MAX reduction and then expanding.
4830          In the case where the loop never made any matches, the max index will
4831          be zero.  */
4832
4833       /* Vector of {0, 0, 0,...}.  */
4834       tree zero_vec = make_ssa_name (vectype);
4835       tree zero_vec_rhs = build_zero_cst (vectype);
4836       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4837       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4838
4839       /* Find maximum value from the vector of found indexes.  */
4840       tree max_index = make_ssa_name (index_scalar_type);
4841       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4842                                                           1, induction_index);
4843       gimple_call_set_lhs (max_index_stmt, max_index);
4844       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4845
4846       /* Vector of {max_index, max_index, max_index,...}.  */
4847       tree max_index_vec = make_ssa_name (index_vec_type);
4848       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4849                                                       max_index);
4850       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4851                                                         max_index_vec_rhs);
4852       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4853
4854       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4855          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4856          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4857          otherwise.  Only one value should match, resulting in a vector
4858          (VEC_COND) with one data value and the rest zeros.
4859          In the case where the loop never made any matches, every index will
4860          match, resulting in a vector with all data values (which will all be
4861          the default value).  */
4862
4863       /* Compare the max index vector to the vector of found indexes to find
4864          the position of the max value.  */
4865       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4866       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4867                                                       induction_index,
4868                                                       max_index_vec);
4869       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4870
4871       /* Use the compare to choose either values from the data vector or
4872          zero.  */
4873       tree vec_cond = make_ssa_name (vectype);
4874       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4875                                                    vec_compare, new_phi_result,
4876                                                    zero_vec);
4877       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4878
4879       /* Finally we need to extract the data value from the vector (VEC_COND)
4880          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
4881          reduction, but because this doesn't exist, we can use a MAX reduction
4882          instead.  The data value might be signed or a float so we need to cast
4883          it first.
4884          In the case where the loop never made any matches, the data values are
4885          all identical, and so will reduce down correctly.  */
4886
4887       /* Make the matched data values unsigned.  */
4888       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4889       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4890                                        vec_cond);
4891       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4892                                                         VIEW_CONVERT_EXPR,
4893                                                         vec_cond_cast_rhs);
4894       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4895
4896       /* Reduce down to a scalar value.  */
4897       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4898       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4899                                                            1, vec_cond_cast);
4900       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4901       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4902
4903       /* Convert the reduced value back to the result type and set as the
4904          result.  */
4905       gimple_seq stmts = NULL;
4906       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4907                                data_reduc);
4908       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4909       scalar_results.safe_push (new_temp);
4910     }
4911   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4912            && reduc_fn == IFN_LAST)
4913     {
4914       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
4915          idx = 0;
4916          idx_val = induction_index[0];
4917          val = data_reduc[0];
4918          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4919            if (induction_index[i] > idx_val)
4920              val = data_reduc[i], idx_val = induction_index[i];
4921          return val;  */
4922
4923       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4924       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4925       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4926       unsigned HOST_WIDE_INT v_size
4927         = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4928       tree idx_val = NULL_TREE, val = NULL_TREE;
4929       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4930         {
4931           tree old_idx_val = idx_val;
4932           tree old_val = val;
4933           idx_val = make_ssa_name (idx_eltype);
4934           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4935                                              build3 (BIT_FIELD_REF, idx_eltype,
4936                                                      induction_index,
4937                                                      bitsize_int (el_size),
4938                                                      bitsize_int (off)));
4939           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4940           val = make_ssa_name (data_eltype);
4941           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4942                                              build3 (BIT_FIELD_REF,
4943                                                      data_eltype,
4944                                                      new_phi_result,
4945                                                      bitsize_int (el_size),
4946                                                      bitsize_int (off)));
4947           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4948           if (off != 0)
4949             {
4950               tree new_idx_val = idx_val;
4951               tree new_val = val;
4952               if (off != v_size - el_size)
4953                 {
4954                   new_idx_val = make_ssa_name (idx_eltype);
4955                   epilog_stmt = gimple_build_assign (new_idx_val,
4956                                                      MAX_EXPR, idx_val,
4957                                                      old_idx_val);
4958                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4959                 }
4960               new_val = make_ssa_name (data_eltype);
4961               epilog_stmt = gimple_build_assign (new_val,
4962                                                  COND_EXPR,
4963                                                  build2 (GT_EXPR,
4964                                                          boolean_type_node,
4965                                                          idx_val,
4966                                                          old_idx_val),
4967                                                  val, old_val);
4968               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4969               idx_val = new_idx_val;
4970               val = new_val;
4971             }
4972         }
4973       /* Convert the reduced value back to the result type and set as the
4974          result.  */
4975       gimple_seq stmts = NULL;
4976       val = gimple_convert (&stmts, scalar_type, val);
4977       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4978       scalar_results.safe_push (val);
4979     }
4980
4981   /* 2.3 Create the reduction code, using one of the three schemes described
4982          above. In SLP we simply need to extract all the elements from the
4983          vector (without reducing them), so we use scalar shifts.  */
4984   else if (reduc_fn != IFN_LAST && !slp_reduc)
4985     {
4986       tree tmp;
4987       tree vec_elem_type;
4988
4989       /* Case 1:  Create:
4990          v_out2 = reduc_expr <v_out1>  */
4991
4992       if (dump_enabled_p ())
4993         dump_printf_loc (MSG_NOTE, vect_location,
4994                          "Reduce using direct vector reduction.\n");
4995
4996       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4997       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4998         {
4999           tree tmp_dest
5000             = vect_create_destination_var (scalar_dest, vec_elem_type);
5001           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5002                                                     new_phi_result);
5003           gimple_set_lhs (epilog_stmt, tmp_dest);
5004           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5005           gimple_set_lhs (epilog_stmt, new_temp);
5006           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5007
5008           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5009                                              new_temp);
5010         }
5011       else
5012         {
5013           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5014                                                     new_phi_result);
5015           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5016         }
5017
5018       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5019       gimple_set_lhs (epilog_stmt, new_temp);
5020       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5021
5022       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5023            == INTEGER_INDUC_COND_REDUCTION)
5024           && !operand_equal_p (initial_def, induc_val, 0))
5025         {
5026           /* Earlier we set the initial value to be a vector if induc_val
5027              values.  Check the result and if it is induc_val then replace
5028              with the original initial value, unless induc_val is
5029              the same as initial_def already.  */
5030           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5031                                   induc_val);
5032
5033           tmp = make_ssa_name (new_scalar_dest);
5034           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5035                                              initial_def, new_temp);
5036           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5037           new_temp = tmp;
5038         }
5039
5040       scalar_results.safe_push (new_temp);
5041     }
5042   else
5043     {
5044       bool reduce_with_shift = have_whole_vector_shift (mode);
5045       int element_bitsize = tree_to_uhwi (bitsize);
5046       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5047       tree vec_temp;
5048
5049       /* COND reductions all do the final reduction with MAX_EXPR
5050          or MIN_EXPR.  */
5051       if (code == COND_EXPR)
5052         {
5053           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5054               == INTEGER_INDUC_COND_REDUCTION)
5055             code = induc_code;
5056           else
5057             code = MAX_EXPR;
5058         }
5059
5060       /* Regardless of whether we have a whole vector shift, if we're
5061          emulating the operation via tree-vect-generic, we don't want
5062          to use it.  Only the first round of the reduction is likely
5063          to still be profitable via emulation.  */
5064       /* ??? It might be better to emit a reduction tree code here, so that
5065          tree-vect-generic can expand the first round via bit tricks.  */
5066       if (!VECTOR_MODE_P (mode))
5067         reduce_with_shift = false;
5068       else
5069         {
5070           optab optab = optab_for_tree_code (code, vectype, optab_default);
5071           if (optab_handler (optab, mode) == CODE_FOR_nothing)
5072             reduce_with_shift = false;
5073         }
5074
5075       if (reduce_with_shift && !slp_reduc)
5076         {
5077           int nelements = vec_size_in_bits / element_bitsize;
5078           vec_perm_builder sel;
5079           vec_perm_indices indices;
5080
5081           int elt_offset;
5082
5083           tree zero_vec = build_zero_cst (vectype);
5084           /* Case 2: Create:
5085              for (offset = nelements/2; offset >= 1; offset/=2)
5086                 {
5087                   Create:  va' = vec_shift <va, offset>
5088                   Create:  va = vop <va, va'>
5089                 }  */
5090
5091           tree rhs;
5092
5093           if (dump_enabled_p ())
5094             dump_printf_loc (MSG_NOTE, vect_location,
5095                              "Reduce using vector shifts\n");
5096
5097           vec_dest = vect_create_destination_var (scalar_dest, vectype);
5098           new_temp = new_phi_result;
5099           for (elt_offset = nelements / 2;
5100                elt_offset >= 1;
5101                elt_offset /= 2)
5102             {
5103               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5104               indices.new_vector (sel, 2, nelements);
5105               tree mask = vect_gen_perm_mask_any (vectype, indices);
5106               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5107                                                  new_temp, zero_vec, mask);
5108               new_name = make_ssa_name (vec_dest, epilog_stmt);
5109               gimple_assign_set_lhs (epilog_stmt, new_name);
5110               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5111
5112               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5113                                                  new_temp);
5114               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5115               gimple_assign_set_lhs (epilog_stmt, new_temp);
5116               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5117             }
5118
5119           /* 2.4  Extract the final scalar result.  Create:
5120              s_out3 = extract_field <v_out2, bitpos>  */
5121
5122           if (dump_enabled_p ())
5123             dump_printf_loc (MSG_NOTE, vect_location,
5124                              "extract scalar result\n");
5125
5126           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5127                         bitsize, bitsize_zero_node);
5128           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5129           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5130           gimple_assign_set_lhs (epilog_stmt, new_temp);
5131           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5132           scalar_results.safe_push (new_temp);
5133         }
5134       else
5135         {
5136           /* Case 3: Create:
5137              s = extract_field <v_out2, 0>
5138              for (offset = element_size;
5139                   offset < vector_size;
5140                   offset += element_size;)
5141                {
5142                  Create:  s' = extract_field <v_out2, offset>
5143                  Create:  s = op <s, s'>  // For non SLP cases
5144                }  */
5145
5146           if (dump_enabled_p ())
5147             dump_printf_loc (MSG_NOTE, vect_location,
5148                              "Reduce using scalar code.\n");
5149
5150           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5151           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5152             {
5153               int bit_offset;
5154               if (gimple_code (new_phi) == GIMPLE_PHI)
5155                 vec_temp = PHI_RESULT (new_phi);
5156               else
5157                 vec_temp = gimple_assign_lhs (new_phi);
5158               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5159                                  bitsize_zero_node);
5160               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5161               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5162               gimple_assign_set_lhs (epilog_stmt, new_temp);
5163               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5164
5165               /* In SLP we don't need to apply reduction operation, so we just
5166                  collect s' values in SCALAR_RESULTS.  */
5167               if (slp_reduc)
5168                 scalar_results.safe_push (new_temp);
5169
5170               for (bit_offset = element_bitsize;
5171                    bit_offset < vec_size_in_bits;
5172                    bit_offset += element_bitsize)
5173                 {
5174                   tree bitpos = bitsize_int (bit_offset);
5175                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5176                                      bitsize, bitpos);
5177
5178                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5179                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5180                   gimple_assign_set_lhs (epilog_stmt, new_name);
5181                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5182
5183                   if (slp_reduc)
5184                     {
5185                       /* In SLP we don't need to apply reduction operation, so
5186                          we just collect s' values in SCALAR_RESULTS.  */
5187                       new_temp = new_name;
5188                       scalar_results.safe_push (new_name);
5189                     }
5190                   else
5191                     {
5192                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5193                                                          new_name, new_temp);
5194                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5195                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5196                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5197                     }
5198                 }
5199             }
5200
5201           /* The only case where we need to reduce scalar results in SLP, is
5202              unrolling.  If the size of SCALAR_RESULTS is greater than
5203              GROUP_SIZE, we reduce them combining elements modulo
5204              GROUP_SIZE.  */
5205           if (slp_reduc)
5206             {
5207               tree res, first_res, new_res;
5208               gimple *new_stmt;
5209
5210               /* Reduce multiple scalar results in case of SLP unrolling.  */
5211               for (j = group_size; scalar_results.iterate (j, &res);
5212                    j++)
5213                 {
5214                   first_res = scalar_results[j % group_size];
5215                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5216                                                   first_res, res);
5217                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5218                   gimple_assign_set_lhs (new_stmt, new_res);
5219                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5220                   scalar_results[j % group_size] = new_res;
5221                 }
5222             }
5223           else
5224             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5225             scalar_results.safe_push (new_temp);
5226         }
5227
5228       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5229            == INTEGER_INDUC_COND_REDUCTION)
5230           && !operand_equal_p (initial_def, induc_val, 0))
5231         {
5232           /* Earlier we set the initial value to be a vector if induc_val
5233              values.  Check the result and if it is induc_val then replace
5234              with the original initial value, unless induc_val is
5235              the same as initial_def already.  */
5236           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5237                                   induc_val);
5238
5239           tree tmp = make_ssa_name (new_scalar_dest);
5240           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5241                                              initial_def, new_temp);
5242           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5243           scalar_results[0] = tmp;
5244         }
5245     }
5246
5247 vect_finalize_reduction:
5248
5249   if (double_reduc)
5250     loop = loop->inner;
5251
5252   /* 2.5 Adjust the final result by the initial value of the reduction
5253          variable. (When such adjustment is not needed, then
5254          'adjustment_def' is zero).  For example, if code is PLUS we create:
5255          new_temp = loop_exit_def + adjustment_def  */
5256
5257   if (adjustment_def)
5258     {
5259       gcc_assert (!slp_reduc);
5260       if (nested_in_vect_loop)
5261         {
5262           new_phi = new_phis[0];
5263           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5264           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5265           new_dest = vect_create_destination_var (scalar_dest, vectype);
5266         }
5267       else
5268         {
5269           new_temp = scalar_results[0];
5270           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5271           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5272           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5273         }
5274
5275       epilog_stmt = gimple_build_assign (new_dest, expr);
5276       new_temp = make_ssa_name (new_dest, epilog_stmt);
5277       gimple_assign_set_lhs (epilog_stmt, new_temp);
5278       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5279       if (nested_in_vect_loop)
5280         {
5281           set_vinfo_for_stmt (epilog_stmt,
5282                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5283           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5284                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5285
5286           if (!double_reduc)
5287             scalar_results.quick_push (new_temp);
5288           else
5289             scalar_results[0] = new_temp;
5290         }
5291       else
5292         scalar_results[0] = new_temp;
5293
5294       new_phis[0] = epilog_stmt;
5295     }
5296
5297   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5298           phis with new adjusted scalar results, i.e., replace use <s_out0>
5299           with use <s_out4>.
5300
5301      Transform:
5302         loop_exit:
5303           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5304           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5305           v_out2 = reduce <v_out1>
5306           s_out3 = extract_field <v_out2, 0>
5307           s_out4 = adjust_result <s_out3>
5308           use <s_out0>
5309           use <s_out0>
5310
5311      into:
5312
5313         loop_exit:
5314           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5315           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5316           v_out2 = reduce <v_out1>
5317           s_out3 = extract_field <v_out2, 0>
5318           s_out4 = adjust_result <s_out3>
5319           use <s_out4>
5320           use <s_out4> */
5321
5322
5323   /* In SLP reduction chain we reduce vector results into one vector if
5324      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5325      the last stmt in the reduction chain, since we are looking for the loop
5326      exit phi node.  */
5327   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5328     {
5329       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5330       /* Handle reduction patterns.  */
5331       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5332         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5333
5334       scalar_dest = gimple_assign_lhs (dest_stmt);
5335       group_size = 1;
5336     }
5337
5338   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5339      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5340      need to match SCALAR_RESULTS with corresponding statements.  The first
5341      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5342      the first vector stmt, etc.
5343      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5344   if (group_size > new_phis.length ())
5345     {
5346       ratio = group_size / new_phis.length ();
5347       gcc_assert (!(group_size % new_phis.length ()));
5348     }
5349   else
5350     ratio = 1;
5351
5352   for (k = 0; k < group_size; k++)
5353     {
5354       if (k % ratio == 0)
5355         {
5356           epilog_stmt = new_phis[k / ratio];
5357           reduction_phi = reduction_phis[k / ratio];
5358           if (double_reduc)
5359             inner_phi = inner_phis[k / ratio];
5360         }
5361
5362       if (slp_reduc)
5363         {
5364           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5365
5366           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5367           /* SLP statements can't participate in patterns.  */
5368           gcc_assert (!orig_stmt);
5369           scalar_dest = gimple_assign_lhs (current_stmt);
5370         }
5371
5372       phis.create (3);
5373       /* Find the loop-closed-use at the loop exit of the original scalar
5374          result.  (The reduction result is expected to have two immediate uses -
5375          one at the latch block, and one at the loop exit).  */
5376       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5377         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5378             && !is_gimple_debug (USE_STMT (use_p)))
5379           phis.safe_push (USE_STMT (use_p));
5380
5381       /* While we expect to have found an exit_phi because of loop-closed-ssa
5382          form we can end up without one if the scalar cycle is dead.  */
5383
5384       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5385         {
5386           if (outer_loop)
5387             {
5388               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5389               gphi *vect_phi;
5390
5391               /* FORNOW. Currently not supporting the case that an inner-loop
5392                  reduction is not used in the outer-loop (but only outside the
5393                  outer-loop), unless it is double reduction.  */
5394               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5395                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5396                           || double_reduc);
5397
5398               if (double_reduc)
5399                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5400               else
5401                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5402               if (!double_reduc
5403                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5404                       != vect_double_reduction_def)
5405                 continue;
5406
5407               /* Handle double reduction:
5408
5409                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5410                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5411                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5412                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5413
5414                  At that point the regular reduction (stmt2 and stmt3) is
5415                  already vectorized, as well as the exit phi node, stmt4.
5416                  Here we vectorize the phi node of double reduction, stmt1, and
5417                  update all relevant statements.  */
5418
5419               /* Go through all the uses of s2 to find double reduction phi
5420                  node, i.e., stmt1 above.  */
5421               orig_name = PHI_RESULT (exit_phi);
5422               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5423                 {
5424                   stmt_vec_info use_stmt_vinfo;
5425                   stmt_vec_info new_phi_vinfo;
5426                   tree vect_phi_init, preheader_arg, vect_phi_res;
5427                   basic_block bb = gimple_bb (use_stmt);
5428                   gimple *use;
5429
5430                   /* Check that USE_STMT is really double reduction phi
5431                      node.  */
5432                   if (gimple_code (use_stmt) != GIMPLE_PHI
5433                       || gimple_phi_num_args (use_stmt) != 2
5434                       || bb->loop_father != outer_loop)
5435                     continue;
5436                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5437                   if (!use_stmt_vinfo
5438                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5439                           != vect_double_reduction_def)
5440                     continue;
5441
5442                   /* Create vector phi node for double reduction:
5443                      vs1 = phi <vs0, vs2>
5444                      vs1 was created previously in this function by a call to
5445                        vect_get_vec_def_for_operand and is stored in
5446                        vec_initial_def;
5447                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5448                      vs0 is created here.  */
5449
5450                   /* Create vector phi node.  */
5451                   vect_phi = create_phi_node (vec_initial_def, bb);
5452                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5453                                     loop_vec_info_for_loop (outer_loop));
5454                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5455
5456                   /* Create vs0 - initial def of the double reduction phi.  */
5457                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5458                                              loop_preheader_edge (outer_loop));
5459                   vect_phi_init = get_initial_def_for_reduction
5460                     (stmt, preheader_arg, NULL);
5461
5462                   /* Update phi node arguments with vs0 and vs2.  */
5463                   add_phi_arg (vect_phi, vect_phi_init,
5464                                loop_preheader_edge (outer_loop),
5465                                UNKNOWN_LOCATION);
5466                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5467                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5468                   if (dump_enabled_p ())
5469                     {
5470                       dump_printf_loc (MSG_NOTE, vect_location,
5471                                        "created double reduction phi node: ");
5472                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5473                     }
5474
5475                   vect_phi_res = PHI_RESULT (vect_phi);
5476
5477                   /* Replace the use, i.e., set the correct vs1 in the regular
5478                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5479                      loop is redundant.  */
5480                   use = reduction_phi;
5481                   for (j = 0; j < ncopies; j++)
5482                     {
5483                       edge pr_edge = loop_preheader_edge (loop);
5484                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5485                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5486                     }
5487                 }
5488             }
5489         }
5490
5491       phis.release ();
5492       if (nested_in_vect_loop)
5493         {
5494           if (double_reduc)
5495             loop = outer_loop;
5496           else
5497             continue;
5498         }
5499
5500       phis.create (3);
5501       /* Find the loop-closed-use at the loop exit of the original scalar
5502          result.  (The reduction result is expected to have two immediate uses,
5503          one at the latch block, and one at the loop exit).  For double
5504          reductions we are looking for exit phis of the outer loop.  */
5505       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5506         {
5507           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5508             {
5509               if (!is_gimple_debug (USE_STMT (use_p)))
5510                 phis.safe_push (USE_STMT (use_p));
5511             }
5512           else
5513             {
5514               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5515                 {
5516                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5517
5518                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5519                     {
5520                       if (!flow_bb_inside_loop_p (loop,
5521                                              gimple_bb (USE_STMT (phi_use_p)))
5522                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5523                         phis.safe_push (USE_STMT (phi_use_p));
5524                     }
5525                 }
5526             }
5527         }
5528
5529       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5530         {
5531           /* Replace the uses:  */
5532           orig_name = PHI_RESULT (exit_phi);
5533           scalar_result = scalar_results[k];
5534           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5535             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5536               SET_USE (use_p, scalar_result);
5537         }
5538
5539       phis.release ();
5540     }
5541 }
5542
5543
5544 /* Function is_nonwrapping_integer_induction.
5545
5546    Check if STMT (which is part of loop LOOP) both increments and
5547    does not cause overflow.  */
5548
5549 static bool
5550 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5551 {
5552   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5553   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5554   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5555   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5556   widest_int ni, max_loop_value, lhs_max;
5557   bool overflow = false;
5558
5559   /* Make sure the loop is integer based.  */
5560   if (TREE_CODE (base) != INTEGER_CST
5561       || TREE_CODE (step) != INTEGER_CST)
5562     return false;
5563
5564   /* Check that the max size of the loop will not wrap.  */
5565
5566   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5567     return true;
5568
5569   if (! max_stmt_executions (loop, &ni))
5570     return false;
5571
5572   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5573                             &overflow);
5574   if (overflow)
5575     return false;
5576
5577   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5578                             TYPE_SIGN (lhs_type), &overflow);
5579   if (overflow)
5580     return false;
5581
5582   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5583           <= TYPE_PRECISION (lhs_type));
5584 }
5585
5586 /* Function vectorizable_reduction.
5587
5588    Check if STMT performs a reduction operation that can be vectorized.
5589    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5590    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5591    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5592
5593    This function also handles reduction idioms (patterns) that have been
5594    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5595    of this form:
5596      X = pattern_expr (arg0, arg1, ..., X)
5597    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
5598    sequence that had been detected and replaced by the pattern-stmt (STMT).
5599
5600    This function also handles reduction of condition expressions, for example:
5601      for (int i = 0; i < N; i++)
5602        if (a[i] < value)
5603          last = a[i];
5604    This is handled by vectorising the loop and creating an additional vector
5605    containing the loop indexes for which "a[i] < value" was true.  In the
5606    function epilogue this is reduced to a single max value and then used to
5607    index into the vector of results.
5608
5609    In some cases of reduction patterns, the type of the reduction variable X is
5610    different than the type of the other arguments of STMT.
5611    In such cases, the vectype that is used when transforming STMT into a vector
5612    stmt is different than the vectype that is used to determine the
5613    vectorization factor, because it consists of a different number of elements
5614    than the actual number of elements that are being operated upon in parallel.
5615
5616    For example, consider an accumulation of shorts into an int accumulator.
5617    On some targets it's possible to vectorize this pattern operating on 8
5618    shorts at a time (hence, the vectype for purposes of determining the
5619    vectorization factor should be V8HI); on the other hand, the vectype that
5620    is used to create the vector form is actually V4SI (the type of the result).
5621
5622    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5623    indicates what is the actual level of parallelism (V8HI in the example), so
5624    that the right vectorization factor would be derived.  This vectype
5625    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5626    be used to create the vectorized stmt.  The right vectype for the vectorized
5627    stmt is obtained from the type of the result X:
5628         get_vectype_for_scalar_type (TREE_TYPE (X))
5629
5630    This means that, contrary to "regular" reductions (or "regular" stmts in
5631    general), the following equation:
5632       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5633    does *NOT* necessarily hold for reduction patterns.  */
5634
5635 bool
5636 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5637                         gimple **vec_stmt, slp_tree slp_node,
5638                         slp_instance slp_node_instance)
5639 {
5640   tree vec_dest;
5641   tree scalar_dest;
5642   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5643   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5644   tree vectype_in = NULL_TREE;
5645   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5646   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5647   enum tree_code code, orig_code;
5648   internal_fn reduc_fn;
5649   machine_mode vec_mode;
5650   int op_type;
5651   optab optab;
5652   tree new_temp = NULL_TREE;
5653   gimple *def_stmt;
5654   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5655   gimple *cond_reduc_def_stmt = NULL;
5656   enum tree_code cond_reduc_op_code = ERROR_MARK;
5657   tree scalar_type;
5658   bool is_simple_use;
5659   gimple *orig_stmt;
5660   stmt_vec_info orig_stmt_info = NULL;
5661   int i;
5662   int ncopies;
5663   int epilog_copies;
5664   stmt_vec_info prev_stmt_info, prev_phi_info;
5665   bool single_defuse_cycle = false;
5666   gimple *new_stmt = NULL;
5667   int j;
5668   tree ops[3];
5669   enum vect_def_type dts[3];
5670   bool nested_cycle = false, found_nested_cycle_def = false;
5671   bool double_reduc = false;
5672   basic_block def_bb;
5673   struct loop * def_stmt_loop, *outer_loop = NULL;
5674   tree def_arg;
5675   gimple *def_arg_stmt;
5676   auto_vec<tree> vec_oprnds0;
5677   auto_vec<tree> vec_oprnds1;
5678   auto_vec<tree> vec_oprnds2;
5679   auto_vec<tree> vect_defs;
5680   auto_vec<gimple *> phis;
5681   int vec_num;
5682   tree def0, tem;
5683   bool first_p = true;
5684   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5685   tree cond_reduc_val = NULL_TREE;
5686
5687   /* Make sure it was already recognized as a reduction computation.  */
5688   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5689       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5690     return false;
5691
5692   if (nested_in_vect_loop_p (loop, stmt))
5693     {
5694       outer_loop = loop;
5695       loop = loop->inner;
5696       nested_cycle = true;
5697     }
5698
5699   /* In case of reduction chain we switch to the first stmt in the chain, but
5700      we don't update STMT_INFO, since only the last stmt is marked as reduction
5701      and has reduction properties.  */
5702   if (GROUP_FIRST_ELEMENT (stmt_info)
5703       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5704     {
5705       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5706       first_p = false;
5707     }
5708
5709   if (gimple_code (stmt) == GIMPLE_PHI)
5710     {
5711       /* Analysis is fully done on the reduction stmt invocation.  */
5712       if (! vec_stmt)
5713         {
5714           if (slp_node)
5715             slp_node_instance->reduc_phis = slp_node;
5716
5717           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5718           return true;
5719         }
5720
5721       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5722       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5723         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5724
5725       gcc_assert (is_gimple_assign (reduc_stmt));
5726       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5727         {
5728           tree op = gimple_op (reduc_stmt, k);
5729           if (op == gimple_phi_result (stmt))
5730             continue;
5731           if (k == 1
5732               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5733             continue;
5734           tem = get_vectype_for_scalar_type (TREE_TYPE (op));
5735           if (! vectype_in
5736               || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
5737             vectype_in = tem;
5738           break;
5739         }
5740       gcc_assert (vectype_in);
5741
5742       if (slp_node)
5743         ncopies = 1;
5744       else
5745         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5746
5747       use_operand_p use_p;
5748       gimple *use_stmt;
5749       if (ncopies > 1
5750           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5751               <= vect_used_only_live)
5752           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5753           && (use_stmt == reduc_stmt
5754               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5755                   == reduc_stmt)))
5756         single_defuse_cycle = true;
5757
5758       /* Create the destination vector  */
5759       scalar_dest = gimple_assign_lhs (reduc_stmt);
5760       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5761
5762       if (slp_node)
5763         /* The size vect_schedule_slp_instance computes is off for us.  */
5764         vec_num = vect_get_num_vectors
5765           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5766            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
5767            vectype_in);
5768       else
5769         vec_num = 1;
5770
5771       /* Generate the reduction PHIs upfront.  */
5772       prev_phi_info = NULL;
5773       for (j = 0; j < ncopies; j++)
5774         {
5775           if (j == 0 || !single_defuse_cycle)
5776             {
5777               for (i = 0; i < vec_num; i++)
5778                 {
5779                   /* Create the reduction-phi that defines the reduction
5780                      operand.  */
5781                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
5782                   set_vinfo_for_stmt (new_phi,
5783                                       new_stmt_vec_info (new_phi, loop_vinfo));
5784
5785                   if (slp_node)
5786                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5787                   else
5788                     {
5789                       if (j == 0)
5790                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5791                       else
5792                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5793                       prev_phi_info = vinfo_for_stmt (new_phi);
5794                     }
5795                 }
5796             }
5797         }
5798
5799       return true;
5800     }
5801
5802   /* 1. Is vectorizable reduction?  */
5803   /* Not supportable if the reduction variable is used in the loop, unless
5804      it's a reduction chain.  */
5805   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5806       && !GROUP_FIRST_ELEMENT (stmt_info))
5807     return false;
5808
5809   /* Reductions that are not used even in an enclosing outer-loop,
5810      are expected to be "live" (used out of the loop).  */
5811   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5812       && !STMT_VINFO_LIVE_P (stmt_info))
5813     return false;
5814
5815   /* 2. Has this been recognized as a reduction pattern?
5816
5817      Check if STMT represents a pattern that has been recognized
5818      in earlier analysis stages.  For stmts that represent a pattern,
5819      the STMT_VINFO_RELATED_STMT field records the last stmt in
5820      the original sequence that constitutes the pattern.  */
5821
5822   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5823   if (orig_stmt)
5824     {
5825       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5826       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5827       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5828     }
5829
5830   /* 3. Check the operands of the operation.  The first operands are defined
5831         inside the loop body. The last operand is the reduction variable,
5832         which is defined by the loop-header-phi.  */
5833
5834   gcc_assert (is_gimple_assign (stmt));
5835
5836   /* Flatten RHS.  */
5837   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5838     {
5839     case GIMPLE_BINARY_RHS:
5840       code = gimple_assign_rhs_code (stmt);
5841       op_type = TREE_CODE_LENGTH (code);
5842       gcc_assert (op_type == binary_op);
5843       ops[0] = gimple_assign_rhs1 (stmt);
5844       ops[1] = gimple_assign_rhs2 (stmt);
5845       break;
5846
5847     case GIMPLE_TERNARY_RHS:
5848       code = gimple_assign_rhs_code (stmt);
5849       op_type = TREE_CODE_LENGTH (code);
5850       gcc_assert (op_type == ternary_op);
5851       ops[0] = gimple_assign_rhs1 (stmt);
5852       ops[1] = gimple_assign_rhs2 (stmt);
5853       ops[2] = gimple_assign_rhs3 (stmt);
5854       break;
5855
5856     case GIMPLE_UNARY_RHS:
5857       return false;
5858
5859     default:
5860       gcc_unreachable ();
5861     }
5862
5863   if (code == COND_EXPR && slp_node)
5864     return false;
5865
5866   scalar_dest = gimple_assign_lhs (stmt);
5867   scalar_type = TREE_TYPE (scalar_dest);
5868   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5869       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5870     return false;
5871
5872   /* Do not try to vectorize bit-precision reductions.  */
5873   if (!type_has_mode_precision_p (scalar_type))
5874     return false;
5875
5876   /* All uses but the last are expected to be defined in the loop.
5877      The last use is the reduction variable.  In case of nested cycle this
5878      assumption is not true: we use reduc_index to record the index of the
5879      reduction variable.  */
5880   gimple *reduc_def_stmt = NULL;
5881   int reduc_index = -1;
5882   for (i = 0; i < op_type; i++)
5883     {
5884       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5885       if (i == 0 && code == COND_EXPR)
5886         continue;
5887
5888       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5889                                           &def_stmt, &dts[i], &tem);
5890       dt = dts[i];
5891       gcc_assert (is_simple_use);
5892       if (dt == vect_reduction_def)
5893         {
5894           reduc_def_stmt = def_stmt;
5895           reduc_index = i;
5896           continue;
5897         }
5898       else if (tem)
5899         {
5900           /* To properly compute ncopies we are interested in the widest
5901              input type in case we're looking at a widening accumulation.  */
5902           if (!vectype_in
5903               || TYPE_VECTOR_SUBPARTS (vectype_in) > TYPE_VECTOR_SUBPARTS (tem))
5904             vectype_in = tem;
5905         }
5906
5907       if (dt != vect_internal_def
5908           && dt != vect_external_def
5909           && dt != vect_constant_def
5910           && dt != vect_induction_def
5911           && !(dt == vect_nested_cycle && nested_cycle))
5912         return false;
5913
5914       if (dt == vect_nested_cycle)
5915         {
5916           found_nested_cycle_def = true;
5917           reduc_def_stmt = def_stmt;
5918           reduc_index = i;
5919         }
5920
5921       if (i == 1 && code == COND_EXPR)
5922         {
5923           /* Record how value of COND_EXPR is defined.  */
5924           if (dt == vect_constant_def)
5925             {
5926               cond_reduc_dt = dt;
5927               cond_reduc_val = ops[i];
5928             }
5929           if (dt == vect_induction_def
5930               && def_stmt != NULL
5931               && is_nonwrapping_integer_induction (def_stmt, loop))
5932             {
5933               cond_reduc_dt = dt;
5934               cond_reduc_def_stmt = def_stmt;
5935             }
5936         }
5937     }
5938
5939   if (!vectype_in)
5940     vectype_in = vectype_out;
5941
5942   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5943      directy used in stmt.  */
5944   if (reduc_index == -1)
5945     {
5946       if (orig_stmt)
5947         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5948       else
5949         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5950     }
5951
5952   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5953     return false;
5954
5955   if (!(reduc_index == -1
5956         || dts[reduc_index] == vect_reduction_def
5957         || dts[reduc_index] == vect_nested_cycle
5958         || ((dts[reduc_index] == vect_internal_def
5959              || dts[reduc_index] == vect_external_def
5960              || dts[reduc_index] == vect_constant_def
5961              || dts[reduc_index] == vect_induction_def)
5962             && nested_cycle && found_nested_cycle_def)))
5963     {
5964       /* For pattern recognized stmts, orig_stmt might be a reduction,
5965          but some helper statements for the pattern might not, or
5966          might be COND_EXPRs with reduction uses in the condition.  */
5967       gcc_assert (orig_stmt);
5968       return false;
5969     }
5970
5971   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5972   enum vect_reduction_type v_reduc_type
5973     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5974   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5975
5976   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5977   /* If we have a condition reduction, see if we can simplify it further.  */
5978   if (v_reduc_type == COND_REDUCTION)
5979     {
5980       if (cond_reduc_dt == vect_induction_def)
5981         {
5982           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
5983           tree base
5984             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
5985           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
5986
5987           gcc_assert (TREE_CODE (base) == INTEGER_CST
5988                       && TREE_CODE (step) == INTEGER_CST);
5989           cond_reduc_val = NULL_TREE;
5990           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
5991              above base; punt if base is the minimum value of the type for
5992              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
5993           if (tree_int_cst_sgn (step) == -1)
5994             {
5995               cond_reduc_op_code = MIN_EXPR;
5996               if (tree_int_cst_sgn (base) == -1)
5997                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5998               else if (tree_int_cst_lt (base,
5999                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6000                 cond_reduc_val
6001                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6002             }
6003           else
6004             {
6005               cond_reduc_op_code = MAX_EXPR;
6006               if (tree_int_cst_sgn (base) == 1)
6007                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6008               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6009                                         base))
6010                 cond_reduc_val
6011                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6012             }
6013           if (cond_reduc_val)
6014             {
6015               if (dump_enabled_p ())
6016                 dump_printf_loc (MSG_NOTE, vect_location,
6017                                  "condition expression based on "
6018                                  "integer induction.\n");
6019               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6020                 = INTEGER_INDUC_COND_REDUCTION;
6021             }
6022         }
6023
6024       /* Loop peeling modifies initial value of reduction PHI, which
6025          makes the reduction stmt to be transformed different to the
6026          original stmt analyzed.  We need to record reduction code for
6027          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6028          it can be used directly at transform stage.  */
6029       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6030           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6031         {
6032           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6033           gcc_assert (cond_reduc_dt == vect_constant_def);
6034           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6035         }
6036       else if (cond_reduc_dt == vect_constant_def)
6037         {
6038           enum vect_def_type cond_initial_dt;
6039           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6040           tree cond_initial_val
6041             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6042
6043           gcc_assert (cond_reduc_val != NULL_TREE);
6044           vect_is_simple_use (cond_initial_val, loop_vinfo,
6045                               &def_stmt, &cond_initial_dt);
6046           if (cond_initial_dt == vect_constant_def
6047               && types_compatible_p (TREE_TYPE (cond_initial_val),
6048                                      TREE_TYPE (cond_reduc_val)))
6049             {
6050               tree e = fold_binary (LE_EXPR, boolean_type_node,
6051                                     cond_initial_val, cond_reduc_val);
6052               if (e && (integer_onep (e) || integer_zerop (e)))
6053                 {
6054                   if (dump_enabled_p ())
6055                     dump_printf_loc (MSG_NOTE, vect_location,
6056                                      "condition expression based on "
6057                                      "compile time constant.\n");
6058                   /* Record reduction code at analysis stage.  */
6059                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6060                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6061                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6062                     = CONST_COND_REDUCTION;
6063                 }
6064             }
6065         }
6066     }
6067
6068   if (orig_stmt)
6069     gcc_assert (tmp == orig_stmt
6070                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6071   else
6072     /* We changed STMT to be the first stmt in reduction chain, hence we
6073        check that in this case the first element in the chain is STMT.  */
6074     gcc_assert (stmt == tmp
6075                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6076
6077   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6078     return false;
6079
6080   if (slp_node)
6081     ncopies = 1;
6082   else
6083     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6084
6085   gcc_assert (ncopies >= 1);
6086
6087   vec_mode = TYPE_MODE (vectype_in);
6088
6089   if (code == COND_EXPR)
6090     {
6091       /* Only call during the analysis stage, otherwise we'll lose
6092          STMT_VINFO_TYPE.  */
6093       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6094                                                 ops[reduc_index], 0, NULL))
6095         {
6096           if (dump_enabled_p ())
6097             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6098                              "unsupported condition in reduction\n");
6099           return false;
6100         }
6101     }
6102   else
6103     {
6104       /* 4. Supportable by target?  */
6105
6106       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6107           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6108         {
6109           /* Shifts and rotates are only supported by vectorizable_shifts,
6110              not vectorizable_reduction.  */
6111           if (dump_enabled_p ())
6112             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6113                              "unsupported shift or rotation.\n");
6114           return false;
6115         }
6116
6117       /* 4.1. check support for the operation in the loop  */
6118       optab = optab_for_tree_code (code, vectype_in, optab_default);
6119       if (!optab)
6120         {
6121           if (dump_enabled_p ())
6122             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6123                              "no optab.\n");
6124
6125           return false;
6126         }
6127
6128       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6129         {
6130           if (dump_enabled_p ())
6131             dump_printf (MSG_NOTE, "op not supported by target.\n");
6132
6133           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6134               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6135             return false;
6136
6137           if (dump_enabled_p ())
6138             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6139         }
6140
6141       /* Worthwhile without SIMD support?  */
6142       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6143           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6144         {
6145           if (dump_enabled_p ())
6146             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6147                              "not worthwhile without SIMD support.\n");
6148
6149           return false;
6150         }
6151     }
6152
6153   /* 4.2. Check support for the epilog operation.
6154
6155           If STMT represents a reduction pattern, then the type of the
6156           reduction variable may be different than the type of the rest
6157           of the arguments.  For example, consider the case of accumulation
6158           of shorts into an int accumulator; The original code:
6159                         S1: int_a = (int) short_a;
6160           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6161
6162           was replaced with:
6163                         STMT: int_acc = widen_sum <short_a, int_acc>
6164
6165           This means that:
6166           1. The tree-code that is used to create the vector operation in the
6167              epilog code (that reduces the partial results) is not the
6168              tree-code of STMT, but is rather the tree-code of the original
6169              stmt from the pattern that STMT is replacing.  I.e, in the example
6170              above we want to use 'widen_sum' in the loop, but 'plus' in the
6171              epilog.
6172           2. The type (mode) we use to check available target support
6173              for the vector operation to be created in the *epilog*, is
6174              determined by the type of the reduction variable (in the example
6175              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6176              However the type (mode) we use to check available target support
6177              for the vector operation to be created *inside the loop*, is
6178              determined by the type of the other arguments to STMT (in the
6179              example we'd check this: optab_handler (widen_sum_optab,
6180              vect_short_mode)).
6181
6182           This is contrary to "regular" reductions, in which the types of all
6183           the arguments are the same as the type of the reduction variable.
6184           For "regular" reductions we can therefore use the same vector type
6185           (and also the same tree-code) when generating the epilog code and
6186           when generating the code inside the loop.  */
6187
6188   if (orig_stmt)
6189     {
6190       /* This is a reduction pattern: get the vectype from the type of the
6191          reduction variable, and get the tree-code from orig_stmt.  */
6192       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6193                   == TREE_CODE_REDUCTION);
6194       orig_code = gimple_assign_rhs_code (orig_stmt);
6195       gcc_assert (vectype_out);
6196       vec_mode = TYPE_MODE (vectype_out);
6197     }
6198   else
6199     {
6200       /* Regular reduction: use the same vectype and tree-code as used for
6201          the vector code inside the loop can be used for the epilog code. */
6202       orig_code = code;
6203
6204       if (code == MINUS_EXPR)
6205         orig_code = PLUS_EXPR;
6206
6207       /* For simple condition reductions, replace with the actual expression
6208          we want to base our reduction around.  */
6209       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6210         {
6211           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6212           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6213         }
6214       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6215                == INTEGER_INDUC_COND_REDUCTION)
6216         orig_code = cond_reduc_op_code;
6217     }
6218
6219   if (nested_cycle)
6220     {
6221       def_bb = gimple_bb (reduc_def_stmt);
6222       def_stmt_loop = def_bb->loop_father;
6223       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6224                                        loop_preheader_edge (def_stmt_loop));
6225       if (TREE_CODE (def_arg) == SSA_NAME
6226           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6227           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6228           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6229           && vinfo_for_stmt (def_arg_stmt)
6230           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6231               == vect_double_reduction_def)
6232         double_reduc = true;
6233     }
6234
6235   reduc_fn = IFN_LAST;
6236
6237   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6238     {
6239       if (reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6240         {
6241           if (reduc_fn != IFN_LAST
6242               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6243                                                   OPTIMIZE_FOR_SPEED))
6244             {
6245               if (dump_enabled_p ())
6246                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6247                                  "reduc op not supported by target.\n");
6248
6249               reduc_fn = IFN_LAST;
6250             }
6251         }
6252       else
6253         {
6254           if (!nested_cycle || double_reduc)
6255             {
6256               if (dump_enabled_p ())
6257                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6258                                  "no reduc code for scalar code.\n");
6259
6260               return false;
6261             }
6262         }
6263     }
6264   else
6265     {
6266       int scalar_precision
6267         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6268       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6269       cr_index_vector_type = build_vector_type
6270         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
6271
6272       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6273                                           OPTIMIZE_FOR_SPEED))
6274         reduc_fn = IFN_REDUC_MAX;
6275     }
6276
6277   if ((double_reduc
6278        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6279       && ncopies > 1)
6280     {
6281       if (dump_enabled_p ())
6282         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6283                          "multiple types in double reduction or condition "
6284                          "reduction.\n");
6285       return false;
6286     }
6287
6288   /* In case of widenning multiplication by a constant, we update the type
6289      of the constant to be the type of the other operand.  We check that the
6290      constant fits the type in the pattern recognition pass.  */
6291   if (code == DOT_PROD_EXPR
6292       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6293     {
6294       if (TREE_CODE (ops[0]) == INTEGER_CST)
6295         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6296       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6297         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6298       else
6299         {
6300           if (dump_enabled_p ())
6301             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6302                              "invalid types in dot-prod\n");
6303
6304           return false;
6305         }
6306     }
6307
6308   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6309     {
6310       widest_int ni;
6311
6312       if (! max_loop_iterations (loop, &ni))
6313         {
6314           if (dump_enabled_p ())
6315             dump_printf_loc (MSG_NOTE, vect_location,
6316                              "loop count not known, cannot create cond "
6317                              "reduction.\n");
6318           return false;
6319         }
6320       /* Convert backedges to iterations.  */
6321       ni += 1;
6322
6323       /* The additional index will be the same type as the condition.  Check
6324          that the loop can fit into this less one (because we'll use up the
6325          zero slot for when there are no matches).  */
6326       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6327       if (wi::geu_p (ni, wi::to_widest (max_index)))
6328         {
6329           if (dump_enabled_p ())
6330             dump_printf_loc (MSG_NOTE, vect_location,
6331                              "loop size is greater than data size.\n");
6332           return false;
6333         }
6334     }
6335
6336   /* In case the vectorization factor (VF) is bigger than the number
6337      of elements that we can fit in a vectype (nunits), we have to generate
6338      more than one vector stmt - i.e - we need to "unroll" the
6339      vector stmt by a factor VF/nunits.  For more details see documentation
6340      in vectorizable_operation.  */
6341
6342   /* If the reduction is used in an outer loop we need to generate
6343      VF intermediate results, like so (e.g. for ncopies=2):
6344         r0 = phi (init, r0)
6345         r1 = phi (init, r1)
6346         r0 = x0 + r0;
6347         r1 = x1 + r1;
6348     (i.e. we generate VF results in 2 registers).
6349     In this case we have a separate def-use cycle for each copy, and therefore
6350     for each copy we get the vector def for the reduction variable from the
6351     respective phi node created for this copy.
6352
6353     Otherwise (the reduction is unused in the loop nest), we can combine
6354     together intermediate results, like so (e.g. for ncopies=2):
6355         r = phi (init, r)
6356         r = x0 + r;
6357         r = x1 + r;
6358    (i.e. we generate VF/2 results in a single register).
6359    In this case for each copy we get the vector def for the reduction variable
6360    from the vectorized reduction operation generated in the previous iteration.
6361
6362    This only works when we see both the reduction PHI and its only consumer
6363    in vectorizable_reduction and there are no intermediate stmts
6364    participating.  */
6365   use_operand_p use_p;
6366   gimple *use_stmt;
6367   if (ncopies > 1
6368       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6369       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6370       && (use_stmt == stmt
6371           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6372     {
6373       single_defuse_cycle = true;
6374       epilog_copies = 1;
6375     }
6376   else
6377     epilog_copies = ncopies;
6378
6379   /* If the reduction stmt is one of the patterns that have lane
6380      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6381   if ((ncopies > 1
6382        && ! single_defuse_cycle)
6383       && (code == DOT_PROD_EXPR
6384           || code == WIDEN_SUM_EXPR
6385           || code == SAD_EXPR))
6386     {
6387       if (dump_enabled_p ())
6388         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6389                          "multi def-use cycle not possible for lane-reducing "
6390                          "reduction operation\n");
6391       return false;
6392     }
6393
6394   if (!vec_stmt) /* transformation not required.  */
6395     {
6396       if (first_p)
6397         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
6398       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6399       return true;
6400     }
6401
6402   /* Transform.  */
6403
6404   if (dump_enabled_p ())
6405     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6406
6407   /* FORNOW: Multiple types are not supported for condition.  */
6408   if (code == COND_EXPR)
6409     gcc_assert (ncopies == 1);
6410
6411   /* Create the destination vector  */
6412   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6413
6414   prev_stmt_info = NULL;
6415   prev_phi_info = NULL;
6416   if (slp_node)
6417     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6418   else
6419     {
6420       vec_num = 1;
6421       vec_oprnds0.create (1);
6422       vec_oprnds1.create (1);
6423       if (op_type == ternary_op)
6424         vec_oprnds2.create (1);
6425     }
6426
6427   phis.create (vec_num);
6428   vect_defs.create (vec_num);
6429   if (!slp_node)
6430     vect_defs.quick_push (NULL_TREE);
6431
6432   if (slp_node)
6433     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6434   else
6435     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6436
6437   for (j = 0; j < ncopies; j++)
6438     {
6439       if (code == COND_EXPR)
6440         {
6441           gcc_assert (!slp_node);
6442           vectorizable_condition (stmt, gsi, vec_stmt,
6443                                   PHI_RESULT (phis[0]),
6444                                   reduc_index, NULL);
6445           /* Multiple types are not supported for condition.  */
6446           break;
6447         }
6448
6449       /* Handle uses.  */
6450       if (j == 0)
6451         {
6452           if (slp_node)
6453             {
6454               /* Get vec defs for all the operands except the reduction index,
6455                  ensuring the ordering of the ops in the vector is kept.  */
6456               auto_vec<tree, 3> slp_ops;
6457               auto_vec<vec<tree>, 3> vec_defs;
6458
6459               slp_ops.quick_push (ops[0]);
6460               slp_ops.quick_push (ops[1]);
6461               if (op_type == ternary_op)
6462                 slp_ops.quick_push (ops[2]);
6463
6464               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6465
6466               vec_oprnds0.safe_splice (vec_defs[0]);
6467               vec_defs[0].release ();
6468               vec_oprnds1.safe_splice (vec_defs[1]);
6469               vec_defs[1].release ();
6470               if (op_type == ternary_op)
6471                 {
6472                   vec_oprnds2.safe_splice (vec_defs[2]);
6473                   vec_defs[2].release ();
6474                 }
6475             }
6476           else
6477             {
6478               vec_oprnds0.quick_push
6479                 (vect_get_vec_def_for_operand (ops[0], stmt));
6480               vec_oprnds1.quick_push
6481                 (vect_get_vec_def_for_operand (ops[1], stmt));
6482               if (op_type == ternary_op)
6483                 vec_oprnds2.quick_push
6484                   (vect_get_vec_def_for_operand (ops[2], stmt));
6485             }
6486         }
6487       else
6488         {
6489           if (!slp_node)
6490             {
6491               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6492
6493               if (single_defuse_cycle && reduc_index == 0)
6494                 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6495               else
6496                 vec_oprnds0[0]
6497                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6498               if (single_defuse_cycle && reduc_index == 1)
6499                 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6500               else
6501                 vec_oprnds1[0]
6502                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6503               if (op_type == ternary_op)
6504                 {
6505                   if (single_defuse_cycle && reduc_index == 2)
6506                     vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6507                   else
6508                     vec_oprnds2[0]
6509                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6510                 }
6511             }
6512         }
6513
6514       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6515         {
6516           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6517           if (op_type == ternary_op)
6518             vop[2] = vec_oprnds2[i];
6519
6520           new_temp = make_ssa_name (vec_dest, new_stmt);
6521           new_stmt = gimple_build_assign (new_temp, code,
6522                                           vop[0], vop[1], vop[2]);
6523           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6524
6525           if (slp_node)
6526             {
6527               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6528               vect_defs.quick_push (new_temp);
6529             }
6530           else
6531             vect_defs[0] = new_temp;
6532         }
6533
6534       if (slp_node)
6535         continue;
6536
6537       if (j == 0)
6538         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6539       else
6540         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6541
6542       prev_stmt_info = vinfo_for_stmt (new_stmt);
6543     }
6544
6545   /* Finalize the reduction-phi (set its arguments) and create the
6546      epilog reduction code.  */
6547   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6548     vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6549
6550   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6551                                     epilog_copies, reduc_fn, phis,
6552                                     double_reduc, slp_node, slp_node_instance,
6553                                     cond_reduc_val, cond_reduc_op_code);
6554
6555   return true;
6556 }
6557
6558 /* Function vect_min_worthwhile_factor.
6559
6560    For a loop where we could vectorize the operation indicated by CODE,
6561    return the minimum vectorization factor that makes it worthwhile
6562    to use generic vectors.  */
6563 static unsigned int
6564 vect_min_worthwhile_factor (enum tree_code code)
6565 {
6566   switch (code)
6567     {
6568     case PLUS_EXPR:
6569     case MINUS_EXPR:
6570     case NEGATE_EXPR:
6571       return 4;
6572
6573     case BIT_AND_EXPR:
6574     case BIT_IOR_EXPR:
6575     case BIT_XOR_EXPR:
6576     case BIT_NOT_EXPR:
6577       return 2;
6578
6579     default:
6580       return INT_MAX;
6581     }
6582 }
6583
6584 /* Return true if VINFO indicates we are doing loop vectorization and if
6585    it is worth decomposing CODE operations into scalar operations for
6586    that loop's vectorization factor.  */
6587
6588 bool
6589 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6590 {
6591   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6592   unsigned HOST_WIDE_INT value;
6593   return (loop_vinfo
6594           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
6595           && value >= vect_min_worthwhile_factor (code));
6596 }
6597
6598 /* Function vectorizable_induction
6599
6600    Check if PHI performs an induction computation that can be vectorized.
6601    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6602    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6603    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6604
6605 bool
6606 vectorizable_induction (gimple *phi,
6607                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6608                         gimple **vec_stmt, slp_tree slp_node)
6609 {
6610   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6611   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6612   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6613   unsigned ncopies;
6614   bool nested_in_vect_loop = false;
6615   struct loop *iv_loop;
6616   tree vec_def;
6617   edge pe = loop_preheader_edge (loop);
6618   basic_block new_bb;
6619   tree new_vec, vec_init, vec_step, t;
6620   tree new_name;
6621   gimple *new_stmt;
6622   gphi *induction_phi;
6623   tree induc_def, vec_dest;
6624   tree init_expr, step_expr;
6625   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6626   unsigned i;
6627   tree expr;
6628   gimple_seq stmts;
6629   imm_use_iterator imm_iter;
6630   use_operand_p use_p;
6631   gimple *exit_phi;
6632   edge latch_e;
6633   tree loop_arg;
6634   gimple_stmt_iterator si;
6635   basic_block bb = gimple_bb (phi);
6636
6637   if (gimple_code (phi) != GIMPLE_PHI)
6638     return false;
6639
6640   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6641     return false;
6642
6643   /* Make sure it was recognized as induction computation.  */
6644   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6645     return false;
6646
6647   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6648   unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6649
6650   if (slp_node)
6651     ncopies = 1;
6652   else
6653     ncopies = vect_get_num_copies (loop_vinfo, vectype);
6654   gcc_assert (ncopies >= 1);
6655
6656   /* FORNOW. These restrictions should be relaxed.  */
6657   if (nested_in_vect_loop_p (loop, phi))
6658     {
6659       imm_use_iterator imm_iter;
6660       use_operand_p use_p;
6661       gimple *exit_phi;
6662       edge latch_e;
6663       tree loop_arg;
6664
6665       if (ncopies > 1)
6666         {
6667           if (dump_enabled_p ())
6668             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6669                              "multiple types in nested loop.\n");
6670           return false;
6671         }
6672
6673       /* FORNOW: outer loop induction with SLP not supported.  */
6674       if (STMT_SLP_TYPE (stmt_info))
6675         return false;
6676
6677       exit_phi = NULL;
6678       latch_e = loop_latch_edge (loop->inner);
6679       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6680       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6681         {
6682           gimple *use_stmt = USE_STMT (use_p);
6683           if (is_gimple_debug (use_stmt))
6684             continue;
6685
6686           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6687             {
6688               exit_phi = use_stmt;
6689               break;
6690             }
6691         }
6692       if (exit_phi)
6693         {
6694           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6695           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6696                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6697             {
6698               if (dump_enabled_p ())
6699                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6700                                  "inner-loop induction only used outside "
6701                                  "of the outer vectorized loop.\n");
6702               return false;
6703             }
6704         }
6705
6706       nested_in_vect_loop = true;
6707       iv_loop = loop->inner;
6708     }
6709   else
6710     iv_loop = loop;
6711   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6712
6713   if (!vec_stmt) /* transformation not required.  */
6714     {
6715       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6716       if (dump_enabled_p ())
6717         dump_printf_loc (MSG_NOTE, vect_location,
6718                          "=== vectorizable_induction ===\n");
6719       vect_model_induction_cost (stmt_info, ncopies);
6720       return true;
6721     }
6722
6723   /* Transform.  */
6724
6725   /* Compute a vector variable, initialized with the first VF values of
6726      the induction variable.  E.g., for an iv with IV_PHI='X' and
6727      evolution S, for a vector of 4 units, we want to compute:
6728      [X, X + S, X + 2*S, X + 3*S].  */
6729
6730   if (dump_enabled_p ())
6731     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6732
6733   latch_e = loop_latch_edge (iv_loop);
6734   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6735
6736   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6737   gcc_assert (step_expr != NULL_TREE);
6738
6739   pe = loop_preheader_edge (iv_loop);
6740   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6741                                      loop_preheader_edge (iv_loop));
6742
6743   /* Convert the step to the desired type.  */
6744   stmts = NULL;
6745   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6746   if (stmts)
6747     {
6748       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6749       gcc_assert (!new_bb);
6750     }
6751
6752   /* Find the first insertion point in the BB.  */
6753   si = gsi_after_labels (bb);
6754
6755   /* For SLP induction we have to generate several IVs as for example
6756      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6757      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
6758      [VF*S, VF*S, VF*S, VF*S] for all.  */
6759   if (slp_node)
6760     {
6761       /* Convert the init to the desired type.  */
6762       stmts = NULL;
6763       init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6764       if (stmts)
6765         {
6766           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6767           gcc_assert (!new_bb);
6768         }
6769
6770       /* Generate [VF*S, VF*S, ... ].  */
6771       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6772         {
6773           expr = build_int_cst (integer_type_node, vf);
6774           expr = fold_convert (TREE_TYPE (step_expr), expr);
6775         }
6776       else
6777         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6778       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6779                               expr, step_expr);
6780       if (! CONSTANT_CLASS_P (new_name))
6781         new_name = vect_init_vector (phi, new_name,
6782                                      TREE_TYPE (step_expr), NULL);
6783       new_vec = build_vector_from_val (vectype, new_name);
6784       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6785
6786       /* Now generate the IVs.  */
6787       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6788       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6789       unsigned elts = nunits * nvects;
6790       unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6791       gcc_assert (elts % group_size == 0);
6792       tree elt = init_expr;
6793       unsigned ivn;
6794       for (ivn = 0; ivn < nivs; ++ivn)
6795         {
6796           tree_vector_builder elts (vectype, nunits, 1);
6797           stmts = NULL;
6798           for (unsigned eltn = 0; eltn < nunits; ++eltn)
6799             {
6800               if (ivn*nunits + eltn >= group_size
6801                   && (ivn*nunits + eltn) % group_size == 0)
6802                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6803                                     elt, step_expr);
6804               elts.quick_push (elt);
6805             }
6806           vec_init = gimple_build_vector (&stmts, &elts);
6807           if (stmts)
6808             {
6809               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6810               gcc_assert (!new_bb);
6811             }
6812
6813           /* Create the induction-phi that defines the induction-operand.  */
6814           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6815           induction_phi = create_phi_node (vec_dest, iv_loop->header);
6816           set_vinfo_for_stmt (induction_phi,
6817                               new_stmt_vec_info (induction_phi, loop_vinfo));
6818           induc_def = PHI_RESULT (induction_phi);
6819
6820           /* Create the iv update inside the loop  */
6821           vec_def = make_ssa_name (vec_dest);
6822           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6823           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6824           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6825
6826           /* Set the arguments of the phi node:  */
6827           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6828           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6829                        UNKNOWN_LOCATION);
6830
6831           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6832         }
6833
6834       /* Re-use IVs when we can.  */
6835       if (ivn < nvects)
6836         {
6837           unsigned vfp
6838             = least_common_multiple (group_size, nunits) / group_size;
6839           /* Generate [VF'*S, VF'*S, ... ].  */
6840           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6841             {
6842               expr = build_int_cst (integer_type_node, vfp);
6843               expr = fold_convert (TREE_TYPE (step_expr), expr);
6844             }
6845           else
6846             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6847           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6848                                   expr, step_expr);
6849           if (! CONSTANT_CLASS_P (new_name))
6850             new_name = vect_init_vector (phi, new_name,
6851                                          TREE_TYPE (step_expr), NULL);
6852           new_vec = build_vector_from_val (vectype, new_name);
6853           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6854           for (; ivn < nvects; ++ivn)
6855             {
6856               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6857               tree def;
6858               if (gimple_code (iv) == GIMPLE_PHI)
6859                 def = gimple_phi_result (iv);
6860               else
6861                 def = gimple_assign_lhs (iv);
6862               new_stmt = gimple_build_assign (make_ssa_name (vectype),
6863                                               PLUS_EXPR,
6864                                               def, vec_step);
6865               if (gimple_code (iv) == GIMPLE_PHI)
6866                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6867               else
6868                 {
6869                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6870                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6871                 }
6872               set_vinfo_for_stmt (new_stmt,
6873                                   new_stmt_vec_info (new_stmt, loop_vinfo));
6874               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6875             }
6876         }
6877
6878       return true;
6879     }
6880
6881   /* Create the vector that holds the initial_value of the induction.  */
6882   if (nested_in_vect_loop)
6883     {
6884       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
6885          been created during vectorization of previous stmts.  We obtain it
6886          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
6887       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6888       /* If the initial value is not of proper type, convert it.  */
6889       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6890         {
6891           new_stmt
6892             = gimple_build_assign (vect_get_new_ssa_name (vectype,
6893                                                           vect_simple_var,
6894                                                           "vec_iv_"),
6895                                    VIEW_CONVERT_EXPR,
6896                                    build1 (VIEW_CONVERT_EXPR, vectype,
6897                                            vec_init));
6898           vec_init = gimple_assign_lhs (new_stmt);
6899           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6900                                                  new_stmt);
6901           gcc_assert (!new_bb);
6902           set_vinfo_for_stmt (new_stmt,
6903                               new_stmt_vec_info (new_stmt, loop_vinfo));
6904         }
6905     }
6906   else
6907     {
6908       /* iv_loop is the loop to be vectorized. Create:
6909          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
6910       stmts = NULL;
6911       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6912
6913       tree_vector_builder elts (vectype, nunits, 1);
6914       elts.quick_push (new_name);
6915       for (i = 1; i < nunits; i++)
6916         {
6917           /* Create: new_name_i = new_name + step_expr  */
6918           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6919                                    new_name, step_expr);
6920           elts.quick_push (new_name);
6921         }
6922       /* Create a vector from [new_name_0, new_name_1, ...,
6923          new_name_nunits-1]  */
6924       vec_init = gimple_build_vector (&stmts, &elts);
6925       if (stmts)
6926         {
6927           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6928           gcc_assert (!new_bb);
6929         }
6930     }
6931
6932
6933   /* Create the vector that holds the step of the induction.  */
6934   if (nested_in_vect_loop)
6935     /* iv_loop is nested in the loop to be vectorized. Generate:
6936        vec_step = [S, S, S, S]  */
6937     new_name = step_expr;
6938   else
6939     {
6940       /* iv_loop is the loop to be vectorized. Generate:
6941           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
6942       gimple_seq seq = NULL;
6943       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6944         {
6945           expr = build_int_cst (integer_type_node, vf);
6946           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6947         }
6948       else
6949         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6950       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6951                                expr, step_expr);
6952       if (seq)
6953         {
6954           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6955           gcc_assert (!new_bb);
6956         }
6957     }
6958
6959   t = unshare_expr (new_name);
6960   gcc_assert (CONSTANT_CLASS_P (new_name)
6961               || TREE_CODE (new_name) == SSA_NAME);
6962   new_vec = build_vector_from_val (vectype, t);
6963   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6964
6965
6966   /* Create the following def-use cycle:
6967      loop prolog:
6968          vec_init = ...
6969          vec_step = ...
6970      loop:
6971          vec_iv = PHI <vec_init, vec_loop>
6972          ...
6973          STMT
6974          ...
6975          vec_loop = vec_iv + vec_step;  */
6976
6977   /* Create the induction-phi that defines the induction-operand.  */
6978   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6979   induction_phi = create_phi_node (vec_dest, iv_loop->header);
6980   set_vinfo_for_stmt (induction_phi,
6981                       new_stmt_vec_info (induction_phi, loop_vinfo));
6982   induc_def = PHI_RESULT (induction_phi);
6983
6984   /* Create the iv update inside the loop  */
6985   vec_def = make_ssa_name (vec_dest);
6986   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6987   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6988   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6989
6990   /* Set the arguments of the phi node:  */
6991   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6992   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6993                UNKNOWN_LOCATION);
6994
6995   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
6996
6997   /* In case that vectorization factor (VF) is bigger than the number
6998      of elements that we can fit in a vectype (nunits), we have to generate
6999      more than one vector stmt - i.e - we need to "unroll" the
7000      vector stmt by a factor VF/nunits.  For more details see documentation
7001      in vectorizable_operation.  */
7002
7003   if (ncopies > 1)
7004     {
7005       gimple_seq seq = NULL;
7006       stmt_vec_info prev_stmt_vinfo;
7007       /* FORNOW. This restriction should be relaxed.  */
7008       gcc_assert (!nested_in_vect_loop);
7009
7010       /* Create the vector that holds the step of the induction.  */
7011       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7012         {
7013           expr = build_int_cst (integer_type_node, nunits);
7014           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7015         }
7016       else
7017         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7018       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7019                                expr, step_expr);
7020       if (seq)
7021         {
7022           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7023           gcc_assert (!new_bb);
7024         }
7025
7026       t = unshare_expr (new_name);
7027       gcc_assert (CONSTANT_CLASS_P (new_name)
7028                   || TREE_CODE (new_name) == SSA_NAME);
7029       new_vec = build_vector_from_val (vectype, t);
7030       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7031
7032       vec_def = induc_def;
7033       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7034       for (i = 1; i < ncopies; i++)
7035         {
7036           /* vec_i = vec_prev + vec_step  */
7037           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7038                                           vec_def, vec_step);
7039           vec_def = make_ssa_name (vec_dest, new_stmt);
7040           gimple_assign_set_lhs (new_stmt, vec_def);
7041
7042           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7043           set_vinfo_for_stmt (new_stmt,
7044                               new_stmt_vec_info (new_stmt, loop_vinfo));
7045           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7046           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7047         }
7048     }
7049
7050   if (nested_in_vect_loop)
7051     {
7052       /* Find the loop-closed exit-phi of the induction, and record
7053          the final vector of induction results:  */
7054       exit_phi = NULL;
7055       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7056         {
7057           gimple *use_stmt = USE_STMT (use_p);
7058           if (is_gimple_debug (use_stmt))
7059             continue;
7060
7061           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7062             {
7063               exit_phi = use_stmt;
7064               break;
7065             }
7066         }
7067       if (exit_phi)
7068         {
7069           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7070           /* FORNOW. Currently not supporting the case that an inner-loop induction
7071              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7072           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7073                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7074
7075           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7076           if (dump_enabled_p ())
7077             {
7078               dump_printf_loc (MSG_NOTE, vect_location,
7079                                "vector of inductions after inner-loop:");
7080               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7081             }
7082         }
7083     }
7084
7085
7086   if (dump_enabled_p ())
7087     {
7088       dump_printf_loc (MSG_NOTE, vect_location,
7089                        "transform induction: created def-use cycle: ");
7090       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7091       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7092                         SSA_NAME_DEF_STMT (vec_def), 0);
7093     }
7094
7095   return true;
7096 }
7097
7098 /* Function vectorizable_live_operation.
7099
7100    STMT computes a value that is used outside the loop.  Check if
7101    it can be supported.  */
7102
7103 bool
7104 vectorizable_live_operation (gimple *stmt,
7105                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7106                              slp_tree slp_node, int slp_index,
7107                              gimple **vec_stmt)
7108 {
7109   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7110   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7111   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7112   imm_use_iterator imm_iter;
7113   tree lhs, lhs_type, bitsize, vec_bitsize;
7114   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7115   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
7116   int ncopies;
7117   gimple *use_stmt;
7118   auto_vec<tree> vec_oprnds;
7119
7120   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7121
7122   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7123     return false;
7124
7125   /* FORNOW.  CHECKME.  */
7126   if (nested_in_vect_loop_p (loop, stmt))
7127     return false;
7128
7129   /* If STMT is not relevant and it is a simple assignment and its inputs are
7130      invariant then it can remain in place, unvectorized.  The original last
7131      scalar value that it computes will be used.  */
7132   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7133     {
7134       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7135       if (dump_enabled_p ())
7136         dump_printf_loc (MSG_NOTE, vect_location,
7137                          "statement is simple and uses invariant.  Leaving in "
7138                          "place.\n");
7139       return true;
7140     }
7141
7142   if (slp_node)
7143     ncopies = 1;
7144   else
7145     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7146
7147   if (!vec_stmt)
7148     /* No transformation required.  */
7149     return true;
7150
7151   /* If stmt has a related stmt, then use that for getting the lhs.  */
7152   if (is_pattern_stmt_p (stmt_info))
7153     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7154
7155   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7156         : gimple_get_lhs (stmt);
7157   lhs_type = TREE_TYPE (lhs);
7158
7159   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7160              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7161              : TYPE_SIZE (TREE_TYPE (vectype)));
7162   vec_bitsize = TYPE_SIZE (vectype);
7163
7164   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7165   tree vec_lhs, bitstart;
7166   if (slp_node)
7167     {
7168       gcc_assert (slp_index >= 0);
7169
7170       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7171       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7172
7173       /* Get the last occurrence of the scalar index from the concatenation of
7174          all the slp vectors. Calculate which slp vector it is and the index
7175          within.  */
7176       int pos = (num_vec * nunits) - num_scalar + slp_index;
7177       int vec_entry = pos / nunits;
7178       int vec_index = pos % nunits;
7179
7180       /* Get the correct slp vectorized stmt.  */
7181       vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7182
7183       /* Get entry to use.  */
7184       bitstart = bitsize_int (vec_index);
7185       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7186     }
7187   else
7188     {
7189       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7190       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7191
7192       /* For multiple copies, get the last copy.  */
7193       for (int i = 1; i < ncopies; ++i)
7194         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7195                                                   vec_lhs);
7196
7197       /* Get the last lane in the vector.  */
7198       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7199     }
7200
7201   /* Create a new vectorized stmt for the uses of STMT and insert outside the
7202      loop.  */
7203   gimple_seq stmts = NULL;
7204   tree bftype = TREE_TYPE (vectype);
7205   if (VECTOR_BOOLEAN_TYPE_P (vectype))
7206     bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7207   tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7208   new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7209                                    true, NULL_TREE);
7210   if (stmts)
7211     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7212
7213   /* Replace use of lhs with newly computed result.  If the use stmt is a
7214      single arg PHI, just replace all uses of PHI result.  It's necessary
7215      because lcssa PHI defining lhs may be before newly inserted stmt.  */
7216   use_operand_p use_p;
7217   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7218     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7219         && !is_gimple_debug (use_stmt))
7220     {
7221       if (gimple_code (use_stmt) == GIMPLE_PHI
7222           && gimple_phi_num_args (use_stmt) == 1)
7223         {
7224           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7225         }
7226       else
7227         {
7228           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7229             SET_USE (use_p, new_tree);
7230         }
7231       update_stmt (use_stmt);
7232     }
7233
7234   return true;
7235 }
7236
7237 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
7238
7239 static void
7240 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7241 {
7242   ssa_op_iter op_iter;
7243   imm_use_iterator imm_iter;
7244   def_operand_p def_p;
7245   gimple *ustmt;
7246
7247   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7248     {
7249       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7250         {
7251           basic_block bb;
7252
7253           if (!is_gimple_debug (ustmt))
7254             continue;
7255
7256           bb = gimple_bb (ustmt);
7257
7258           if (!flow_bb_inside_loop_p (loop, bb))
7259             {
7260               if (gimple_debug_bind_p (ustmt))
7261                 {
7262                   if (dump_enabled_p ())
7263                     dump_printf_loc (MSG_NOTE, vect_location,
7264                                      "killing debug use\n");
7265
7266                   gimple_debug_bind_reset_value (ustmt);
7267                   update_stmt (ustmt);
7268                 }
7269               else
7270                 gcc_unreachable ();
7271             }
7272         }
7273     }
7274 }
7275
7276 /* Given loop represented by LOOP_VINFO, return true if computation of
7277    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7278    otherwise.  */
7279
7280 static bool
7281 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7282 {
7283   /* Constant case.  */
7284   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7285     {
7286       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7287       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7288
7289       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7290       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7291       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7292         return true;
7293     }
7294
7295   widest_int max;
7296   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7297   /* Check the upper bound of loop niters.  */
7298   if (get_max_loop_iterations (loop, &max))
7299     {
7300       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7301       signop sgn = TYPE_SIGN (type);
7302       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7303       if (max < type_max)
7304         return true;
7305     }
7306   return false;
7307 }
7308
7309 /* Scale profiling counters by estimation for LOOP which is vectorized
7310    by factor VF.  */
7311
7312 static void
7313 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7314 {
7315   edge preheader = loop_preheader_edge (loop);
7316   /* Reduce loop iterations by the vectorization factor.  */
7317   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7318   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7319
7320   if (freq_h.nonzero_p ())
7321     {
7322       profile_probability p;
7323
7324       /* Avoid dropping loop body profile counter to 0 because of zero count
7325          in loop's preheader.  */
7326       if (!(freq_e == profile_count::zero ()))
7327         freq_e = freq_e.force_nonzero ();
7328       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7329       scale_loop_frequencies (loop, p);
7330     }
7331
7332   edge exit_e = single_exit (loop);
7333   exit_e->probability = profile_probability::always ()
7334                                  .apply_scale (1, new_est_niter + 1);
7335
7336   edge exit_l = single_pred_edge (loop->latch);
7337   profile_probability prob = exit_l->probability;
7338   exit_l->probability = exit_e->probability.invert ();
7339   if (prob.initialized_p () && exit_l->probability.initialized_p ())
7340     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7341 }
7342
7343 /* Function vect_transform_loop.
7344
7345    The analysis phase has determined that the loop is vectorizable.
7346    Vectorize the loop - created vectorized stmts to replace the scalar
7347    stmts in the loop, and update the loop exit condition.
7348    Returns scalar epilogue loop if any.  */
7349
7350 struct loop *
7351 vect_transform_loop (loop_vec_info loop_vinfo)
7352 {
7353   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7354   struct loop *epilogue = NULL;
7355   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7356   int nbbs = loop->num_nodes;
7357   int i;
7358   tree niters_vector = NULL_TREE;
7359   tree step_vector = NULL_TREE;
7360   tree niters_vector_mult_vf = NULL_TREE;
7361   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7362   unsigned int lowest_vf = constant_lower_bound (vf);
7363   bool grouped_store;
7364   bool slp_scheduled = false;
7365   gimple *stmt, *pattern_stmt;
7366   gimple_seq pattern_def_seq = NULL;
7367   gimple_stmt_iterator pattern_def_si = gsi_none ();
7368   bool transform_pattern_stmt = false;
7369   bool check_profitability = false;
7370   unsigned int th;
7371
7372   if (dump_enabled_p ())
7373     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
7374
7375   /* Use the more conservative vectorization threshold.  If the number
7376      of iterations is constant assume the cost check has been performed
7377      by our caller.  If the threshold makes all loops profitable that
7378      run at least the (estimated) vectorization factor number of times
7379      checking is pointless, too.  */
7380   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7381   if (th >= vect_vf_for_cost (loop_vinfo)
7382       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7383     {
7384       if (dump_enabled_p ())
7385         dump_printf_loc (MSG_NOTE, vect_location,
7386                          "Profitability threshold is %d loop iterations.\n",
7387                          th);
7388       check_profitability = true;
7389     }
7390
7391   /* Make sure there exists a single-predecessor exit bb.  Do this before
7392      versioning.   */
7393   edge e = single_exit (loop);
7394   if (! single_pred_p (e->dest))
7395     {
7396       split_loop_exit_edge (e);
7397       if (dump_enabled_p ())
7398         dump_printf (MSG_NOTE, "split exit edge\n");
7399     }
7400
7401   /* Version the loop first, if required, so the profitability check
7402      comes first.  */
7403
7404   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7405     {
7406       poly_uint64 versioning_threshold
7407         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
7408       if (check_profitability
7409           && ordered_p (poly_uint64 (th), versioning_threshold))
7410         {
7411           versioning_threshold = ordered_max (poly_uint64 (th),
7412                                               versioning_threshold);
7413           check_profitability = false;
7414         }
7415       vect_loop_versioning (loop_vinfo, th, check_profitability,
7416                             versioning_threshold);
7417       check_profitability = false;
7418     }
7419
7420   /* Make sure there exists a single-predecessor exit bb also on the
7421      scalar loop copy.  Do this after versioning but before peeling
7422      so CFG structure is fine for both scalar and if-converted loop
7423      to make slpeel_duplicate_current_defs_from_edges face matched
7424      loop closed PHI nodes on the exit.  */
7425   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7426     {
7427       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7428       if (! single_pred_p (e->dest))
7429         {
7430           split_loop_exit_edge (e);
7431           if (dump_enabled_p ())
7432             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7433         }
7434     }
7435
7436   tree niters = vect_build_loop_niters (loop_vinfo);
7437   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7438   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7439   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7440   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
7441                               &step_vector, &niters_vector_mult_vf, th,
7442                               check_profitability, niters_no_overflow);
7443   if (niters_vector == NULL_TREE)
7444     {
7445       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && known_eq (lowest_vf, vf))
7446         {
7447           niters_vector
7448             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7449                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
7450           step_vector = build_one_cst (TREE_TYPE (niters));
7451         }
7452       else
7453         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7454                                      &step_vector, niters_no_overflow);
7455     }
7456
7457   /* 1) Make sure the loop header has exactly two entries
7458      2) Make sure we have a preheader basic block.  */
7459
7460   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7461
7462   split_edge (loop_preheader_edge (loop));
7463
7464   /* FORNOW: the vectorizer supports only loops which body consist
7465      of one basic block (header + empty latch). When the vectorizer will
7466      support more involved loop forms, the order by which the BBs are
7467      traversed need to be reconsidered.  */
7468
7469   for (i = 0; i < nbbs; i++)
7470     {
7471       basic_block bb = bbs[i];
7472       stmt_vec_info stmt_info;
7473
7474       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7475            gsi_next (&si))
7476         {
7477           gphi *phi = si.phi ();
7478           if (dump_enabled_p ())
7479             {
7480               dump_printf_loc (MSG_NOTE, vect_location,
7481                                "------>vectorizing phi: ");
7482               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7483             }
7484           stmt_info = vinfo_for_stmt (phi);
7485           if (!stmt_info)
7486             continue;
7487
7488           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7489             vect_loop_kill_debug_uses (loop, phi);
7490
7491           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7492               && !STMT_VINFO_LIVE_P (stmt_info))
7493             continue;
7494
7495           if (STMT_VINFO_VECTYPE (stmt_info)
7496               && (maybe_ne
7497                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
7498               && dump_enabled_p ())
7499             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7500
7501           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7502                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7503                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7504               && ! PURE_SLP_STMT (stmt_info))
7505             {
7506               if (dump_enabled_p ())
7507                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7508               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7509             }
7510         }
7511
7512       pattern_stmt = NULL;
7513       for (gimple_stmt_iterator si = gsi_start_bb (bb);
7514            !gsi_end_p (si) || transform_pattern_stmt;)
7515         {
7516           bool is_store;
7517
7518           if (transform_pattern_stmt)
7519             stmt = pattern_stmt;
7520           else
7521             {
7522               stmt = gsi_stmt (si);
7523               /* During vectorization remove existing clobber stmts.  */
7524               if (gimple_clobber_p (stmt))
7525                 {
7526                   unlink_stmt_vdef (stmt);
7527                   gsi_remove (&si, true);
7528                   release_defs (stmt);
7529                   continue;
7530                 }
7531             }
7532
7533           if (dump_enabled_p ())
7534             {
7535               dump_printf_loc (MSG_NOTE, vect_location,
7536                                "------>vectorizing statement: ");
7537               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7538             }
7539
7540           stmt_info = vinfo_for_stmt (stmt);
7541
7542           /* vector stmts created in the outer-loop during vectorization of
7543              stmts in an inner-loop may not have a stmt_info, and do not
7544              need to be vectorized.  */
7545           if (!stmt_info)
7546             {
7547               gsi_next (&si);
7548               continue;
7549             }
7550
7551           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7552             vect_loop_kill_debug_uses (loop, stmt);
7553
7554           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7555               && !STMT_VINFO_LIVE_P (stmt_info))
7556             {
7557               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7558                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7559                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7560                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7561                 {
7562                   stmt = pattern_stmt;
7563                   stmt_info = vinfo_for_stmt (stmt);
7564                 }
7565               else
7566                 {
7567                   gsi_next (&si);
7568                   continue;
7569                 }
7570             }
7571           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7572                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7573                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7574                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7575             transform_pattern_stmt = true;
7576
7577           /* If pattern statement has def stmts, vectorize them too.  */
7578           if (is_pattern_stmt_p (stmt_info))
7579             {
7580               if (pattern_def_seq == NULL)
7581                 {
7582                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7583                   pattern_def_si = gsi_start (pattern_def_seq);
7584                 }
7585               else if (!gsi_end_p (pattern_def_si))
7586                 gsi_next (&pattern_def_si);
7587               if (pattern_def_seq != NULL)
7588                 {
7589                   gimple *pattern_def_stmt = NULL;
7590                   stmt_vec_info pattern_def_stmt_info = NULL;
7591
7592                   while (!gsi_end_p (pattern_def_si))
7593                     {
7594                       pattern_def_stmt = gsi_stmt (pattern_def_si);
7595                       pattern_def_stmt_info
7596                         = vinfo_for_stmt (pattern_def_stmt);
7597                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7598                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7599                         break;
7600                       gsi_next (&pattern_def_si);
7601                     }
7602
7603                   if (!gsi_end_p (pattern_def_si))
7604                     {
7605                       if (dump_enabled_p ())
7606                         {
7607                           dump_printf_loc (MSG_NOTE, vect_location,
7608                                            "==> vectorizing pattern def "
7609                                            "stmt: ");
7610                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7611                                             pattern_def_stmt, 0);
7612                         }
7613
7614                       stmt = pattern_def_stmt;
7615                       stmt_info = pattern_def_stmt_info;
7616                     }
7617                   else
7618                     {
7619                       pattern_def_si = gsi_none ();
7620                       transform_pattern_stmt = false;
7621                     }
7622                 }
7623               else
7624                 transform_pattern_stmt = false;
7625             }
7626
7627           if (STMT_VINFO_VECTYPE (stmt_info))
7628             {
7629               unsigned int nunits
7630                 = (unsigned int)
7631                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7632               if (!STMT_SLP_TYPE (stmt_info)
7633                   && maybe_ne (nunits, vf)
7634                   && dump_enabled_p ())
7635                   /* For SLP VF is set according to unrolling factor, and not
7636                      to vector size, hence for SLP this print is not valid.  */
7637                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7638             }
7639
7640           /* SLP. Schedule all the SLP instances when the first SLP stmt is
7641              reached.  */
7642           if (STMT_SLP_TYPE (stmt_info))
7643             {
7644               if (!slp_scheduled)
7645                 {
7646                   slp_scheduled = true;
7647
7648                   if (dump_enabled_p ())
7649                     dump_printf_loc (MSG_NOTE, vect_location,
7650                                      "=== scheduling SLP instances ===\n");
7651
7652                   vect_schedule_slp (loop_vinfo);
7653                 }
7654
7655               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
7656               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7657                 {
7658                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7659                     {
7660                       pattern_def_seq = NULL;
7661                       gsi_next (&si);
7662                     }
7663                   continue;
7664                 }
7665             }
7666
7667           /* -------- vectorize statement ------------ */
7668           if (dump_enabled_p ())
7669             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7670
7671           grouped_store = false;
7672           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7673           if (is_store)
7674             {
7675               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7676                 {
7677                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7678                      interleaving chain was completed - free all the stores in
7679                      the chain.  */
7680                   gsi_next (&si);
7681                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7682                 }
7683               else
7684                 {
7685                   /* Free the attached stmt_vec_info and remove the stmt.  */
7686                   gimple *store = gsi_stmt (si);
7687                   free_stmt_vec_info (store);
7688                   unlink_stmt_vdef (store);
7689                   gsi_remove (&si, true);
7690                   release_defs (store);
7691                 }
7692
7693               /* Stores can only appear at the end of pattern statements.  */
7694               gcc_assert (!transform_pattern_stmt);
7695               pattern_def_seq = NULL;
7696             }
7697           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7698             {
7699               pattern_def_seq = NULL;
7700               gsi_next (&si);
7701             }
7702         }                       /* stmts in BB */
7703     }                           /* BBs in loop */
7704
7705   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
7706      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
7707   if (integer_onep (step_vector))
7708     niters_no_overflow = true;
7709   slpeel_make_loop_iterate_ntimes (loop, niters_vector, step_vector,
7710                                    niters_vector_mult_vf,
7711                                    !niters_no_overflow);
7712
7713   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
7714   scale_profile_for_vect_loop (loop, assumed_vf);
7715
7716   /* The minimum number of iterations performed by the epilogue.  This
7717      is 1 when peeling for gaps because we always need a final scalar
7718      iteration.  */
7719   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7720   /* +1 to convert latch counts to loop iteration counts,
7721      -min_epilogue_iters to remove iterations that cannot be performed
7722        by the vector code.  */
7723   int bias = 1 - min_epilogue_iters;
7724   /* In these calculations the "- 1" converts loop iteration counts
7725      back to latch counts.  */
7726   if (loop->any_upper_bound)
7727     loop->nb_iterations_upper_bound
7728       = wi::udiv_floor (loop->nb_iterations_upper_bound + bias,
7729                         lowest_vf) - 1;
7730   if (loop->any_likely_upper_bound)
7731     loop->nb_iterations_likely_upper_bound
7732       = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias,
7733                         lowest_vf) - 1;
7734   if (loop->any_estimate)
7735     loop->nb_iterations_estimate
7736       = wi::udiv_floor (loop->nb_iterations_estimate + bias,
7737                         assumed_vf) - 1;
7738
7739   if (dump_enabled_p ())
7740     {
7741       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7742         {
7743           dump_printf_loc (MSG_NOTE, vect_location,
7744                            "LOOP VECTORIZED\n");
7745           if (loop->inner)
7746             dump_printf_loc (MSG_NOTE, vect_location,
7747                              "OUTER LOOP VECTORIZED\n");
7748           dump_printf (MSG_NOTE, "\n");
7749         }
7750       else
7751         dump_printf_loc (MSG_NOTE, vect_location,
7752                          "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
7753                          current_vector_size);
7754     }
7755
7756   /* Free SLP instances here because otherwise stmt reference counting
7757      won't work.  */
7758   slp_instance instance;
7759   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7760     vect_free_slp_instance (instance);
7761   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7762   /* Clear-up safelen field since its value is invalid after vectorization
7763      since vectorized loop can have loop-carried dependencies.  */
7764   loop->safelen = 0;
7765
7766   /* Don't vectorize epilogue for epilogue.  */
7767   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7768     epilogue = NULL;
7769
7770   if (epilogue)
7771     {
7772         unsigned int vector_sizes
7773           = targetm.vectorize.autovectorize_vector_sizes ();
7774         vector_sizes &= current_vector_size - 1;
7775
7776         if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7777           epilogue = NULL;
7778         else if (!vector_sizes)
7779           epilogue = NULL;
7780         else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7781                  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
7782                  && known_eq (vf, lowest_vf))
7783           {
7784             int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
7785             int ratio = current_vector_size / smallest_vec_size;
7786             unsigned HOST_WIDE_INT eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
7787               - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
7788             eiters = eiters % lowest_vf;
7789
7790             epilogue->nb_iterations_upper_bound = eiters - 1;
7791
7792             if (eiters < lowest_vf / ratio)
7793               epilogue = NULL;
7794             }
7795     }
7796
7797   if (epilogue)
7798     {
7799       epilogue->force_vectorize = loop->force_vectorize;
7800       epilogue->safelen = loop->safelen;
7801       epilogue->dont_vectorize = false;
7802
7803       /* We may need to if-convert epilogue to vectorize it.  */
7804       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7805         tree_if_conversion (epilogue);
7806     }
7807
7808   return epilogue;
7809 }
7810
7811 /* The code below is trying to perform simple optimization - revert
7812    if-conversion for masked stores, i.e. if the mask of a store is zero
7813    do not perform it and all stored value producers also if possible.
7814    For example,
7815      for (i=0; i<n; i++)
7816        if (c[i])
7817         {
7818           p1[i] += 1;
7819           p2[i] = p3[i] +2;
7820         }
7821    this transformation will produce the following semi-hammock:
7822
7823    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7824      {
7825        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7826        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7827        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7828        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7829        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7830        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7831      }
7832 */
7833
7834 void
7835 optimize_mask_stores (struct loop *loop)
7836 {
7837   basic_block *bbs = get_loop_body (loop);
7838   unsigned nbbs = loop->num_nodes;
7839   unsigned i;
7840   basic_block bb;
7841   struct loop *bb_loop;
7842   gimple_stmt_iterator gsi;
7843   gimple *stmt;
7844   auto_vec<gimple *> worklist;
7845
7846   vect_location = find_loop_location (loop);
7847   /* Pick up all masked stores in loop if any.  */
7848   for (i = 0; i < nbbs; i++)
7849     {
7850       bb = bbs[i];
7851       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7852            gsi_next (&gsi))
7853         {
7854           stmt = gsi_stmt (gsi);
7855           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7856             worklist.safe_push (stmt);
7857         }
7858     }
7859
7860   free (bbs);
7861   if (worklist.is_empty ())
7862     return;
7863
7864   /* Loop has masked stores.  */
7865   while (!worklist.is_empty ())
7866     {
7867       gimple *last, *last_store;
7868       edge e, efalse;
7869       tree mask;
7870       basic_block store_bb, join_bb;
7871       gimple_stmt_iterator gsi_to;
7872       tree vdef, new_vdef;
7873       gphi *phi;
7874       tree vectype;
7875       tree zero;
7876
7877       last = worklist.pop ();
7878       mask = gimple_call_arg (last, 2);
7879       bb = gimple_bb (last);
7880       /* Create then_bb and if-then structure in CFG, then_bb belongs to
7881          the same loop as if_bb.  It could be different to LOOP when two
7882          level loop-nest is vectorized and mask_store belongs to the inner
7883          one.  */
7884       e = split_block (bb, last);
7885       bb_loop = bb->loop_father;
7886       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7887       join_bb = e->dest;
7888       store_bb = create_empty_bb (bb);
7889       add_bb_to_loop (store_bb, bb_loop);
7890       e->flags = EDGE_TRUE_VALUE;
7891       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7892       /* Put STORE_BB to likely part.  */
7893       efalse->probability = profile_probability::unlikely ();
7894       store_bb->count = efalse->count ();
7895       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7896       if (dom_info_available_p (CDI_DOMINATORS))
7897         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7898       if (dump_enabled_p ())
7899         dump_printf_loc (MSG_NOTE, vect_location,
7900                          "Create new block %d to sink mask stores.",
7901                          store_bb->index);
7902       /* Create vector comparison with boolean result.  */
7903       vectype = TREE_TYPE (mask);
7904       zero = build_zero_cst (vectype);
7905       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7906       gsi = gsi_last_bb (bb);
7907       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7908       /* Create new PHI node for vdef of the last masked store:
7909          .MEM_2 = VDEF <.MEM_1>
7910          will be converted to
7911          .MEM.3 = VDEF <.MEM_1>
7912          and new PHI node will be created in join bb
7913          .MEM_2 = PHI <.MEM_1, .MEM_3>
7914       */
7915       vdef = gimple_vdef (last);
7916       new_vdef = make_ssa_name (gimple_vop (cfun), last);
7917       gimple_set_vdef (last, new_vdef);
7918       phi = create_phi_node (vdef, join_bb);
7919       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7920
7921       /* Put all masked stores with the same mask to STORE_BB if possible.  */
7922       while (true)
7923         {
7924           gimple_stmt_iterator gsi_from;
7925           gimple *stmt1 = NULL;
7926
7927           /* Move masked store to STORE_BB.  */
7928           last_store = last;
7929           gsi = gsi_for_stmt (last);
7930           gsi_from = gsi;
7931           /* Shift GSI to the previous stmt for further traversal.  */
7932           gsi_prev (&gsi);
7933           gsi_to = gsi_start_bb (store_bb);
7934           gsi_move_before (&gsi_from, &gsi_to);
7935           /* Setup GSI_TO to the non-empty block start.  */
7936           gsi_to = gsi_start_bb (store_bb);
7937           if (dump_enabled_p ())
7938             {
7939               dump_printf_loc (MSG_NOTE, vect_location,
7940                                "Move stmt to created bb\n");
7941               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7942             }
7943           /* Move all stored value producers if possible.  */
7944           while (!gsi_end_p (gsi))
7945             {
7946               tree lhs;
7947               imm_use_iterator imm_iter;
7948               use_operand_p use_p;
7949               bool res;
7950
7951               /* Skip debug statements.  */
7952               if (is_gimple_debug (gsi_stmt (gsi)))
7953                 {
7954                   gsi_prev (&gsi);
7955                   continue;
7956                 }
7957               stmt1 = gsi_stmt (gsi);
7958               /* Do not consider statements writing to memory or having
7959                  volatile operand.  */
7960               if (gimple_vdef (stmt1)
7961                   || gimple_has_volatile_ops (stmt1))
7962                 break;
7963               gsi_from = gsi;
7964               gsi_prev (&gsi);
7965               lhs = gimple_get_lhs (stmt1);
7966               if (!lhs)
7967                 break;
7968
7969               /* LHS of vectorized stmt must be SSA_NAME.  */
7970               if (TREE_CODE (lhs) != SSA_NAME)
7971                 break;
7972
7973               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7974                 {
7975                   /* Remove dead scalar statement.  */
7976                   if (has_zero_uses (lhs))
7977                     {
7978                       gsi_remove (&gsi_from, true);
7979                       continue;
7980                     }
7981                 }
7982
7983               /* Check that LHS does not have uses outside of STORE_BB.  */
7984               res = true;
7985               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7986                 {
7987                   gimple *use_stmt;
7988                   use_stmt = USE_STMT (use_p);
7989                   if (is_gimple_debug (use_stmt))
7990                     continue;
7991                   if (gimple_bb (use_stmt) != store_bb)
7992                     {
7993                       res = false;
7994                       break;
7995                     }
7996                 }
7997               if (!res)
7998                 break;
7999
8000               if (gimple_vuse (stmt1)
8001                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
8002                 break;
8003
8004               /* Can move STMT1 to STORE_BB.  */
8005               if (dump_enabled_p ())
8006                 {
8007                   dump_printf_loc (MSG_NOTE, vect_location,
8008                                    "Move stmt to created bb\n");
8009                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8010                 }
8011               gsi_move_before (&gsi_from, &gsi_to);
8012               /* Shift GSI_TO for further insertion.  */
8013               gsi_prev (&gsi_to);
8014             }
8015           /* Put other masked stores with the same mask to STORE_BB.  */
8016           if (worklist.is_empty ()
8017               || gimple_call_arg (worklist.last (), 2) != mask
8018               || worklist.last () != stmt1)
8019             break;
8020           last = worklist.pop ();
8021         }
8022       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8023     }
8024 }