gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56
  57 /* Loop Vectorization Pass.
  58
  59    This pass tries to vectorize loops.
  60
  61    For example, the vectorizer transforms the following simple loop:
  62
  63         short a[N]; short b[N]; short c[N]; int i;
  64
  65         for (i=0; i<N; i++){
  66           a[i] = b[i] + c[i];
  67         }
  68
  69    as if it was manually vectorized by rewriting the source code into:
  70
  71         typedef int __attribute__((mode(V8HI))) v8hi;
  72         short a[N];  short b[N]; short c[N];   int i;
  73         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  74         v8hi va, vb, vc;
  75
  76         for (i=0; i<N/8; i++){
  77           vb = pb[i];
  78           vc = pc[i];
  79           va = vb + vc;
  80           pa[i] = va;
  81         }
  82
  83         The main entry to this pass is vectorize_loops(), in which
  84    the vectorizer applies a set of analyses on a given set of loops,
  85    followed by the actual vectorization transformation for the loops that
  86    had successfully passed the analysis phase.
  87         Throughout this pass we make a distinction between two types of
  88    data: scalars (which are represented by SSA_NAMES), and memory references
  89    ("data-refs").  These two types of data require different handling both
  90    during analysis and transformation. The types of data-refs that the
  91    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  92    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  93    accesses are required to have a simple (consecutive) access pattern.
  94
  95    Analysis phase:
  96    ===============
  97         The driver for the analysis phase is vect_analyze_loop().
  98    It applies a set of analyses, some of which rely on the scalar evolution
  99    analyzer (scev) developed by Sebastian Pop.
 100
 101         During the analysis phase the vectorizer records some information
 102    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 103    loop, as well as general information about the loop as a whole, which is
 104    recorded in a "loop_vec_info" struct attached to each loop.
 105
 106    Transformation phase:
 107    =====================
 108         The loop transformation phase scans all the stmts in the loop, and
 109    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 110    the loop that needs to be vectorized.  It inserts the vector code sequence
 111    just before the scalar stmt S, and records a pointer to the vector code
 112    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 113    attached to S).  This pointer will be used for the vectorization of following
 114    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 115    otherwise, we rely on dead code elimination for removing it.
 116
 117         For example, say stmt S1 was vectorized into stmt VS1:
 118
 119    VS1: vb = px[i];
 120    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 121    S2:  a = b;
 122
 123    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 124    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 125    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 126    resulting sequence would be:
 127
 128    VS1: vb = px[i];
 129    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 130    VS2: va = vb;
 131    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 132
 133         Operands that are not SSA_NAMEs, are data-refs that appear in
 134    load/store operations (like 'x[i]' in S1), and are handled differently.
 135
 136    Target modeling:
 137    =================
 138         Currently the only target specific information that is used is the
 139    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 140    Targets that can support different sizes of vectors, for now will need
 141    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 142    flexibility will be added in the future.
 143
 144         Since we only vectorize operations which vector form can be
 145    expressed using existing tree codes, to verify that an operation is
 146    supported, the vectorizer checks the relevant optab at the relevant
 147    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 148    the value found is CODE_FOR_nothing, then there's no target support, and
 149    we can't vectorize the stmt.
 150
 151    For additional information on this project see:
 152    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 153 */
 154
 155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 156
 157 /* Function vect_determine_vectorization_factor
 158
 159    Determine the vectorization factor (VF).  VF is the number of data elements
 160    that are operated upon in parallel in a single iteration of the vectorized
 161    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 162    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 163    elements can fit in a single vector register.
 164
 165    We currently support vectorization of loops in which all types operated upon
 166    are of the same size.  Therefore this function currently sets VF according to
 167    the size of the types operated upon, and fails if there are multiple sizes
 168    in the loop.
 169
 170    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 171    original loop:
 172         for (i=0; i<N; i++){
 173           a[i] = b[i] + c[i];
 174         }
 175
 176    vectorized loop:
 177         for (i=0; i<N; i+=VF){
 178           a[i:VF] = b[i:VF] + c[i:VF];
 179         }
 180 */
 181
 182 static bool
 183 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 184 {
 185   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 186   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 187   unsigned nbbs = loop->num_nodes;
 188   poly_uint64 vectorization_factor = 1;
 189   tree scalar_type = NULL_TREE;
 190   gphi *phi;
 191   tree vectype;
 192   stmt_vec_info stmt_info;
 193   unsigned i;
 194   HOST_WIDE_INT dummy;
 195   gimple *stmt, *pattern_stmt = NULL;
 196   gimple_seq pattern_def_seq = NULL;
 197   gimple_stmt_iterator pattern_def_si = gsi_none ();
 198   bool analyze_pattern_stmt = false;
 199   bool bool_result;
 200   auto_vec<stmt_vec_info> mask_producers;
 201
 202   if (dump_enabled_p ())
 203     dump_printf_loc (MSG_NOTE, vect_location,
 204                      "=== vect_determine_vectorization_factor ===\n");
 205
 206   for (i = 0; i < nbbs; i++)
 207     {
 208       basic_block bb = bbs[i];
 209
 210       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 211            gsi_next (&si))
 212         {
 213           phi = si.phi ();
 214           stmt_info = vinfo_for_stmt (phi);
 215           if (dump_enabled_p ())
 216             {
 217               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 218               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 219             }
 220
 221           gcc_assert (stmt_info);
 222
 223           if (STMT_VINFO_RELEVANT_P (stmt_info)
 224               || STMT_VINFO_LIVE_P (stmt_info))
 225             {
 226               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 227               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 228
 229               if (dump_enabled_p ())
 230                 {
 231                   dump_printf_loc (MSG_NOTE, vect_location,
 232                                    "get vectype for scalar type:  ");
 233                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 234                   dump_printf (MSG_NOTE, "\n");
 235                 }
 236
 237               vectype = get_vectype_for_scalar_type (scalar_type);
 238               if (!vectype)
 239                 {
 240                   if (dump_enabled_p ())
 241                     {
 242                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 243                                        "not vectorized: unsupported "
 244                                        "data-type ");
 245                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 246                                          scalar_type);
 247                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 248                     }
 249                   return false;
 250                 }
 251               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 252
 253               if (dump_enabled_p ())
 254                 {
 255                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 256                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 257                   dump_printf (MSG_NOTE, "\n");
 258                 }
 259
 260               if (dump_enabled_p ())
 261                 dump_printf_loc (MSG_NOTE, vect_location,
 262                                  "nunits = " HOST_WIDE_INT_PRINT_DEC "\n",
 263                                  TYPE_VECTOR_SUBPARTS (vectype));
 264
 265               vect_update_max_nunits (&vectorization_factor, vectype);
 266             }
 267         }
 268
 269       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 270            !gsi_end_p (si) || analyze_pattern_stmt;)
 271         {
 272           tree vf_vectype;
 273
 274           if (analyze_pattern_stmt)
 275             stmt = pattern_stmt;
 276           else
 277             stmt = gsi_stmt (si);
 278
 279           stmt_info = vinfo_for_stmt (stmt);
 280
 281           if (dump_enabled_p ())
 282             {
 283               dump_printf_loc (MSG_NOTE, vect_location,
 284                                "==> examining statement: ");
 285               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 286             }
 287
 288           gcc_assert (stmt_info);
 289
 290           /* Skip stmts which do not need to be vectorized.  */
 291           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 292                && !STMT_VINFO_LIVE_P (stmt_info))
 293               || gimple_clobber_p (stmt))
 294             {
 295               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 296                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 297                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 298                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 299                 {
 300                   stmt = pattern_stmt;
 301                   stmt_info = vinfo_for_stmt (pattern_stmt);
 302                   if (dump_enabled_p ())
 303                     {
 304                       dump_printf_loc (MSG_NOTE, vect_location,
 305                                        "==> examining pattern statement: ");
 306                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 307                     }
 308                 }
 309               else
 310                 {
 311                   if (dump_enabled_p ())
 312                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 313                   gsi_next (&si);
 314                   continue;
 315                 }
 316             }
 317           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 318                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 319                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 320                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 321             analyze_pattern_stmt = true;
 322
 323           /* If a pattern statement has def stmts, analyze them too.  */
 324           if (is_pattern_stmt_p (stmt_info))
 325             {
 326               if (pattern_def_seq == NULL)
 327                 {
 328                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 329                   pattern_def_si = gsi_start (pattern_def_seq);
 330                 }
 331               else if (!gsi_end_p (pattern_def_si))
 332                 gsi_next (&pattern_def_si);
 333               if (pattern_def_seq != NULL)
 334                 {
 335                   gimple *pattern_def_stmt = NULL;
 336                   stmt_vec_info pattern_def_stmt_info = NULL;
 337
 338                   while (!gsi_end_p (pattern_def_si))
 339                     {
 340                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 341                       pattern_def_stmt_info
 342                         = vinfo_for_stmt (pattern_def_stmt);
 343                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 344                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 345                         break;
 346                       gsi_next (&pattern_def_si);
 347                     }
 348
 349                   if (!gsi_end_p (pattern_def_si))
 350                     {
 351                       if (dump_enabled_p ())
 352                         {
 353                           dump_printf_loc (MSG_NOTE, vect_location,
 354                                            "==> examining pattern def stmt: ");
 355                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 356                                             pattern_def_stmt, 0);
 357                         }
 358
 359                       stmt = pattern_def_stmt;
 360                       stmt_info = pattern_def_stmt_info;
 361                     }
 362                   else
 363                     {
 364                       pattern_def_si = gsi_none ();
 365                       analyze_pattern_stmt = false;
 366                     }
 367                 }
 368               else
 369                 analyze_pattern_stmt = false;
 370             }
 371
 372           if (gimple_get_lhs (stmt) == NULL_TREE
 373               /* MASK_STORE has no lhs, but is ok.  */
 374               && (!is_gimple_call (stmt)
 375                   || !gimple_call_internal_p (stmt)
 376                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 377             {
 378               if (is_gimple_call (stmt))
 379                 {
 380                   /* Ignore calls with no lhs.  These must be calls to
 381                      #pragma omp simd functions, and what vectorization factor
 382                      it really needs can't be determined until
 383                      vectorizable_simd_clone_call.  */
 384                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 385                     {
 386                       pattern_def_seq = NULL;
 387                       gsi_next (&si);
 388                     }
 389                   continue;
 390                 }
 391               if (dump_enabled_p ())
 392                 {
 393                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 394                                    "not vectorized: irregular stmt.");
 395                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 396                                     0);
 397                 }
 398               return false;
 399             }
 400
 401           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 402             {
 403               if (dump_enabled_p ())
 404                 {
 405                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 406                                    "not vectorized: vector stmt in loop:");
 407                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 408                 }
 409               return false;
 410             }
 411
 412           bool_result = false;
 413
 414           if (STMT_VINFO_VECTYPE (stmt_info))
 415             {
 416               /* The only case when a vectype had been already set is for stmts
 417                  that contain a dataref, or for "pattern-stmts" (stmts
 418                  generated by the vectorizer to represent/replace a certain
 419                  idiom).  */
 420               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 421                           || is_pattern_stmt_p (stmt_info)
 422                           || !gsi_end_p (pattern_def_si));
 423               vectype = STMT_VINFO_VECTYPE (stmt_info);
 424             }
 425           else
 426             {
 427               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 428               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 429                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 430               else
 431                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 432
 433               /* Bool ops don't participate in vectorization factor
 434                  computation.  For comparison use compared types to
 435                  compute a factor.  */
 436               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
 437                   && is_gimple_assign (stmt)
 438                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 439                 {
 440                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 441                       || STMT_VINFO_LIVE_P (stmt_info))
 442                     mask_producers.safe_push (stmt_info);
 443                   bool_result = true;
 444
 445                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 446                       == tcc_comparison
 447                       && !VECT_SCALAR_BOOLEAN_TYPE_P
 448                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 449                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 450                   else
 451                     {
 452                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 453                         {
 454                           pattern_def_seq = NULL;
 455                           gsi_next (&si);
 456                         }
 457                       continue;
 458                     }
 459                 }
 460
 461               if (dump_enabled_p ())
 462                 {
 463                   dump_printf_loc (MSG_NOTE, vect_location,
 464                                    "get vectype for scalar type:  ");
 465                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 466                   dump_printf (MSG_NOTE, "\n");
 467                 }
 468               vectype = get_vectype_for_scalar_type (scalar_type);
 469               if (!vectype)
 470                 {
 471                   if (dump_enabled_p ())
 472                     {
 473                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 474                                        "not vectorized: unsupported "
 475                                        "data-type ");
 476                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 477                                          scalar_type);
 478                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 479                     }
 480                   return false;
 481                 }
 482
 483               if (!bool_result)
 484                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 485
 486               if (dump_enabled_p ())
 487                 {
 488                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 489                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 490                   dump_printf (MSG_NOTE, "\n");
 491                 }
 492             }
 493
 494           /* Don't try to compute VF out scalar types if we stmt
 495              produces boolean vector.  Use result vectype instead.  */
 496           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 497             vf_vectype = vectype;
 498           else
 499             {
 500               /* The vectorization factor is according to the smallest
 501                  scalar type (or the largest vector size, but we only
 502                  support one vector size per loop).  */
 503               if (!bool_result)
 504                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 505                                                              &dummy);
 506               if (dump_enabled_p ())
 507                 {
 508                   dump_printf_loc (MSG_NOTE, vect_location,
 509                                    "get vectype for scalar type:  ");
 510                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 511                   dump_printf (MSG_NOTE, "\n");
 512                 }
 513               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 514             }
 515           if (!vf_vectype)
 516             {
 517               if (dump_enabled_p ())
 518                 {
 519                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 520                                    "not vectorized: unsupported data-type ");
 521                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 522                                      scalar_type);
 523                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 524                 }
 525               return false;
 526             }
 527
 528           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 529                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 530             {
 531               if (dump_enabled_p ())
 532                 {
 533                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 534                                    "not vectorized: different sized vector "
 535                                    "types in statement, ");
 536                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 537                                      vectype);
 538                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 539                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 540                                      vf_vectype);
 541                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 542                 }
 543               return false;
 544             }
 545
 546           if (dump_enabled_p ())
 547             {
 548               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 549               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 550               dump_printf (MSG_NOTE, "\n");
 551             }
 552
 553           if (dump_enabled_p ())
 554             dump_printf_loc (MSG_NOTE, vect_location,
 555                              "nunits = " HOST_WIDE_INT_PRINT_DEC "\n",
 556                              TYPE_VECTOR_SUBPARTS (vf_vectype));
 557
 558           vect_update_max_nunits (&vectorization_factor, vf_vectype);
 559
 560           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 561             {
 562               pattern_def_seq = NULL;
 563               gsi_next (&si);
 564             }
 565         }
 566     }
 567
 568   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 569   if (dump_enabled_p ())
 570     {
 571       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 572       dump_dec (MSG_NOTE, vectorization_factor);
 573       dump_printf (MSG_NOTE, "\n");
 574     }
 575
 576   if (known_le (vectorization_factor, 1U))
 577     {
 578       if (dump_enabled_p ())
 579         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 580                          "not vectorized: unsupported data-type\n");
 581       return false;
 582     }
 583   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 584
 585   for (i = 0; i < mask_producers.length (); i++)
 586     {
 587       tree mask_type = NULL;
 588
 589       stmt = STMT_VINFO_STMT (mask_producers[i]);
 590
 591       if (is_gimple_assign (stmt)
 592           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 593           && !VECT_SCALAR_BOOLEAN_TYPE_P
 594                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 595         {
 596           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 597           mask_type = get_mask_type_for_scalar_type (scalar_type);
 598
 599           if (!mask_type)
 600             {
 601               if (dump_enabled_p ())
 602                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 603                                  "not vectorized: unsupported mask\n");
 604               return false;
 605             }
 606         }
 607       else
 608         {
 609           tree rhs;
 610           ssa_op_iter iter;
 611           gimple *def_stmt;
 612           enum vect_def_type dt;
 613
 614           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 615             {
 616               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 617                                        &def_stmt, &dt, &vectype))
 618                 {
 619                   if (dump_enabled_p ())
 620                     {
 621                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 622                                        "not vectorized: can't compute mask type "
 623                                        "for statement, ");
 624                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 625                                         0);
 626                     }
 627                   return false;
 628                 }
 629
 630               /* No vectype probably means external definition.
 631                  Allow it in case there is another operand which
 632                  allows to determine mask type.  */
 633               if (!vectype)
 634                 continue;
 635
 636               if (!mask_type)
 637                 mask_type = vectype;
 638               else if (TYPE_VECTOR_SUBPARTS (mask_type)
 639                        != TYPE_VECTOR_SUBPARTS (vectype))
 640                 {
 641                   if (dump_enabled_p ())
 642                     {
 643                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 644                                        "not vectorized: different sized masks "
 645                                        "types in statement, ");
 646                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 647                                          mask_type);
 648                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 649                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 650                                          vectype);
 651                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 652                     }
 653                   return false;
 654                 }
 655               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 656                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 657                 {
 658                   if (dump_enabled_p ())
 659                     {
 660                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 661                                        "not vectorized: mixed mask and "
 662                                        "nonmask vector types in statement, ");
 663                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 664                                          mask_type);
 665                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 666                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 667                                          vectype);
 668                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 669                     }
 670                   return false;
 671                 }
 672             }
 673
 674           /* We may compare boolean value loaded as vector of integers.
 675              Fix mask_type in such case.  */
 676           if (mask_type
 677               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 678               && gimple_code (stmt) == GIMPLE_ASSIGN
 679               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 680             mask_type = build_same_sized_truth_vector_type (mask_type);
 681         }
 682
 683       /* No mask_type should mean loop invariant predicate.
 684          This is probably a subject for optimization in
 685          if-conversion.  */
 686       if (!mask_type)
 687         {
 688           if (dump_enabled_p ())
 689             {
 690               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 691                                "not vectorized: can't compute mask type "
 692                                "for statement, ");
 693               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 694                                 0);
 695             }
 696           return false;
 697         }
 698
 699       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 700     }
 701
 702   return true;
 703 }
 704
 705
 706 /* Function vect_is_simple_iv_evolution.
 707
 708    FORNOW: A simple evolution of an induction variables in the loop is
 709    considered a polynomial evolution.  */
 710
 711 static bool
 712 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 713                              tree * step)
 714 {
 715   tree init_expr;
 716   tree step_expr;
 717   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 718   basic_block bb;
 719
 720   /* When there is no evolution in this loop, the evolution function
 721      is not "simple".  */
 722   if (evolution_part == NULL_TREE)
 723     return false;
 724
 725   /* When the evolution is a polynomial of degree >= 2
 726      the evolution function is not "simple".  */
 727   if (tree_is_chrec (evolution_part))
 728     return false;
 729
 730   step_expr = evolution_part;
 731   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 732
 733   if (dump_enabled_p ())
 734     {
 735       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 736       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 737       dump_printf (MSG_NOTE, ",  init: ");
 738       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 739       dump_printf (MSG_NOTE, "\n");
 740     }
 741
 742   *init = init_expr;
 743   *step = step_expr;
 744
 745   if (TREE_CODE (step_expr) != INTEGER_CST
 746       && (TREE_CODE (step_expr) != SSA_NAME
 747           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 748               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 749           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 750               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 751                   || !flag_associative_math)))
 752       && (TREE_CODE (step_expr) != REAL_CST
 753           || !flag_associative_math))
 754     {
 755       if (dump_enabled_p ())
 756         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 757                          "step unknown.\n");
 758       return false;
 759     }
 760
 761   return true;
 762 }
 763
 764 /* Function vect_analyze_scalar_cycles_1.
 765
 766    Examine the cross iteration def-use cycles of scalar variables
 767    in LOOP.  LOOP_VINFO represents the loop that is now being
 768    considered for vectorization (can be LOOP, or an outer-loop
 769    enclosing LOOP).  */
 770
 771 static void
 772 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 773 {
 774   basic_block bb = loop->header;
 775   tree init, step;
 776   auto_vec<gimple *, 64> worklist;
 777   gphi_iterator gsi;
 778   bool double_reduc;
 779
 780   if (dump_enabled_p ())
 781     dump_printf_loc (MSG_NOTE, vect_location,
 782                      "=== vect_analyze_scalar_cycles ===\n");
 783
 784   /* First - identify all inductions.  Reduction detection assumes that all the
 785      inductions have been identified, therefore, this order must not be
 786      changed.  */
 787   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 788     {
 789       gphi *phi = gsi.phi ();
 790       tree access_fn = NULL;
 791       tree def = PHI_RESULT (phi);
 792       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 793
 794       if (dump_enabled_p ())
 795         {
 796           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 797           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 798         }
 799
 800       /* Skip virtual phi's.  The data dependences that are associated with
 801          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 802       if (virtual_operand_p (def))
 803         continue;
 804
 805       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 806
 807       /* Analyze the evolution function.  */
 808       access_fn = analyze_scalar_evolution (loop, def);
 809       if (access_fn)
 810         {
 811           STRIP_NOPS (access_fn);
 812           if (dump_enabled_p ())
 813             {
 814               dump_printf_loc (MSG_NOTE, vect_location,
 815                                "Access function of PHI: ");
 816               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 817               dump_printf (MSG_NOTE, "\n");
 818             }
 819           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 820             = initial_condition_in_loop_num (access_fn, loop->num);
 821           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 822             = evolution_part_in_loop_num (access_fn, loop->num);
 823         }
 824
 825       if (!access_fn
 826           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 827           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 828               && TREE_CODE (step) != INTEGER_CST))
 829         {
 830           worklist.safe_push (phi);
 831           continue;
 832         }
 833
 834       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 835                   != NULL_TREE);
 836       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 837
 838       if (dump_enabled_p ())
 839         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 840       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 841     }
 842
 843
 844   /* Second - identify all reductions and nested cycles.  */
 845   while (worklist.length () > 0)
 846     {
 847       gimple *phi = worklist.pop ();
 848       tree def = PHI_RESULT (phi);
 849       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 850       gimple *reduc_stmt;
 851
 852       if (dump_enabled_p ())
 853         {
 854           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 855           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 856         }
 857
 858       gcc_assert (!virtual_operand_p (def)
 859                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 860
 861       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 862                                                 &double_reduc, false);
 863       if (reduc_stmt)
 864         {
 865           if (double_reduc)
 866             {
 867               if (dump_enabled_p ())
 868                 dump_printf_loc (MSG_NOTE, vect_location,
 869                                  "Detected double reduction.\n");
 870
 871               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 872               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 873                                                     vect_double_reduction_def;
 874             }
 875           else
 876             {
 877               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 878                 {
 879                   if (dump_enabled_p ())
 880                     dump_printf_loc (MSG_NOTE, vect_location,
 881                                      "Detected vectorizable nested cycle.\n");
 882
 883                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 884                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 885                                                              vect_nested_cycle;
 886                 }
 887               else
 888                 {
 889                   if (dump_enabled_p ())
 890                     dump_printf_loc (MSG_NOTE, vect_location,
 891                                      "Detected reduction.\n");
 892
 893                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 894                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 895                                                            vect_reduction_def;
 896                   /* Store the reduction cycles for possible vectorization in
 897                      loop-aware SLP if it was not detected as reduction
 898                      chain.  */
 899                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 900                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 901                 }
 902             }
 903         }
 904       else
 905         if (dump_enabled_p ())
 906           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 907                            "Unknown def-use cycle pattern.\n");
 908     }
 909 }
 910
 911
 912 /* Function vect_analyze_scalar_cycles.
 913
 914    Examine the cross iteration def-use cycles of scalar variables, by
 915    analyzing the loop-header PHIs of scalar variables.  Classify each
 916    cycle as one of the following: invariant, induction, reduction, unknown.
 917    We do that for the loop represented by LOOP_VINFO, and also to its
 918    inner-loop, if exists.
 919    Examples for scalar cycles:
 920
 921    Example1: reduction:
 922
 923               loop1:
 924               for (i=0; i<N; i++)
 925                  sum += a[i];
 926
 927    Example2: induction:
 928
 929               loop2:
 930               for (i=0; i<N; i++)
 931                  a[i] = i;  */
 932
 933 static void
 934 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 935 {
 936   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 937
 938   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 939
 940   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 941      Reductions in such inner-loop therefore have different properties than
 942      the reductions in the nest that gets vectorized:
 943      1. When vectorized, they are executed in the same order as in the original
 944         scalar loop, so we can't change the order of computation when
 945         vectorizing them.
 946      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 947         current checks are too strict.  */
 948
 949   if (loop->inner)
 950     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 951 }
 952
 953 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 954
 955 static void
 956 vect_fixup_reduc_chain (gimple *stmt)
 957 {
 958   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 959   gimple *stmtp;
 960   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 961               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 962   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 963   do
 964     {
 965       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 966       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 967       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 968       if (stmt)
 969         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 970           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 971     }
 972   while (stmt);
 973   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 974 }
 975
 976 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 977
 978 static void
 979 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 980 {
 981   gimple *first;
 982   unsigned i;
 983
 984   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 985     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 986       {
 987         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 988         while (next)
 989           {
 990             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 991               break;
 992             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 993           }
 994         /* If not all stmt in the chain are patterns try to handle
 995            the chain without patterns.  */
 996         if (! next)
 997           {
 998             vect_fixup_reduc_chain (first);
 999             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1000               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1001           }
1002       }
1003 }
1004
1005 /* Function vect_get_loop_niters.
1006
1007    Determine how many iterations the loop is executed and place it
1008    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1009    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1010    niter information holds in ASSUMPTIONS.
1011
1012    Return the loop exit condition.  */
1013
1014
1015 static gcond *
1016 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1017                       tree *number_of_iterations, tree *number_of_iterationsm1)
1018 {
1019   edge exit = single_exit (loop);
1020   struct tree_niter_desc niter_desc;
1021   tree niter_assumptions, niter, may_be_zero;
1022   gcond *cond = get_loop_exit_condition (loop);
1023
1024   *assumptions = boolean_true_node;
1025   *number_of_iterationsm1 = chrec_dont_know;
1026   *number_of_iterations = chrec_dont_know;
1027   if (dump_enabled_p ())
1028     dump_printf_loc (MSG_NOTE, vect_location,
1029                      "=== get_loop_niters ===\n");
1030
1031   if (!exit)
1032     return cond;
1033
1034   niter = chrec_dont_know;
1035   may_be_zero = NULL_TREE;
1036   niter_assumptions = boolean_true_node;
1037   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1038       || chrec_contains_undetermined (niter_desc.niter))
1039     return cond;
1040
1041   niter_assumptions = niter_desc.assumptions;
1042   may_be_zero = niter_desc.may_be_zero;
1043   niter = niter_desc.niter;
1044
1045   if (may_be_zero && integer_zerop (may_be_zero))
1046     may_be_zero = NULL_TREE;
1047
1048   if (may_be_zero)
1049     {
1050       if (COMPARISON_CLASS_P (may_be_zero))
1051         {
1052           /* Try to combine may_be_zero with assumptions, this can simplify
1053              computation of niter expression.  */
1054           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1055             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1056                                              niter_assumptions,
1057                                              fold_build1 (TRUTH_NOT_EXPR,
1058                                                           boolean_type_node,
1059                                                           may_be_zero));
1060           else
1061             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1062                                  build_int_cst (TREE_TYPE (niter), 0), niter);
1063
1064           may_be_zero = NULL_TREE;
1065         }
1066       else if (integer_nonzerop (may_be_zero))
1067         {
1068           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1069           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1070           return cond;
1071         }
1072       else
1073         return cond;
1074     }
1075
1076   *assumptions = niter_assumptions;
1077   *number_of_iterationsm1 = niter;
1078
1079   /* We want the number of loop header executions which is the number
1080      of latch executions plus one.
1081      ???  For UINT_MAX latch executions this number overflows to zero
1082      for loops like do { n++; } while (n != 0);  */
1083   if (niter && !chrec_contains_undetermined (niter))
1084     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1085                           build_int_cst (TREE_TYPE (niter), 1));
1086   *number_of_iterations = niter;
1087
1088   return cond;
1089 }
1090
1091 /* Function bb_in_loop_p
1092
1093    Used as predicate for dfs order traversal of the loop bbs.  */
1094
1095 static bool
1096 bb_in_loop_p (const_basic_block bb, const void *data)
1097 {
1098   const struct loop *const loop = (const struct loop *)data;
1099   if (flow_bb_inside_loop_p (loop, bb))
1100     return true;
1101   return false;
1102 }
1103
1104
1105 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1106    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1107
1108 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1109   : vec_info (vec_info::loop, init_cost (loop_in)),
1110     loop (loop_in),
1111     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1112     num_itersm1 (NULL_TREE),
1113     num_iters (NULL_TREE),
1114     num_iters_unchanged (NULL_TREE),
1115     num_iters_assumptions (NULL_TREE),
1116     th (0),
1117     versioning_threshold (0),
1118     vectorization_factor (0),
1119     max_vectorization_factor (0),
1120     unaligned_dr (NULL),
1121     peeling_for_alignment (0),
1122     ptr_mask (0),
1123     slp_unrolling_factor (1),
1124     single_scalar_iteration_cost (0),
1125     vectorizable (false),
1126     peeling_for_gaps (false),
1127     peeling_for_niter (false),
1128     operands_swapped (false),
1129     no_data_dependencies (false),
1130     has_mask_store (false),
1131     scalar_loop (NULL),
1132     orig_loop_info (NULL)
1133 {
1134   /* Create/Update stmt_info for all stmts in the loop.  */
1135   basic_block *body = get_loop_body (loop);
1136   for (unsigned int i = 0; i < loop->num_nodes; i++)
1137     {
1138       basic_block bb = body[i];
1139       gimple_stmt_iterator si;
1140
1141       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1142         {
1143           gimple *phi = gsi_stmt (si);
1144           gimple_set_uid (phi, 0);
1145           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1146         }
1147
1148       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1149         {
1150           gimple *stmt = gsi_stmt (si);
1151           gimple_set_uid (stmt, 0);
1152           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1153         }
1154     }
1155   free (body);
1156
1157   /* CHECKME: We want to visit all BBs before their successors (except for
1158      latch blocks, for which this assertion wouldn't hold).  In the simple
1159      case of the loop forms we allow, a dfs order of the BBs would the same
1160      as reversed postorder traversal, so we are safe.  */
1161
1162   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1163                                           bbs, loop->num_nodes, loop);
1164   gcc_assert (nbbs == loop->num_nodes);
1165 }
1166
1167
1168 /* Free all memory used by the _loop_vec_info, as well as all the
1169    stmt_vec_info structs of all the stmts in the loop.  */
1170
1171 _loop_vec_info::~_loop_vec_info ()
1172 {
1173   int nbbs;
1174   gimple_stmt_iterator si;
1175   int j;
1176
1177   nbbs = loop->num_nodes;
1178   for (j = 0; j < nbbs; j++)
1179     {
1180       basic_block bb = bbs[j];
1181       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1182         free_stmt_vec_info (gsi_stmt (si));
1183
1184       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1185         {
1186           gimple *stmt = gsi_stmt (si);
1187
1188           /* We may have broken canonical form by moving a constant
1189              into RHS1 of a commutative op.  Fix such occurrences.  */
1190           if (operands_swapped && is_gimple_assign (stmt))
1191             {
1192               enum tree_code code = gimple_assign_rhs_code (stmt);
1193
1194               if ((code == PLUS_EXPR
1195                    || code == POINTER_PLUS_EXPR
1196                    || code == MULT_EXPR)
1197                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1198                 swap_ssa_operands (stmt,
1199                                    gimple_assign_rhs1_ptr (stmt),
1200                                    gimple_assign_rhs2_ptr (stmt));
1201               else if (code == COND_EXPR
1202                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1203                 {
1204                   tree cond_expr = gimple_assign_rhs1 (stmt);
1205                   enum tree_code cond_code = TREE_CODE (cond_expr);
1206
1207                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1208                     {
1209                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1210                                                                   0));
1211                       cond_code = invert_tree_comparison (cond_code,
1212                                                           honor_nans);
1213                       if (cond_code != ERROR_MARK)
1214                         {
1215                           TREE_SET_CODE (cond_expr, cond_code);
1216                           swap_ssa_operands (stmt,
1217                                              gimple_assign_rhs2_ptr (stmt),
1218                                              gimple_assign_rhs3_ptr (stmt));
1219                         }
1220                     }
1221                 }
1222             }
1223
1224           /* Free stmt_vec_info.  */
1225           free_stmt_vec_info (stmt);
1226           gsi_next (&si);
1227         }
1228     }
1229
1230   free (bbs);
1231
1232   loop->aux = NULL;
1233 }
1234
1235
1236 /* Calculate the cost of one scalar iteration of the loop.  */
1237 static void
1238 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1239 {
1240   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1241   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1242   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1243   int innerloop_iters, i;
1244
1245   /* Count statements in scalar loop.  Using this as scalar cost for a single
1246      iteration for now.
1247
1248      TODO: Add outer loop support.
1249
1250      TODO: Consider assigning different costs to different scalar
1251      statements.  */
1252
1253   /* FORNOW.  */
1254   innerloop_iters = 1;
1255   if (loop->inner)
1256     innerloop_iters = 50; /* FIXME */
1257
1258   for (i = 0; i < nbbs; i++)
1259     {
1260       gimple_stmt_iterator si;
1261       basic_block bb = bbs[i];
1262
1263       if (bb->loop_father == loop->inner)
1264         factor = innerloop_iters;
1265       else
1266         factor = 1;
1267
1268       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1269         {
1270           gimple *stmt = gsi_stmt (si);
1271           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1272
1273           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1274             continue;
1275
1276           /* Skip stmts that are not vectorized inside the loop.  */
1277           if (stmt_info
1278               && !STMT_VINFO_RELEVANT_P (stmt_info)
1279               && (!STMT_VINFO_LIVE_P (stmt_info)
1280                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1281               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1282             continue;
1283
1284           vect_cost_for_stmt kind;
1285           if (STMT_VINFO_DATA_REF (stmt_info))
1286             {
1287               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1288                kind = scalar_load;
1289              else
1290                kind = scalar_store;
1291             }
1292           else
1293             kind = scalar_stmt;
1294
1295           scalar_single_iter_cost
1296             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1297                                  factor, kind, stmt_info, 0, vect_prologue);
1298         }
1299     }
1300   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1301     = scalar_single_iter_cost;
1302 }
1303
1304
1305 /* Function vect_analyze_loop_form_1.
1306
1307    Verify that certain CFG restrictions hold, including:
1308    - the loop has a pre-header
1309    - the loop has a single entry and exit
1310    - the loop exit condition is simple enough
1311    - the number of iterations can be analyzed, i.e, a countable loop.  The
1312      niter could be analyzed under some assumptions.  */
1313
1314 bool
1315 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1316                           tree *assumptions, tree *number_of_iterationsm1,
1317                           tree *number_of_iterations, gcond **inner_loop_cond)
1318 {
1319   if (dump_enabled_p ())
1320     dump_printf_loc (MSG_NOTE, vect_location,
1321                      "=== vect_analyze_loop_form ===\n");
1322
1323   /* Different restrictions apply when we are considering an inner-most loop,
1324      vs. an outer (nested) loop.
1325      (FORNOW. May want to relax some of these restrictions in the future).  */
1326
1327   if (!loop->inner)
1328     {
1329       /* Inner-most loop.  We currently require that the number of BBs is
1330          exactly 2 (the header and latch).  Vectorizable inner-most loops
1331          look like this:
1332
1333                         (pre-header)
1334                            |
1335                           header <--------+
1336                            | |            |
1337                            | +--> latch --+
1338                            |
1339                         (exit-bb)  */
1340
1341       if (loop->num_nodes != 2)
1342         {
1343           if (dump_enabled_p ())
1344             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1345                              "not vectorized: control flow in loop.\n");
1346           return false;
1347         }
1348
1349       if (empty_block_p (loop->header))
1350         {
1351           if (dump_enabled_p ())
1352             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1353                              "not vectorized: empty loop.\n");
1354           return false;
1355         }
1356     }
1357   else
1358     {
1359       struct loop *innerloop = loop->inner;
1360       edge entryedge;
1361
1362       /* Nested loop. We currently require that the loop is doubly-nested,
1363          contains a single inner loop, and the number of BBs is exactly 5.
1364          Vectorizable outer-loops look like this:
1365
1366                         (pre-header)
1367                            |
1368                           header <---+
1369                            |         |
1370                           inner-loop |
1371                            |         |
1372                           tail ------+
1373                            |
1374                         (exit-bb)
1375
1376          The inner-loop has the properties expected of inner-most loops
1377          as described above.  */
1378
1379       if ((loop->inner)->inner || (loop->inner)->next)
1380         {
1381           if (dump_enabled_p ())
1382             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1383                              "not vectorized: multiple nested loops.\n");
1384           return false;
1385         }
1386
1387       if (loop->num_nodes != 5)
1388         {
1389           if (dump_enabled_p ())
1390             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1391                              "not vectorized: control flow in loop.\n");
1392           return false;
1393         }
1394
1395       entryedge = loop_preheader_edge (innerloop);
1396       if (entryedge->src != loop->header
1397           || !single_exit (innerloop)
1398           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1399         {
1400           if (dump_enabled_p ())
1401             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1402                              "not vectorized: unsupported outerloop form.\n");
1403           return false;
1404         }
1405
1406       /* Analyze the inner-loop.  */
1407       tree inner_niterm1, inner_niter, inner_assumptions;
1408       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1409                                       &inner_assumptions, &inner_niterm1,
1410                                       &inner_niter, NULL)
1411           /* Don't support analyzing niter under assumptions for inner
1412              loop.  */
1413           || !integer_onep (inner_assumptions))
1414         {
1415           if (dump_enabled_p ())
1416             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1417                              "not vectorized: Bad inner loop.\n");
1418           return false;
1419         }
1420
1421       if (!expr_invariant_in_loop_p (loop, inner_niter))
1422         {
1423           if (dump_enabled_p ())
1424             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1425                              "not vectorized: inner-loop count not"
1426                              " invariant.\n");
1427           return false;
1428         }
1429
1430       if (dump_enabled_p ())
1431         dump_printf_loc (MSG_NOTE, vect_location,
1432                          "Considering outer-loop vectorization.\n");
1433     }
1434
1435   if (!single_exit (loop)
1436       || EDGE_COUNT (loop->header->preds) != 2)
1437     {
1438       if (dump_enabled_p ())
1439         {
1440           if (!single_exit (loop))
1441             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1442                              "not vectorized: multiple exits.\n");
1443           else if (EDGE_COUNT (loop->header->preds) != 2)
1444             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1445                              "not vectorized: too many incoming edges.\n");
1446         }
1447       return false;
1448     }
1449
1450   /* We assume that the loop exit condition is at the end of the loop. i.e,
1451      that the loop is represented as a do-while (with a proper if-guard
1452      before the loop if needed), where the loop header contains all the
1453      executable statements, and the latch is empty.  */
1454   if (!empty_block_p (loop->latch)
1455       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1456     {
1457       if (dump_enabled_p ())
1458         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1459                          "not vectorized: latch block not empty.\n");
1460       return false;
1461     }
1462
1463   /* Make sure the exit is not abnormal.  */
1464   edge e = single_exit (loop);
1465   if (e->flags & EDGE_ABNORMAL)
1466     {
1467       if (dump_enabled_p ())
1468         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1469                          "not vectorized: abnormal loop exit edge.\n");
1470       return false;
1471     }
1472
1473   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1474                                      number_of_iterationsm1);
1475   if (!*loop_cond)
1476     {
1477       if (dump_enabled_p ())
1478         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1479                          "not vectorized: complicated exit condition.\n");
1480       return false;
1481     }
1482
1483   if (integer_zerop (*assumptions)
1484       || !*number_of_iterations
1485       || chrec_contains_undetermined (*number_of_iterations))
1486     {
1487       if (dump_enabled_p ())
1488         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1489                          "not vectorized: number of iterations cannot be "
1490                          "computed.\n");
1491       return false;
1492     }
1493
1494   if (integer_zerop (*number_of_iterations))
1495     {
1496       if (dump_enabled_p ())
1497         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1498                          "not vectorized: number of iterations = 0.\n");
1499       return false;
1500     }
1501
1502   return true;
1503 }
1504
1505 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1506
1507 loop_vec_info
1508 vect_analyze_loop_form (struct loop *loop)
1509 {
1510   tree assumptions, number_of_iterations, number_of_iterationsm1;
1511   gcond *loop_cond, *inner_loop_cond = NULL;
1512
1513   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1514                                   &assumptions, &number_of_iterationsm1,
1515                                   &number_of_iterations, &inner_loop_cond))
1516     return NULL;
1517
1518   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1519   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1520   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1521   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1522   if (!integer_onep (assumptions))
1523     {
1524       /* We consider to vectorize this loop by versioning it under
1525          some assumptions.  In order to do this, we need to clear
1526          existing information computed by scev and niter analyzer.  */
1527       scev_reset_htab ();
1528       free_numbers_of_iterations_estimates (loop);
1529       /* Also set flag for this loop so that following scev and niter
1530          analysis are done under the assumptions.  */
1531       loop_constraint_set (loop, LOOP_C_FINITE);
1532       /* Also record the assumptions for versioning.  */
1533       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1534     }
1535
1536   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1537     {
1538       if (dump_enabled_p ())
1539         {
1540           dump_printf_loc (MSG_NOTE, vect_location,
1541                            "Symbolic number of iterations is ");
1542           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1543           dump_printf (MSG_NOTE, "\n");
1544         }
1545     }
1546
1547   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1548   if (inner_loop_cond)
1549     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1550       = loop_exit_ctrl_vec_info_type;
1551
1552   gcc_assert (!loop->aux);
1553   loop->aux = loop_vinfo;
1554   return loop_vinfo;
1555 }
1556
1557
1558
1559 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1560    statements update the vectorization factor.  */
1561
1562 static void
1563 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1564 {
1565   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1566   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1567   int nbbs = loop->num_nodes;
1568   poly_uint64 vectorization_factor;
1569   int i;
1570
1571   if (dump_enabled_p ())
1572     dump_printf_loc (MSG_NOTE, vect_location,
1573                      "=== vect_update_vf_for_slp ===\n");
1574
1575   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1576   gcc_assert (known_ne (vectorization_factor, 0U));
1577
1578   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1579      vectorization factor of the loop is the unrolling factor required by
1580      the SLP instances.  If that unrolling factor is 1, we say, that we
1581      perform pure SLP on loop - cross iteration parallelism is not
1582      exploited.  */
1583   bool only_slp_in_loop = true;
1584   for (i = 0; i < nbbs; i++)
1585     {
1586       basic_block bb = bbs[i];
1587       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1588            gsi_next (&si))
1589         {
1590           gimple *stmt = gsi_stmt (si);
1591           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1592           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1593               && STMT_VINFO_RELATED_STMT (stmt_info))
1594             {
1595               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1596               stmt_info = vinfo_for_stmt (stmt);
1597             }
1598           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1599                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1600               && !PURE_SLP_STMT (stmt_info))
1601             /* STMT needs both SLP and loop-based vectorization.  */
1602             only_slp_in_loop = false;
1603         }
1604     }
1605
1606   if (only_slp_in_loop)
1607     {
1608       dump_printf_loc (MSG_NOTE, vect_location,
1609                        "Loop contains only SLP stmts\n");
1610       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1611     }
1612   else
1613     {
1614       dump_printf_loc (MSG_NOTE, vect_location,
1615                        "Loop contains SLP and non-SLP stmts\n");
1616       /* Both the vectorization factor and unroll factor have the form
1617          current_vector_size * X for some rational X, so they must have
1618          a common multiple.  */
1619       vectorization_factor
1620         = force_common_multiple (vectorization_factor,
1621                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1622     }
1623
1624   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1625   if (dump_enabled_p ())
1626     {
1627       dump_printf_loc (MSG_NOTE, vect_location,
1628                        "Updating vectorization factor to ");
1629       dump_dec (MSG_NOTE, vectorization_factor);
1630       dump_printf (MSG_NOTE, ".\n");
1631     }
1632 }
1633
1634 /* Function vect_analyze_loop_operations.
1635
1636    Scan the loop stmts and make sure they are all vectorizable.  */
1637
1638 static bool
1639 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1640 {
1641   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1642   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1643   int nbbs = loop->num_nodes;
1644   int i;
1645   stmt_vec_info stmt_info;
1646   bool need_to_vectorize = false;
1647   bool ok;
1648
1649   if (dump_enabled_p ())
1650     dump_printf_loc (MSG_NOTE, vect_location,
1651                      "=== vect_analyze_loop_operations ===\n");
1652
1653   for (i = 0; i < nbbs; i++)
1654     {
1655       basic_block bb = bbs[i];
1656
1657       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1658            gsi_next (&si))
1659         {
1660           gphi *phi = si.phi ();
1661           ok = true;
1662
1663           stmt_info = vinfo_for_stmt (phi);
1664           if (dump_enabled_p ())
1665             {
1666               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1667               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1668             }
1669           if (virtual_operand_p (gimple_phi_result (phi)))
1670             continue;
1671
1672           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1673              (i.e., a phi in the tail of the outer-loop).  */
1674           if (! is_loop_header_bb_p (bb))
1675             {
1676               /* FORNOW: we currently don't support the case that these phis
1677                  are not used in the outerloop (unless it is double reduction,
1678                  i.e., this phi is vect_reduction_def), cause this case
1679                  requires to actually do something here.  */
1680               if (STMT_VINFO_LIVE_P (stmt_info)
1681                   && STMT_VINFO_DEF_TYPE (stmt_info)
1682                      != vect_double_reduction_def)
1683                 {
1684                   if (dump_enabled_p ())
1685                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686                                      "Unsupported loop-closed phi in "
1687                                      "outer-loop.\n");
1688                   return false;
1689                 }
1690
1691               /* If PHI is used in the outer loop, we check that its operand
1692                  is defined in the inner loop.  */
1693               if (STMT_VINFO_RELEVANT_P (stmt_info))
1694                 {
1695                   tree phi_op;
1696                   gimple *op_def_stmt;
1697
1698                   if (gimple_phi_num_args (phi) != 1)
1699                     return false;
1700
1701                   phi_op = PHI_ARG_DEF (phi, 0);
1702                   if (TREE_CODE (phi_op) != SSA_NAME)
1703                     return false;
1704
1705                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1706                   if (gimple_nop_p (op_def_stmt)
1707                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1708                       || !vinfo_for_stmt (op_def_stmt))
1709                     return false;
1710
1711                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1712                         != vect_used_in_outer
1713                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1714                            != vect_used_in_outer_by_reduction)
1715                     return false;
1716                 }
1717
1718               continue;
1719             }
1720
1721           gcc_assert (stmt_info);
1722
1723           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1724                || STMT_VINFO_LIVE_P (stmt_info))
1725               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1726             {
1727               /* A scalar-dependence cycle that we don't support.  */
1728               if (dump_enabled_p ())
1729                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730                                  "not vectorized: scalar dependence cycle.\n");
1731               return false;
1732             }
1733
1734           if (STMT_VINFO_RELEVANT_P (stmt_info))
1735             {
1736               need_to_vectorize = true;
1737               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1738                   && ! PURE_SLP_STMT (stmt_info))
1739                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1740               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1741                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1742                        && ! PURE_SLP_STMT (stmt_info))
1743                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1744             }
1745
1746           if (ok && STMT_VINFO_LIVE_P (stmt_info))
1747             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1748
1749           if (!ok)
1750             {
1751               if (dump_enabled_p ())
1752                 {
1753                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1754                                    "not vectorized: relevant phi not "
1755                                    "supported: ");
1756                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1757                 }
1758               return false;
1759             }
1760         }
1761
1762       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1763            gsi_next (&si))
1764         {
1765           gimple *stmt = gsi_stmt (si);
1766           if (!gimple_clobber_p (stmt)
1767               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1768             return false;
1769         }
1770     } /* bbs */
1771
1772   /* All operations in the loop are either irrelevant (deal with loop
1773      control, or dead), or only used outside the loop and can be moved
1774      out of the loop (e.g. invariants, inductions).  The loop can be
1775      optimized away by scalar optimizations.  We're better off not
1776      touching this loop.  */
1777   if (!need_to_vectorize)
1778     {
1779       if (dump_enabled_p ())
1780         dump_printf_loc (MSG_NOTE, vect_location,
1781                          "All the computation can be taken out of the loop.\n");
1782       if (dump_enabled_p ())
1783         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1784                          "not vectorized: redundant loop. no profit to "
1785                          "vectorize.\n");
1786       return false;
1787     }
1788
1789   return true;
1790 }
1791
1792
1793 /* Function vect_analyze_loop_2.
1794
1795    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1796    for it.  The different analyses will record information in the
1797    loop_vec_info struct.  */
1798 static bool
1799 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1800 {
1801   bool ok;
1802   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
1803   poly_uint64 min_vf = 2;
1804   unsigned int n_stmts = 0;
1805
1806   /* The first group of checks is independent of the vector size.  */
1807   fatal = true;
1808
1809   /* Find all data references in the loop (which correspond to vdefs/vuses)
1810      and analyze their evolution in the loop.  */
1811
1812   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1813
1814   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1815   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1816     {
1817       if (dump_enabled_p ())
1818         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1819                          "not vectorized: loop nest containing two "
1820                          "or more consecutive inner loops cannot be "
1821                          "vectorized\n");
1822       return false;
1823     }
1824
1825   for (unsigned i = 0; i < loop->num_nodes; i++)
1826     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1827          !gsi_end_p (gsi); gsi_next (&gsi))
1828       {
1829         gimple *stmt = gsi_stmt (gsi);
1830         if (is_gimple_debug (stmt))
1831           continue;
1832         ++n_stmts;
1833         if (!find_data_references_in_stmt (loop, stmt,
1834                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1835           {
1836             if (is_gimple_call (stmt) && loop->safelen)
1837               {
1838                 tree fndecl = gimple_call_fndecl (stmt), op;
1839                 if (fndecl != NULL_TREE)
1840                   {
1841                     cgraph_node *node = cgraph_node::get (fndecl);
1842                     if (node != NULL && node->simd_clones != NULL)
1843                       {
1844                         unsigned int j, n = gimple_call_num_args (stmt);
1845                         for (j = 0; j < n; j++)
1846                           {
1847                             op = gimple_call_arg (stmt, j);
1848                             if (DECL_P (op)
1849                                 || (REFERENCE_CLASS_P (op)
1850                                     && get_base_address (op)))
1851                               break;
1852                           }
1853                         op = gimple_call_lhs (stmt);
1854                         /* Ignore #pragma omp declare simd functions
1855                            if they don't have data references in the
1856                            call stmt itself.  */
1857                         if (j == n
1858                             && !(op
1859                                  && (DECL_P (op)
1860                                      || (REFERENCE_CLASS_P (op)
1861                                          && get_base_address (op)))))
1862                           continue;
1863                       }
1864                   }
1865               }
1866             if (dump_enabled_p ())
1867               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1868                                "not vectorized: loop contains function "
1869                                "calls or data references that cannot "
1870                                "be analyzed\n");
1871             return false;
1872           }
1873       }
1874
1875   /* Analyze the data references and also adjust the minimal
1876      vectorization factor according to the loads and stores.  */
1877
1878   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1879   if (!ok)
1880     {
1881       if (dump_enabled_p ())
1882         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1883                          "bad data references.\n");
1884       return false;
1885     }
1886
1887   /* Classify all cross-iteration scalar data-flow cycles.
1888      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1889   vect_analyze_scalar_cycles (loop_vinfo);
1890
1891   vect_pattern_recog (loop_vinfo);
1892
1893   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1894
1895   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1896      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1897
1898   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1899   if (!ok)
1900     {
1901       if (dump_enabled_p ())
1902         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1903                          "bad data access.\n");
1904       return false;
1905     }
1906
1907   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1908
1909   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1910   if (!ok)
1911     {
1912       if (dump_enabled_p ())
1913         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1914                          "unexpected pattern.\n");
1915       return false;
1916     }
1917
1918   /* While the rest of the analysis below depends on it in some way.  */
1919   fatal = false;
1920
1921   /* Analyze data dependences between the data-refs in the loop
1922      and adjust the maximum vectorization factor according to
1923      the dependences.
1924      FORNOW: fail at the first data dependence that we encounter.  */
1925
1926   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1927   if (!ok
1928       || (max_vf != MAX_VECTORIZATION_FACTOR
1929           && maybe_lt (max_vf, min_vf)))
1930     {
1931       if (dump_enabled_p ())
1932             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1933                              "bad data dependence.\n");
1934       return false;
1935     }
1936   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1937
1938   ok = vect_determine_vectorization_factor (loop_vinfo);
1939   if (!ok)
1940     {
1941       if (dump_enabled_p ())
1942         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1943                          "can't determine vectorization factor.\n");
1944       return false;
1945     }
1946   if (max_vf != MAX_VECTORIZATION_FACTOR
1947       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1948     {
1949       if (dump_enabled_p ())
1950         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1951                          "bad data dependence.\n");
1952       return false;
1953     }
1954
1955   /* Compute the scalar iteration cost.  */
1956   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1957
1958   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1959   HOST_WIDE_INT estimated_niter;
1960   unsigned th;
1961   int min_scalar_loop_bound;
1962
1963   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1964   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1965   if (!ok)
1966     return false;
1967
1968   /* If there are any SLP instances mark them as pure_slp.  */
1969   bool slp = vect_make_slp_decision (loop_vinfo);
1970   if (slp)
1971     {
1972       /* Find stmts that need to be both vectorized and SLPed.  */
1973       vect_detect_hybrid_slp (loop_vinfo);
1974
1975       /* Update the vectorization factor based on the SLP decision.  */
1976       vect_update_vf_for_slp (loop_vinfo);
1977     }
1978
1979   /* This is the point where we can re-start analysis with SLP forced off.  */
1980 start_over:
1981
1982   /* Now the vectorization factor is final.  */
1983   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1984   gcc_assert (known_ne (vectorization_factor, 0U));
1985   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1986
1987   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1988     {
1989       dump_printf_loc (MSG_NOTE, vect_location,
1990                        "vectorization_factor = ");
1991       dump_dec (MSG_NOTE, vectorization_factor);
1992       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
1993                    LOOP_VINFO_INT_NITERS (loop_vinfo));
1994     }
1995
1996   HOST_WIDE_INT max_niter
1997     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1998   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1999        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < assumed_vf))
2000       || (max_niter != -1
2001           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf))
2002     {
2003       if (dump_enabled_p ())
2004         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2005                          "not vectorized: iteration count smaller than "
2006                          "vectorization factor.\n");
2007       return false;
2008     }
2009
2010   /* Analyze the alignment of the data-refs in the loop.
2011      Fail if a data reference is found that cannot be vectorized.  */
2012
2013   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2014   if (!ok)
2015     {
2016       if (dump_enabled_p ())
2017         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018                          "bad data alignment.\n");
2019       return false;
2020     }
2021
2022   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2023      It is important to call pruning after vect_analyze_data_ref_accesses,
2024      since we use grouping information gathered by interleaving analysis.  */
2025   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2026   if (!ok)
2027     return false;
2028
2029   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2030      vectorization.  */
2031   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2032     {
2033     /* This pass will decide on using loop versioning and/or loop peeling in
2034        order to enhance the alignment of data references in the loop.  */
2035     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2036     if (!ok)
2037       {
2038         if (dump_enabled_p ())
2039           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2040                            "bad data alignment.\n");
2041         return false;
2042       }
2043     }
2044
2045   if (slp)
2046     {
2047       /* Analyze operations in the SLP instances.  Note this may
2048          remove unsupported SLP instances which makes the above
2049          SLP kind detection invalid.  */
2050       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2051       vect_slp_analyze_operations (loop_vinfo);
2052       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2053         goto again;
2054     }
2055
2056   /* Scan all the remaining operations in the loop that are not subject
2057      to SLP and make sure they are vectorizable.  */
2058   ok = vect_analyze_loop_operations (loop_vinfo);
2059   if (!ok)
2060     {
2061       if (dump_enabled_p ())
2062         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2063                          "bad operation or unsupported loop bound.\n");
2064       return false;
2065     }
2066
2067   /* If epilog loop is required because of data accesses with gaps,
2068      one additional iteration needs to be peeled.  Check if there is
2069      enough iterations for vectorization.  */
2070   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2071       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2072     {
2073       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2074       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2075
2076       if (known_lt (wi::to_widest (scalar_niters), vf))
2077         {
2078           if (dump_enabled_p ())
2079             dump_printf_loc (MSG_NOTE, vect_location,
2080                              "loop has no enough iterations to support"
2081                              " peeling for gaps.\n");
2082           return false;
2083         }
2084     }
2085
2086   /* Analyze cost.  Decide if worth while to vectorize.  */
2087   int min_profitable_estimate, min_profitable_iters;
2088   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2089                                       &min_profitable_estimate);
2090
2091   if (min_profitable_iters < 0)
2092     {
2093       if (dump_enabled_p ())
2094         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2095                          "not vectorized: vectorization not profitable.\n");
2096       if (dump_enabled_p ())
2097         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098                          "not vectorized: vector version will never be "
2099                          "profitable.\n");
2100       goto again;
2101     }
2102
2103   min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2104                            * assumed_vf);
2105
2106   /* Use the cost model only if it is more conservative than user specified
2107      threshold.  */
2108   th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2109
2110   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2111
2112   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2113       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2114     {
2115       if (dump_enabled_p ())
2116         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2117                          "not vectorized: vectorization not profitable.\n");
2118       if (dump_enabled_p ())
2119         dump_printf_loc (MSG_NOTE, vect_location,
2120                          "not vectorized: iteration count smaller than user "
2121                          "specified loop bound parameter or minimum profitable "
2122                          "iterations (whichever is more conservative).\n");
2123       goto again;
2124     }
2125
2126   estimated_niter
2127     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2128   if (estimated_niter == -1)
2129     estimated_niter = max_niter;
2130   if (estimated_niter != -1
2131       && ((unsigned HOST_WIDE_INT) estimated_niter
2132           < MAX (th, (unsigned) min_profitable_estimate)))
2133     {
2134       if (dump_enabled_p ())
2135         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2136                          "not vectorized: estimated iteration count too "
2137                          "small.\n");
2138       if (dump_enabled_p ())
2139         dump_printf_loc (MSG_NOTE, vect_location,
2140                          "not vectorized: estimated iteration count smaller "
2141                          "than specified loop bound parameter or minimum "
2142                          "profitable iterations (whichever is more "
2143                          "conservative).\n");
2144       goto again;
2145     }
2146
2147   /* Decide whether we need to create an epilogue loop to handle
2148      remaining scalar iterations.  */
2149   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2150
2151   unsigned HOST_WIDE_INT const_vf;
2152   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2153       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2154     {
2155       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2156                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2157                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2158         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2159     }
2160   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2161            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2162            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2163                 < (unsigned) exact_log2 (const_vf))
2164                /* In case of versioning, check if the maximum number of
2165                   iterations is greater than th.  If they are identical,
2166                   the epilogue is unnecessary.  */
2167                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2168                    || ((unsigned HOST_WIDE_INT) max_niter
2169                        > (th / const_vf) * const_vf))))
2170     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2171
2172   /* If an epilogue loop is required make sure we can create one.  */
2173   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2174       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2175     {
2176       if (dump_enabled_p ())
2177         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2178       if (!vect_can_advance_ivs_p (loop_vinfo)
2179           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2180                                            single_exit (LOOP_VINFO_LOOP
2181                                                          (loop_vinfo))))
2182         {
2183           if (dump_enabled_p ())
2184             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2185                              "not vectorized: can't create required "
2186                              "epilog loop\n");
2187           goto again;
2188         }
2189     }
2190
2191   /* During peeling, we need to check if number of loop iterations is
2192      enough for both peeled prolog loop and vector loop.  This check
2193      can be merged along with threshold check of loop versioning, so
2194      increase threshold for this case if necessary.  */
2195   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2196     {
2197       poly_uint64 niters_th;
2198
2199       /* Niters for peeled prolog loop.  */
2200       if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2201         {
2202           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2203           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2204
2205           niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2206         }
2207       else
2208         niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2209
2210       /* Niters for at least one iteration of vectorized loop.  */
2211       niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2212       /* One additional iteration because of peeling for gap.  */
2213       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2214         niters_th += 1;
2215       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2216     }
2217
2218   gcc_assert (known_eq (vectorization_factor,
2219                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2220
2221   /* Ok to vectorize!  */
2222   return true;
2223
2224 again:
2225   /* Try again with SLP forced off but if we didn't do any SLP there is
2226      no point in re-trying.  */
2227   if (!slp)
2228     return false;
2229
2230   /* If there are reduction chains re-trying will fail anyway.  */
2231   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2232     return false;
2233
2234   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2235      via interleaving or lane instructions.  */
2236   slp_instance instance;
2237   slp_tree node;
2238   unsigned i, j;
2239   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2240     {
2241       stmt_vec_info vinfo;
2242       vinfo = vinfo_for_stmt
2243           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2244       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2245         continue;
2246       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2247       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2248       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2249       if (! vect_store_lanes_supported (vectype, size)
2250           && ! vect_grouped_store_supported (vectype, size))
2251         return false;
2252       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2253         {
2254           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2255           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2256           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2257           size = STMT_VINFO_GROUP_SIZE (vinfo);
2258           vectype = STMT_VINFO_VECTYPE (vinfo);
2259           if (! vect_load_lanes_supported (vectype, size)
2260               && ! vect_grouped_load_supported (vectype, single_element_p,
2261                                                 size))
2262             return false;
2263         }
2264     }
2265
2266   if (dump_enabled_p ())
2267     dump_printf_loc (MSG_NOTE, vect_location,
2268                      "re-trying with SLP disabled\n");
2269
2270   /* Roll back state appropriately.  No SLP this time.  */
2271   slp = false;
2272   /* Restore vectorization factor as it were without SLP.  */
2273   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2274   /* Free the SLP instances.  */
2275   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2276     vect_free_slp_instance (instance);
2277   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2278   /* Reset SLP type to loop_vect on all stmts.  */
2279   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2280     {
2281       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2282       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2283            !gsi_end_p (si); gsi_next (&si))
2284         {
2285           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2286           STMT_SLP_TYPE (stmt_info) = loop_vect;
2287         }
2288       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2289            !gsi_end_p (si); gsi_next (&si))
2290         {
2291           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2292           STMT_SLP_TYPE (stmt_info) = loop_vect;
2293           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2294             {
2295               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2296               STMT_SLP_TYPE (stmt_info) = loop_vect;
2297               for (gimple_stmt_iterator pi
2298                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2299                    !gsi_end_p (pi); gsi_next (&pi))
2300                 {
2301                   gimple *pstmt = gsi_stmt (pi);
2302                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2303                 }
2304             }
2305         }
2306     }
2307   /* Free optimized alias test DDRS.  */
2308   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2309   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2310   /* Reset target cost data.  */
2311   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2312   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2313     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2314   /* Reset assorted flags.  */
2315   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2316   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2317   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2318   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2319
2320   goto start_over;
2321 }
2322
2323 /* Function vect_analyze_loop.
2324
2325    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2326    for it.  The different analyses will record information in the
2327    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2328    be vectorized.  */
2329 loop_vec_info
2330 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2331 {
2332   loop_vec_info loop_vinfo;
2333   auto_vector_sizes vector_sizes;
2334
2335   /* Autodetect first vector size we try.  */
2336   current_vector_size = 0;
2337   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2338   unsigned int next_size = 0;
2339
2340   if (dump_enabled_p ())
2341     dump_printf_loc (MSG_NOTE, vect_location,
2342                      "===== analyze_loop_nest =====\n");
2343
2344   if (loop_outer (loop)
2345       && loop_vec_info_for_loop (loop_outer (loop))
2346       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2347     {
2348       if (dump_enabled_p ())
2349         dump_printf_loc (MSG_NOTE, vect_location,
2350                          "outer-loop already vectorized.\n");
2351       return NULL;
2352     }
2353
2354   poly_uint64 autodetected_vector_size = 0;
2355   while (1)
2356     {
2357       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2358       loop_vinfo = vect_analyze_loop_form (loop);
2359       if (!loop_vinfo)
2360         {
2361           if (dump_enabled_p ())
2362             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2363                              "bad loop form.\n");
2364           return NULL;
2365         }
2366
2367       bool fatal = false;
2368
2369       if (orig_loop_vinfo)
2370         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2371
2372       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2373         {
2374           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2375
2376           return loop_vinfo;
2377         }
2378
2379       delete loop_vinfo;
2380
2381       if (next_size == 0)
2382         autodetected_vector_size = current_vector_size;
2383
2384       if (next_size < vector_sizes.length ()
2385           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2386         next_size += 1;
2387
2388       if (fatal
2389           || next_size == vector_sizes.length ()
2390           || known_eq (current_vector_size, 0U))
2391         return NULL;
2392
2393       /* Try the next biggest vector size.  */
2394       current_vector_size = vector_sizes[next_size++];
2395       if (dump_enabled_p ())
2396         {
2397           dump_printf_loc (MSG_NOTE, vect_location,
2398                            "***** Re-trying analysis with "
2399                            "vector size ");
2400           dump_dec (MSG_NOTE, current_vector_size);
2401           dump_printf (MSG_NOTE, "\n");
2402         }
2403     }
2404 }
2405
2406
2407 /* Function reduction_fn_for_scalar_code
2408
2409    Input:
2410    CODE - tree_code of a reduction operations.
2411
2412    Output:
2413    REDUC_FN - the corresponding internal function to be used to reduce the
2414       vector of partial results into a single scalar result, or IFN_LAST
2415       if the operation is a supported reduction operation, but does not have
2416       such an internal function.
2417
2418    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2419
2420 static bool
2421 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2422 {
2423   switch (code)
2424     {
2425       case MAX_EXPR:
2426         *reduc_fn = IFN_REDUC_MAX;
2427         return true;
2428
2429       case MIN_EXPR:
2430         *reduc_fn = IFN_REDUC_MIN;
2431         return true;
2432
2433       case PLUS_EXPR:
2434         *reduc_fn = IFN_REDUC_PLUS;
2435         return true;
2436
2437       case MULT_EXPR:
2438       case MINUS_EXPR:
2439       case BIT_IOR_EXPR:
2440       case BIT_XOR_EXPR:
2441       case BIT_AND_EXPR:
2442         *reduc_fn = IFN_LAST;
2443         return true;
2444
2445       default:
2446        return false;
2447     }
2448 }
2449
2450
2451 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2452    STMT is printed with a message MSG. */
2453
2454 static void
2455 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2456 {
2457   dump_printf_loc (msg_type, vect_location, "%s", msg);
2458   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2459 }
2460
2461
2462 /* Detect SLP reduction of the form:
2463
2464    #a1 = phi <a5, a0>
2465    a2 = operation (a1)
2466    a3 = operation (a2)
2467    a4 = operation (a3)
2468    a5 = operation (a4)
2469
2470    #a = phi <a5>
2471
2472    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2473    FIRST_STMT is the first reduction stmt in the chain
2474    (a2 = operation (a1)).
2475
2476    Return TRUE if a reduction chain was detected.  */
2477
2478 static bool
2479 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2480                        gimple *first_stmt)
2481 {
2482   struct loop *loop = (gimple_bb (phi))->loop_father;
2483   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2484   enum tree_code code;
2485   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2486   stmt_vec_info use_stmt_info, current_stmt_info;
2487   tree lhs;
2488   imm_use_iterator imm_iter;
2489   use_operand_p use_p;
2490   int nloop_uses, size = 0, n_out_of_loop_uses;
2491   bool found = false;
2492
2493   if (loop != vect_loop)
2494     return false;
2495
2496   lhs = PHI_RESULT (phi);
2497   code = gimple_assign_rhs_code (first_stmt);
2498   while (1)
2499     {
2500       nloop_uses = 0;
2501       n_out_of_loop_uses = 0;
2502       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2503         {
2504           gimple *use_stmt = USE_STMT (use_p);
2505           if (is_gimple_debug (use_stmt))
2506             continue;
2507
2508           /* Check if we got back to the reduction phi.  */
2509           if (use_stmt == phi)
2510             {
2511               loop_use_stmt = use_stmt;
2512               found = true;
2513               break;
2514             }
2515
2516           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2517             {
2518               loop_use_stmt = use_stmt;
2519               nloop_uses++;
2520             }
2521            else
2522              n_out_of_loop_uses++;
2523
2524            /* There are can be either a single use in the loop or two uses in
2525               phi nodes.  */
2526            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2527              return false;
2528         }
2529
2530       if (found)
2531         break;
2532
2533       /* We reached a statement with no loop uses.  */
2534       if (nloop_uses == 0)
2535         return false;
2536
2537       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2538       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2539         return false;
2540
2541       if (!is_gimple_assign (loop_use_stmt)
2542           || code != gimple_assign_rhs_code (loop_use_stmt)
2543           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2544         return false;
2545
2546       /* Insert USE_STMT into reduction chain.  */
2547       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2548       if (current_stmt)
2549         {
2550           current_stmt_info = vinfo_for_stmt (current_stmt);
2551           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2552           GROUP_FIRST_ELEMENT (use_stmt_info)
2553             = GROUP_FIRST_ELEMENT (current_stmt_info);
2554         }
2555       else
2556         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2557
2558       lhs = gimple_assign_lhs (loop_use_stmt);
2559       current_stmt = loop_use_stmt;
2560       size++;
2561    }
2562
2563   if (!found || loop_use_stmt != phi || size < 2)
2564     return false;
2565
2566   /* Swap the operands, if needed, to make the reduction operand be the second
2567      operand.  */
2568   lhs = PHI_RESULT (phi);
2569   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2570   while (next_stmt)
2571     {
2572       if (gimple_assign_rhs2 (next_stmt) == lhs)
2573         {
2574           tree op = gimple_assign_rhs1 (next_stmt);
2575           gimple *def_stmt = NULL;
2576
2577           if (TREE_CODE (op) == SSA_NAME)
2578             def_stmt = SSA_NAME_DEF_STMT (op);
2579
2580           /* Check that the other def is either defined in the loop
2581              ("vect_internal_def"), or it's an induction (defined by a
2582              loop-header phi-node).  */
2583           if (def_stmt
2584               && gimple_bb (def_stmt)
2585               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2586               && (is_gimple_assign (def_stmt)
2587                   || is_gimple_call (def_stmt)
2588                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2589                            == vect_induction_def
2590                   || (gimple_code (def_stmt) == GIMPLE_PHI
2591                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2592                                   == vect_internal_def
2593                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2594             {
2595               lhs = gimple_assign_lhs (next_stmt);
2596               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2597               continue;
2598             }
2599
2600           return false;
2601         }
2602       else
2603         {
2604           tree op = gimple_assign_rhs2 (next_stmt);
2605           gimple *def_stmt = NULL;
2606
2607           if (TREE_CODE (op) == SSA_NAME)
2608             def_stmt = SSA_NAME_DEF_STMT (op);
2609
2610           /* Check that the other def is either defined in the loop
2611             ("vect_internal_def"), or it's an induction (defined by a
2612             loop-header phi-node).  */
2613           if (def_stmt
2614               && gimple_bb (def_stmt)
2615               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2616               && (is_gimple_assign (def_stmt)
2617                   || is_gimple_call (def_stmt)
2618                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2619                               == vect_induction_def
2620                   || (gimple_code (def_stmt) == GIMPLE_PHI
2621                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2622                                   == vect_internal_def
2623                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2624             {
2625               if (dump_enabled_p ())
2626                 {
2627                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2628                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2629                 }
2630
2631               swap_ssa_operands (next_stmt,
2632                                  gimple_assign_rhs1_ptr (next_stmt),
2633                                  gimple_assign_rhs2_ptr (next_stmt));
2634               update_stmt (next_stmt);
2635
2636               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2637                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2638             }
2639           else
2640             return false;
2641         }
2642
2643       lhs = gimple_assign_lhs (next_stmt);
2644       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2645     }
2646
2647   /* Save the chain for further analysis in SLP detection.  */
2648   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2649   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2650   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2651
2652   return true;
2653 }
2654
2655
2656 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2657    reduction operation CODE has a handled computation expression.  */
2658
2659 bool
2660 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2661                       enum tree_code code)
2662 {
2663   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2664   auto_bitmap visited;
2665   tree lookfor = PHI_RESULT (phi);
2666   ssa_op_iter curri;
2667   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2668   while (USE_FROM_PTR (curr) != loop_arg)
2669     curr = op_iter_next_use (&curri);
2670   curri.i = curri.numops;
2671   do
2672     {
2673       path.safe_push (std::make_pair (curri, curr));
2674       tree use = USE_FROM_PTR (curr);
2675       if (use == lookfor)
2676         break;
2677       gimple *def = SSA_NAME_DEF_STMT (use);
2678       if (gimple_nop_p (def)
2679           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2680         {
2681 pop:
2682           do
2683             {
2684               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2685               curri = x.first;
2686               curr = x.second;
2687               do
2688                 curr = op_iter_next_use (&curri);
2689               /* Skip already visited or non-SSA operands (from iterating
2690                  over PHI args).  */
2691               while (curr != NULL_USE_OPERAND_P
2692                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2693                          || ! bitmap_set_bit (visited,
2694                                               SSA_NAME_VERSION
2695                                                 (USE_FROM_PTR (curr)))));
2696             }
2697           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2698           if (curr == NULL_USE_OPERAND_P)
2699             break;
2700         }
2701       else
2702         {
2703           if (gimple_code (def) == GIMPLE_PHI)
2704             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2705           else
2706             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2707           while (curr != NULL_USE_OPERAND_P
2708                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2709                      || ! bitmap_set_bit (visited,
2710                                           SSA_NAME_VERSION
2711                                             (USE_FROM_PTR (curr)))))
2712             curr = op_iter_next_use (&curri);
2713           if (curr == NULL_USE_OPERAND_P)
2714             goto pop;
2715         }
2716     }
2717   while (1);
2718   if (dump_file && (dump_flags & TDF_DETAILS))
2719     {
2720       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2721       unsigned i;
2722       std::pair<ssa_op_iter, use_operand_p> *x;
2723       FOR_EACH_VEC_ELT (path, i, x)
2724         {
2725           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2726           dump_printf (MSG_NOTE, " ");
2727         }
2728       dump_printf (MSG_NOTE, "\n");
2729     }
2730
2731   /* Check whether the reduction path detected is valid.  */
2732   bool fail = path.length () == 0;
2733   bool neg = false;
2734   for (unsigned i = 1; i < path.length (); ++i)
2735     {
2736       gimple *use_stmt = USE_STMT (path[i].second);
2737       tree op = USE_FROM_PTR (path[i].second);
2738       if (! has_single_use (op)
2739           || ! is_gimple_assign (use_stmt))
2740         {
2741           fail = true;
2742           break;
2743         }
2744       if (gimple_assign_rhs_code (use_stmt) != code)
2745         {
2746           if (code == PLUS_EXPR
2747               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2748             {
2749               /* Track whether we negate the reduction value each iteration.  */
2750               if (gimple_assign_rhs2 (use_stmt) == op)
2751                 neg = ! neg;
2752             }
2753           else
2754             {
2755               fail = true;
2756               break;
2757             }
2758         }
2759     }
2760   return ! fail && ! neg;
2761 }
2762
2763
2764 /* Function vect_is_simple_reduction
2765
2766    (1) Detect a cross-iteration def-use cycle that represents a simple
2767    reduction computation.  We look for the following pattern:
2768
2769    loop_header:
2770      a1 = phi < a0, a2 >
2771      a3 = ...
2772      a2 = operation (a3, a1)
2773
2774    or
2775
2776    a3 = ...
2777    loop_header:
2778      a1 = phi < a0, a2 >
2779      a2 = operation (a3, a1)
2780
2781    such that:
2782    1. operation is commutative and associative and it is safe to
2783       change the order of the computation
2784    2. no uses for a2 in the loop (a2 is used out of the loop)
2785    3. no uses of a1 in the loop besides the reduction operation
2786    4. no uses of a1 outside the loop.
2787
2788    Conditions 1,4 are tested here.
2789    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2790
2791    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2792    nested cycles.
2793
2794    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2795    reductions:
2796
2797      a1 = phi < a0, a2 >
2798      inner loop (def of a3)
2799      a2 = phi < a3 >
2800
2801    (4) Detect condition expressions, ie:
2802      for (int i = 0; i < N; i++)
2803        if (a[i] < val)
2804         ret_val = a[i];
2805
2806 */
2807
2808 static gimple *
2809 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2810                           bool *double_reduc,
2811                           bool need_wrapping_integral_overflow,
2812                           enum vect_reduction_type *v_reduc_type)
2813 {
2814   struct loop *loop = (gimple_bb (phi))->loop_father;
2815   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2816   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2817   enum tree_code orig_code, code;
2818   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2819   tree type;
2820   int nloop_uses;
2821   tree name;
2822   imm_use_iterator imm_iter;
2823   use_operand_p use_p;
2824   bool phi_def;
2825
2826   *double_reduc = false;
2827   *v_reduc_type = TREE_CODE_REDUCTION;
2828
2829   tree phi_name = PHI_RESULT (phi);
2830   /* ???  If there are no uses of the PHI result the inner loop reduction
2831      won't be detected as possibly double-reduction by vectorizable_reduction
2832      because that tries to walk the PHI arg from the preheader edge which
2833      can be constant.  See PR60382.  */
2834   if (has_zero_uses (phi_name))
2835     return NULL;
2836   nloop_uses = 0;
2837   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2838     {
2839       gimple *use_stmt = USE_STMT (use_p);
2840       if (is_gimple_debug (use_stmt))
2841         continue;
2842
2843       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2844         {
2845           if (dump_enabled_p ())
2846             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2847                              "intermediate value used outside loop.\n");
2848
2849           return NULL;
2850         }
2851
2852       nloop_uses++;
2853       if (nloop_uses > 1)
2854         {
2855           if (dump_enabled_p ())
2856             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2857                              "reduction value used in loop.\n");
2858           return NULL;
2859         }
2860
2861       phi_use_stmt = use_stmt;
2862     }
2863
2864   edge latch_e = loop_latch_edge (loop);
2865   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2866   if (TREE_CODE (loop_arg) != SSA_NAME)
2867     {
2868       if (dump_enabled_p ())
2869         {
2870           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2871                            "reduction: not ssa_name: ");
2872           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2873           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2874         }
2875       return NULL;
2876     }
2877
2878   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2879   if (is_gimple_assign (def_stmt))
2880     {
2881       name = gimple_assign_lhs (def_stmt);
2882       phi_def = false;
2883     }
2884   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2885     {
2886       name = PHI_RESULT (def_stmt);
2887       phi_def = true;
2888     }
2889   else
2890     {
2891       if (dump_enabled_p ())
2892         {
2893           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2894                            "reduction: unhandled reduction operation: ");
2895           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2896         }
2897       return NULL;
2898     }
2899
2900   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2901     return NULL;
2902
2903   nloop_uses = 0;
2904   auto_vec<gphi *, 3> lcphis;
2905   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2906     {
2907       gimple *use_stmt = USE_STMT (use_p);
2908       if (is_gimple_debug (use_stmt))
2909         continue;
2910       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2911         nloop_uses++;
2912       else
2913         /* We can have more than one loop-closed PHI.  */
2914         lcphis.safe_push (as_a <gphi *> (use_stmt));
2915       if (nloop_uses > 1)
2916         {
2917           if (dump_enabled_p ())
2918             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2919                              "reduction used in loop.\n");
2920           return NULL;
2921         }
2922     }
2923
2924   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2925      defined in the inner loop.  */
2926   if (phi_def)
2927     {
2928       op1 = PHI_ARG_DEF (def_stmt, 0);
2929
2930       if (gimple_phi_num_args (def_stmt) != 1
2931           || TREE_CODE (op1) != SSA_NAME)
2932         {
2933           if (dump_enabled_p ())
2934             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2935                              "unsupported phi node definition.\n");
2936
2937           return NULL;
2938         }
2939
2940       def1 = SSA_NAME_DEF_STMT (op1);
2941       if (gimple_bb (def1)
2942           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2943           && loop->inner
2944           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2945           && is_gimple_assign (def1)
2946           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2947         {
2948           if (dump_enabled_p ())
2949             report_vect_op (MSG_NOTE, def_stmt,
2950                             "detected double reduction: ");
2951
2952           *double_reduc = true;
2953           return def_stmt;
2954         }
2955
2956       return NULL;
2957     }
2958
2959   /* If we are vectorizing an inner reduction we are executing that
2960      in the original order only in case we are not dealing with a
2961      double reduction.  */
2962   bool check_reduction = true;
2963   if (flow_loop_nested_p (vect_loop, loop))
2964     {
2965       gphi *lcphi;
2966       unsigned i;
2967       check_reduction = false;
2968       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2969         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2970           {
2971             gimple *use_stmt = USE_STMT (use_p);
2972             if (is_gimple_debug (use_stmt))
2973               continue;
2974             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2975               check_reduction = true;
2976           }
2977     }
2978
2979   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2980   code = orig_code = gimple_assign_rhs_code (def_stmt);
2981
2982   /* We can handle "res -= x[i]", which is non-associative by
2983      simply rewriting this into "res += -x[i]".  Avoid changing
2984      gimple instruction for the first simple tests and only do this
2985      if we're allowed to change code at all.  */
2986   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2987     code = PLUS_EXPR;
2988
2989   if (code == COND_EXPR)
2990     {
2991       if (! nested_in_vect_loop)
2992         *v_reduc_type = COND_REDUCTION;
2993
2994       op3 = gimple_assign_rhs1 (def_stmt);
2995       if (COMPARISON_CLASS_P (op3))
2996         {
2997           op4 = TREE_OPERAND (op3, 1);
2998           op3 = TREE_OPERAND (op3, 0);
2999         }
3000       if (op3 == phi_name || op4 == phi_name)
3001         {
3002           if (dump_enabled_p ())
3003             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3004                             "reduction: condition depends on previous"
3005                             " iteration: ");
3006           return NULL;
3007         }
3008
3009       op1 = gimple_assign_rhs2 (def_stmt);
3010       op2 = gimple_assign_rhs3 (def_stmt);
3011     }
3012   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3013     {
3014       if (dump_enabled_p ())
3015         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3016                         "reduction: not commutative/associative: ");
3017       return NULL;
3018     }
3019   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3020     {
3021       op1 = gimple_assign_rhs1 (def_stmt);
3022       op2 = gimple_assign_rhs2 (def_stmt);
3023     }
3024   else
3025     {
3026       if (dump_enabled_p ())
3027         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3028                         "reduction: not handled operation: ");
3029       return NULL;
3030     }
3031
3032   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3033     {
3034       if (dump_enabled_p ())
3035         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3036                         "reduction: both uses not ssa_names: ");
3037
3038       return NULL;
3039     }
3040
3041   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3042   if ((TREE_CODE (op1) == SSA_NAME
3043        && !types_compatible_p (type,TREE_TYPE (op1)))
3044       || (TREE_CODE (op2) == SSA_NAME
3045           && !types_compatible_p (type, TREE_TYPE (op2)))
3046       || (op3 && TREE_CODE (op3) == SSA_NAME
3047           && !types_compatible_p (type, TREE_TYPE (op3)))
3048       || (op4 && TREE_CODE (op4) == SSA_NAME
3049           && !types_compatible_p (type, TREE_TYPE (op4))))
3050     {
3051       if (dump_enabled_p ())
3052         {
3053           dump_printf_loc (MSG_NOTE, vect_location,
3054                            "reduction: multiple types: operation type: ");
3055           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3056           dump_printf (MSG_NOTE, ", operands types: ");
3057           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3058                              TREE_TYPE (op1));
3059           dump_printf (MSG_NOTE, ",");
3060           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3061                              TREE_TYPE (op2));
3062           if (op3)
3063             {
3064               dump_printf (MSG_NOTE, ",");
3065               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3066                                  TREE_TYPE (op3));
3067             }
3068
3069           if (op4)
3070             {
3071               dump_printf (MSG_NOTE, ",");
3072               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3073                                  TREE_TYPE (op4));
3074             }
3075           dump_printf (MSG_NOTE, "\n");
3076         }
3077
3078       return NULL;
3079     }
3080
3081   /* Check that it's ok to change the order of the computation.
3082      Generally, when vectorizing a reduction we change the order of the
3083      computation.  This may change the behavior of the program in some
3084      cases, so we need to check that this is ok.  One exception is when
3085      vectorizing an outer-loop: the inner-loop is executed sequentially,
3086      and therefore vectorizing reductions in the inner-loop during
3087      outer-loop vectorization is safe.  */
3088
3089   if (*v_reduc_type != COND_REDUCTION
3090       && check_reduction)
3091     {
3092       /* CHECKME: check for !flag_finite_math_only too?  */
3093       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
3094         {
3095           /* Changing the order of operations changes the semantics.  */
3096           if (dump_enabled_p ())
3097             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3098                         "reduction: unsafe fp math optimization: ");
3099           return NULL;
3100         }
3101       else if (INTEGRAL_TYPE_P (type))
3102         {
3103           if (!operation_no_trapping_overflow (type, code))
3104             {
3105               /* Changing the order of operations changes the semantics.  */
3106               if (dump_enabled_p ())
3107                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3108                                 "reduction: unsafe int math optimization"
3109                                 " (overflow traps): ");
3110               return NULL;
3111             }
3112           if (need_wrapping_integral_overflow
3113               && !TYPE_OVERFLOW_WRAPS (type)
3114               && operation_can_overflow (code))
3115             {
3116               /* Changing the order of operations changes the semantics.  */
3117               if (dump_enabled_p ())
3118                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3119                                 "reduction: unsafe int math optimization"
3120                                 " (overflow doesn't wrap): ");
3121               return NULL;
3122             }
3123         }
3124       else if (SAT_FIXED_POINT_TYPE_P (type))
3125         {
3126           /* Changing the order of operations changes the semantics.  */
3127           if (dump_enabled_p ())
3128           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3129                           "reduction: unsafe fixed-point math optimization: ");
3130           return NULL;
3131         }
3132     }
3133
3134   /* Reduction is safe. We're dealing with one of the following:
3135      1) integer arithmetic and no trapv
3136      2) floating point arithmetic, and special flags permit this optimization
3137      3) nested cycle (i.e., outer loop vectorization).  */
3138   if (TREE_CODE (op1) == SSA_NAME)
3139     def1 = SSA_NAME_DEF_STMT (op1);
3140
3141   if (TREE_CODE (op2) == SSA_NAME)
3142     def2 = SSA_NAME_DEF_STMT (op2);
3143
3144   if (code != COND_EXPR
3145       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3146     {
3147       if (dump_enabled_p ())
3148         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3149       return NULL;
3150     }
3151
3152   /* Check that one def is the reduction def, defined by PHI,
3153      the other def is either defined in the loop ("vect_internal_def"),
3154      or it's an induction (defined by a loop-header phi-node).  */
3155
3156   if (def2 && def2 == phi
3157       && (code == COND_EXPR
3158           || !def1 || gimple_nop_p (def1)
3159           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3160           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3161               && (is_gimple_assign (def1)
3162                   || is_gimple_call (def1)
3163                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3164                       == vect_induction_def
3165                   || (gimple_code (def1) == GIMPLE_PHI
3166                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3167                           == vect_internal_def
3168                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3169     {
3170       if (dump_enabled_p ())
3171         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3172       return def_stmt;
3173     }
3174
3175   if (def1 && def1 == phi
3176       && (code == COND_EXPR
3177           || !def2 || gimple_nop_p (def2)
3178           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3179           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3180               && (is_gimple_assign (def2)
3181                   || is_gimple_call (def2)
3182                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3183                        == vect_induction_def
3184                   || (gimple_code (def2) == GIMPLE_PHI
3185                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3186                            == vect_internal_def
3187                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3188     {
3189       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3190         {
3191           /* Check if we can swap operands (just for simplicity - so that
3192              the rest of the code can assume that the reduction variable
3193              is always the last (second) argument).  */
3194           if (code == COND_EXPR)
3195             {
3196               /* Swap cond_expr by inverting the condition.  */
3197               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3198               enum tree_code invert_code = ERROR_MARK;
3199               enum tree_code cond_code = TREE_CODE (cond_expr);
3200
3201               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3202                 {
3203                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3204                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3205                 }
3206               if (invert_code != ERROR_MARK)
3207                 {
3208                   TREE_SET_CODE (cond_expr, invert_code);
3209                   swap_ssa_operands (def_stmt,
3210                                      gimple_assign_rhs2_ptr (def_stmt),
3211                                      gimple_assign_rhs3_ptr (def_stmt));
3212                 }
3213               else
3214                 {
3215                   if (dump_enabled_p ())
3216                     report_vect_op (MSG_NOTE, def_stmt,
3217                                     "detected reduction: cannot swap operands "
3218                                     "for cond_expr");
3219                   return NULL;
3220                 }
3221             }
3222           else
3223             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3224                                gimple_assign_rhs2_ptr (def_stmt));
3225
3226           if (dump_enabled_p ())
3227             report_vect_op (MSG_NOTE, def_stmt,
3228                             "detected reduction: need to swap operands: ");
3229
3230           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3231             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3232         }
3233       else
3234         {
3235           if (dump_enabled_p ())
3236             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3237         }
3238
3239       return def_stmt;
3240     }
3241
3242   /* Try to find SLP reduction chain.  */
3243   if (! nested_in_vect_loop
3244       && code != COND_EXPR
3245       && orig_code != MINUS_EXPR
3246       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3247     {
3248       if (dump_enabled_p ())
3249         report_vect_op (MSG_NOTE, def_stmt,
3250                         "reduction: detected reduction chain: ");
3251
3252       return def_stmt;
3253     }
3254
3255   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3256   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3257   while (first)
3258     {
3259       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3260       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3261       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3262       first = next;
3263     }
3264
3265   /* Look for the expression computing loop_arg from loop PHI result.  */
3266   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3267                             code))
3268     return def_stmt;
3269
3270   if (dump_enabled_p ())
3271     {
3272       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3273                       "reduction: unknown pattern: ");
3274     }
3275
3276   return NULL;
3277 }
3278
3279 /* Wrapper around vect_is_simple_reduction, which will modify code
3280    in-place if it enables detection of more reductions.  Arguments
3281    as there.  */
3282
3283 gimple *
3284 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3285                              bool *double_reduc,
3286                              bool need_wrapping_integral_overflow)
3287 {
3288   enum vect_reduction_type v_reduc_type;
3289   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3290                                           need_wrapping_integral_overflow,
3291                                           &v_reduc_type);
3292   if (def)
3293     {
3294       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3295       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3296       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3297       reduc_def_info = vinfo_for_stmt (def);
3298       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3299     }
3300   return def;
3301 }
3302
3303 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3304 int
3305 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3306                              int *peel_iters_epilogue,
3307                              stmt_vector_for_cost *scalar_cost_vec,
3308                              stmt_vector_for_cost *prologue_cost_vec,
3309                              stmt_vector_for_cost *epilogue_cost_vec)
3310 {
3311   int retval = 0;
3312   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3313
3314   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3315     {
3316       *peel_iters_epilogue = assumed_vf / 2;
3317       if (dump_enabled_p ())
3318         dump_printf_loc (MSG_NOTE, vect_location,
3319                          "cost model: epilogue peel iters set to vf/2 "
3320                          "because loop iterations are unknown .\n");
3321
3322       /* If peeled iterations are known but number of scalar loop
3323          iterations are unknown, count a taken branch per peeled loop.  */
3324       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3325                                  NULL, 0, vect_prologue);
3326       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3327                                  NULL, 0, vect_epilogue);
3328     }
3329   else
3330     {
3331       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3332       peel_iters_prologue = niters < peel_iters_prologue ?
3333                             niters : peel_iters_prologue;
3334       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3335       /* If we need to peel for gaps, but no peeling is required, we have to
3336          peel VF iterations.  */
3337       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3338         *peel_iters_epilogue = assumed_vf;
3339     }
3340
3341   stmt_info_for_cost *si;
3342   int j;
3343   if (peel_iters_prologue)
3344     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3345         {
3346           stmt_vec_info stmt_info
3347             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3348           retval += record_stmt_cost (prologue_cost_vec,
3349                                       si->count * peel_iters_prologue,
3350                                       si->kind, stmt_info, si->misalign,
3351                                       vect_prologue);
3352         }
3353   if (*peel_iters_epilogue)
3354     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3355         {
3356           stmt_vec_info stmt_info
3357             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3358           retval += record_stmt_cost (epilogue_cost_vec,
3359                                       si->count * *peel_iters_epilogue,
3360                                       si->kind, stmt_info, si->misalign,
3361                                       vect_epilogue);
3362         }
3363
3364   return retval;
3365 }
3366
3367 /* Function vect_estimate_min_profitable_iters
3368
3369    Return the number of iterations required for the vector version of the
3370    loop to be profitable relative to the cost of the scalar version of the
3371    loop.
3372
3373    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3374    of iterations for vectorization.  -1 value means loop vectorization
3375    is not profitable.  This returned value may be used for dynamic
3376    profitability check.
3377
3378    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3379    for static check against estimated number of iterations.  */
3380
3381 static void
3382 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3383                                     int *ret_min_profitable_niters,
3384                                     int *ret_min_profitable_estimate)
3385 {
3386   int min_profitable_iters;
3387   int min_profitable_estimate;
3388   int peel_iters_prologue;
3389   int peel_iters_epilogue;
3390   unsigned vec_inside_cost = 0;
3391   int vec_outside_cost = 0;
3392   unsigned vec_prologue_cost = 0;
3393   unsigned vec_epilogue_cost = 0;
3394   int scalar_single_iter_cost = 0;
3395   int scalar_outside_cost = 0;
3396   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3397   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3398   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3399
3400   /* Cost model disabled.  */
3401   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3402     {
3403       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3404       *ret_min_profitable_niters = 0;
3405       *ret_min_profitable_estimate = 0;
3406       return;
3407     }
3408
3409   /* Requires loop versioning tests to handle misalignment.  */
3410   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3411     {
3412       /*  FIXME: Make cost depend on complexity of individual check.  */
3413       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3414       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3415                             vect_prologue);
3416       dump_printf (MSG_NOTE,
3417                    "cost model: Adding cost of checks for loop "
3418                    "versioning to treat misalignment.\n");
3419     }
3420
3421   /* Requires loop versioning with alias checks.  */
3422   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3423     {
3424       /*  FIXME: Make cost depend on complexity of individual check.  */
3425       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3426       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3427                             vect_prologue);
3428       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3429       if (len)
3430         /* Count LEN - 1 ANDs and LEN comparisons.  */
3431         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3432                               NULL, 0, vect_prologue);
3433       dump_printf (MSG_NOTE,
3434                    "cost model: Adding cost of checks for loop "
3435                    "versioning aliasing.\n");
3436     }
3437
3438   /* Requires loop versioning with niter checks.  */
3439   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3440     {
3441       /*  FIXME: Make cost depend on complexity of individual check.  */
3442       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3443                             vect_prologue);
3444       dump_printf (MSG_NOTE,
3445                    "cost model: Adding cost of checks for loop "
3446                    "versioning niters.\n");
3447     }
3448
3449   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3450     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3451                           vect_prologue);
3452
3453   /* Count statements in scalar loop.  Using this as scalar cost for a single
3454      iteration for now.
3455
3456      TODO: Add outer loop support.
3457
3458      TODO: Consider assigning different costs to different scalar
3459      statements.  */
3460
3461   scalar_single_iter_cost
3462     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3463
3464   /* Add additional cost for the peeled instructions in prologue and epilogue
3465      loop.
3466
3467      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3468      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3469
3470      TODO: Build an expression that represents peel_iters for prologue and
3471      epilogue to be used in a run-time test.  */
3472
3473   if (npeel  < 0)
3474     {
3475       peel_iters_prologue = assumed_vf / 2;
3476       dump_printf (MSG_NOTE, "cost model: "
3477                    "prologue peel iters set to vf/2.\n");
3478
3479       /* If peeling for alignment is unknown, loop bound of main loop becomes
3480          unknown.  */
3481       peel_iters_epilogue = assumed_vf / 2;
3482       dump_printf (MSG_NOTE, "cost model: "
3483                    "epilogue peel iters set to vf/2 because "
3484                    "peeling for alignment is unknown.\n");
3485
3486       /* If peeled iterations are unknown, count a taken branch and a not taken
3487          branch per peeled loop. Even if scalar loop iterations are known,
3488          vector iterations are not known since peeled prologue iterations are
3489          not known. Hence guards remain the same.  */
3490       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3491                             NULL, 0, vect_prologue);
3492       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3493                             NULL, 0, vect_prologue);
3494       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3495                             NULL, 0, vect_epilogue);
3496       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3497                             NULL, 0, vect_epilogue);
3498       stmt_info_for_cost *si;
3499       int j;
3500       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3501         {
3502           struct _stmt_vec_info *stmt_info
3503             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3504           (void) add_stmt_cost (target_cost_data,
3505                                 si->count * peel_iters_prologue,
3506                                 si->kind, stmt_info, si->misalign,
3507                                 vect_prologue);
3508           (void) add_stmt_cost (target_cost_data,
3509                                 si->count * peel_iters_epilogue,
3510                                 si->kind, stmt_info, si->misalign,
3511                                 vect_epilogue);
3512         }
3513     }
3514   else
3515     {
3516       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3517       stmt_info_for_cost *si;
3518       int j;
3519       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3520
3521       prologue_cost_vec.create (2);
3522       epilogue_cost_vec.create (2);
3523       peel_iters_prologue = npeel;
3524
3525       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3526                                           &peel_iters_epilogue,
3527                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3528                                             (loop_vinfo),
3529                                           &prologue_cost_vec,
3530                                           &epilogue_cost_vec);
3531
3532       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3533         {
3534           struct _stmt_vec_info *stmt_info
3535             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3536           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3537                                 si->misalign, vect_prologue);
3538         }
3539
3540       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3541         {
3542           struct _stmt_vec_info *stmt_info
3543             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3544           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3545                                 si->misalign, vect_epilogue);
3546         }
3547
3548       prologue_cost_vec.release ();
3549       epilogue_cost_vec.release ();
3550     }
3551
3552   /* FORNOW: The scalar outside cost is incremented in one of the
3553      following ways:
3554
3555      1. The vectorizer checks for alignment and aliasing and generates
3556      a condition that allows dynamic vectorization.  A cost model
3557      check is ANDED with the versioning condition.  Hence scalar code
3558      path now has the added cost of the versioning check.
3559
3560        if (cost > th & versioning_check)
3561          jmp to vector code
3562
3563      Hence run-time scalar is incremented by not-taken branch cost.
3564
3565      2. The vectorizer then checks if a prologue is required.  If the
3566      cost model check was not done before during versioning, it has to
3567      be done before the prologue check.
3568
3569        if (cost <= th)
3570          prologue = scalar_iters
3571        if (prologue == 0)
3572          jmp to vector code
3573        else
3574          execute prologue
3575        if (prologue == num_iters)
3576          go to exit
3577
3578      Hence the run-time scalar cost is incremented by a taken branch,
3579      plus a not-taken branch, plus a taken branch cost.
3580
3581      3. The vectorizer then checks if an epilogue is required.  If the
3582      cost model check was not done before during prologue check, it
3583      has to be done with the epilogue check.
3584
3585        if (prologue == 0)
3586          jmp to vector code
3587        else
3588          execute prologue
3589        if (prologue == num_iters)
3590          go to exit
3591        vector code:
3592          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3593            jmp to epilogue
3594
3595      Hence the run-time scalar cost should be incremented by 2 taken
3596      branches.
3597
3598      TODO: The back end may reorder the BBS's differently and reverse
3599      conditions/branch directions.  Change the estimates below to
3600      something more reasonable.  */
3601
3602   /* If the number of iterations is known and we do not do versioning, we can
3603      decide whether to vectorize at compile time.  Hence the scalar version
3604      do not carry cost model guard costs.  */
3605   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3606       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3607     {
3608       /* Cost model check occurs at versioning.  */
3609       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3610         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3611       else
3612         {
3613           /* Cost model check occurs at prologue generation.  */
3614           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3615             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3616               + vect_get_stmt_cost (cond_branch_not_taken);
3617           /* Cost model check occurs at epilogue generation.  */
3618           else
3619             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3620         }
3621     }
3622
3623   /* Complete the target-specific cost calculations.  */
3624   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3625                &vec_inside_cost, &vec_epilogue_cost);
3626
3627   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3628
3629   if (dump_enabled_p ())
3630     {
3631       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3632       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3633                    vec_inside_cost);
3634       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3635                    vec_prologue_cost);
3636       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3637                    vec_epilogue_cost);
3638       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3639                    scalar_single_iter_cost);
3640       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3641                    scalar_outside_cost);
3642       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3643                    vec_outside_cost);
3644       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3645                    peel_iters_prologue);
3646       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3647                    peel_iters_epilogue);
3648     }
3649
3650   /* Calculate number of iterations required to make the vector version
3651      profitable, relative to the loop bodies only.  The following condition
3652      must hold true:
3653      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3654      where
3655      SIC = scalar iteration cost, VIC = vector iteration cost,
3656      VOC = vector outside cost, VF = vectorization factor,
3657      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3658      SOC = scalar outside cost for run time cost model check.  */
3659
3660   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3661     {
3662       if (vec_outside_cost <= 0)
3663         min_profitable_iters = 0;
3664       else
3665         {
3666           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3667                                   * assumed_vf
3668                                   - vec_inside_cost * peel_iters_prologue
3669                                   - vec_inside_cost * peel_iters_epilogue)
3670                                  / ((scalar_single_iter_cost * assumed_vf)
3671                                     - vec_inside_cost);
3672
3673           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3674               <= (((int) vec_inside_cost * min_profitable_iters)
3675                   + (((int) vec_outside_cost - scalar_outside_cost)
3676                      * assumed_vf)))
3677             min_profitable_iters++;
3678         }
3679     }
3680   /* vector version will never be profitable.  */
3681   else
3682     {
3683       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3684         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3685                     "did not happen for a simd loop");
3686
3687       if (dump_enabled_p ())
3688         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3689                          "cost model: the vector iteration cost = %d "
3690                          "divided by the scalar iteration cost = %d "
3691                          "is greater or equal to the vectorization factor = %d"
3692                          ".\n",
3693                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
3694       *ret_min_profitable_niters = -1;
3695       *ret_min_profitable_estimate = -1;
3696       return;
3697     }
3698
3699   dump_printf (MSG_NOTE,
3700                "  Calculated minimum iters for profitability: %d\n",
3701                min_profitable_iters);
3702
3703   /* We want the vectorized loop to execute at least once.  */
3704   if (min_profitable_iters < (assumed_vf + peel_iters_prologue))
3705     min_profitable_iters = assumed_vf + peel_iters_prologue;
3706
3707   if (dump_enabled_p ())
3708     dump_printf_loc (MSG_NOTE, vect_location,
3709                      "  Runtime profitability threshold = %d\n",
3710                      min_profitable_iters);
3711
3712   *ret_min_profitable_niters = min_profitable_iters;
3713
3714   /* Calculate number of iterations required to make the vector version
3715      profitable, relative to the loop bodies only.
3716
3717      Non-vectorized variant is SIC * niters and it must win over vector
3718      variant on the expected loop trip count.  The following condition must hold true:
3719      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3720
3721   if (vec_outside_cost <= 0)
3722     min_profitable_estimate = 0;
3723   else
3724     {
3725       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
3726                                  * assumed_vf
3727                                  - vec_inside_cost * peel_iters_prologue
3728                                  - vec_inside_cost * peel_iters_epilogue)
3729                                  / ((scalar_single_iter_cost * assumed_vf)
3730                                    - vec_inside_cost);
3731     }
3732   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3733   if (dump_enabled_p ())
3734     dump_printf_loc (MSG_NOTE, vect_location,
3735                      "  Static estimate profitability threshold = %d\n",
3736                      min_profitable_estimate);
3737
3738   *ret_min_profitable_estimate = min_profitable_estimate;
3739 }
3740
3741 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3742    vector elements (not bits) for a vector with NELT elements.  */
3743 static void
3744 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3745                               vec_perm_builder *sel)
3746 {
3747   /* The encoding is a single stepped pattern.  Any wrap-around is handled
3748      by vec_perm_indices.  */
3749   sel->new_vector (nelt, 1, 3);
3750   for (unsigned int i = 0; i < 3; i++)
3751     sel->quick_push (i + offset);
3752 }
3753
3754 /* Checks whether the target supports whole-vector shifts for vectors of mode
3755    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3756    it supports vec_perm_const with masks for all necessary shift amounts.  */
3757 static bool
3758 have_whole_vector_shift (machine_mode mode)
3759 {
3760   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3761     return true;
3762
3763   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3764   vec_perm_builder sel;
3765   vec_perm_indices indices;
3766   for (i = nelt/2; i >= 1; i/=2)
3767     {
3768       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3769       indices.new_vector (sel, 2, nelt);
3770       if (!can_vec_perm_const_p (mode, indices, false))
3771         return false;
3772     }
3773   return true;
3774 }
3775
3776 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3777    functions. Design better to avoid maintenance issues.  */
3778
3779 /* Function vect_model_reduction_cost.
3780
3781    Models cost for a reduction operation, including the vector ops
3782    generated within the strip-mine loop, the initial definition before
3783    the loop, and the epilogue code that must be generated.  */
3784
3785 static void
3786 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3787                            int ncopies)
3788 {
3789   int prologue_cost = 0, epilogue_cost = 0;
3790   enum tree_code code;
3791   optab optab;
3792   tree vectype;
3793   gimple *orig_stmt;
3794   machine_mode mode;
3795   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3796   struct loop *loop = NULL;
3797   void *target_cost_data;
3798
3799   if (loop_vinfo)
3800     {
3801       loop = LOOP_VINFO_LOOP (loop_vinfo);
3802       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3803     }
3804   else
3805     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3806
3807   /* Condition reductions generate two reductions in the loop.  */
3808   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3809     ncopies *= 2;
3810
3811   /* Cost of reduction op inside loop.  */
3812   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3813                                         stmt_info, 0, vect_body);
3814
3815   vectype = STMT_VINFO_VECTYPE (stmt_info);
3816   mode = TYPE_MODE (vectype);
3817   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3818
3819   if (!orig_stmt)
3820     orig_stmt = STMT_VINFO_STMT (stmt_info);
3821
3822   code = gimple_assign_rhs_code (orig_stmt);
3823
3824   /* Add in cost for initial definition.
3825      For cond reduction we have four vectors: initial index, step, initial
3826      result of the data reduction, initial value of the index reduction.  */
3827   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3828                        == COND_REDUCTION ? 4 : 1;
3829   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3830                                   scalar_to_vec, stmt_info, 0,
3831                                   vect_prologue);
3832
3833   /* Determine cost of epilogue code.
3834
3835      We have a reduction operator that will reduce the vector in one statement.
3836      Also requires scalar extract.  */
3837
3838   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3839     {
3840       if (reduc_fn != IFN_LAST)
3841         {
3842           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3843             {
3844               /* An EQ stmt and an COND_EXPR stmt.  */
3845               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3846                                               vector_stmt, stmt_info, 0,
3847                                               vect_epilogue);
3848               /* Reduction of the max index and a reduction of the found
3849                  values.  */
3850               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3851                                               vec_to_scalar, stmt_info, 0,
3852                                               vect_epilogue);
3853               /* A broadcast of the max value.  */
3854               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3855                                               scalar_to_vec, stmt_info, 0,
3856                                               vect_epilogue);
3857             }
3858           else
3859             {
3860               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3861                                               stmt_info, 0, vect_epilogue);
3862               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3863                                               vec_to_scalar, stmt_info, 0,
3864                                               vect_epilogue);
3865             }
3866         }
3867       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3868         {
3869           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
3870           /* Extraction of scalar elements.  */
3871           epilogue_cost += add_stmt_cost (target_cost_data,
3872                                           2 * estimated_nunits,
3873                                           vec_to_scalar, stmt_info, 0,
3874                                           vect_epilogue);
3875           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3876           epilogue_cost += add_stmt_cost (target_cost_data,
3877                                           2 * estimated_nunits - 3,
3878                                           scalar_stmt, stmt_info, 0,
3879                                           vect_epilogue);
3880         }
3881       else
3882         {
3883           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3884           tree bitsize =
3885             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3886           int element_bitsize = tree_to_uhwi (bitsize);
3887           int nelements = vec_size_in_bits / element_bitsize;
3888
3889           if (code == COND_EXPR)
3890             code = MAX_EXPR;
3891
3892           optab = optab_for_tree_code (code, vectype, optab_default);
3893
3894           /* We have a whole vector shift available.  */
3895           if (optab != unknown_optab
3896               && VECTOR_MODE_P (mode)
3897               && optab_handler (optab, mode) != CODE_FOR_nothing
3898               && have_whole_vector_shift (mode))
3899             {
3900               /* Final reduction via vector shifts and the reduction operator.
3901                  Also requires scalar extract.  */
3902               epilogue_cost += add_stmt_cost (target_cost_data,
3903                                               exact_log2 (nelements) * 2,
3904                                               vector_stmt, stmt_info, 0,
3905                                               vect_epilogue);
3906               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3907                                               vec_to_scalar, stmt_info, 0,
3908                                               vect_epilogue);
3909             }
3910           else
3911             /* Use extracts and reduction op for final reduction.  For N
3912                elements, we have N extracts and N-1 reduction ops.  */
3913             epilogue_cost += add_stmt_cost (target_cost_data,
3914                                             nelements + nelements - 1,
3915                                             vector_stmt, stmt_info, 0,
3916                                             vect_epilogue);
3917         }
3918     }
3919
3920   if (dump_enabled_p ())
3921     dump_printf (MSG_NOTE,
3922                  "vect_model_reduction_cost: inside_cost = %d, "
3923                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3924                  prologue_cost, epilogue_cost);
3925 }
3926
3927
3928 /* Function vect_model_induction_cost.
3929
3930    Models cost for induction operations.  */
3931
3932 static void
3933 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3934 {
3935   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3936   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3937   unsigned inside_cost, prologue_cost;
3938
3939   if (PURE_SLP_STMT (stmt_info))
3940     return;
3941
3942   /* loop cost for vec_loop.  */
3943   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3944                                stmt_info, 0, vect_body);
3945
3946   /* prologue cost for vec_init and vec_step.  */
3947   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3948                                  stmt_info, 0, vect_prologue);
3949
3950   if (dump_enabled_p ())
3951     dump_printf_loc (MSG_NOTE, vect_location,
3952                      "vect_model_induction_cost: inside_cost = %d, "
3953                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3954 }
3955
3956
3957
3958 /* Function get_initial_def_for_reduction
3959
3960    Input:
3961    STMT - a stmt that performs a reduction operation in the loop.
3962    INIT_VAL - the initial value of the reduction variable
3963
3964    Output:
3965    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3966         of the reduction (used for adjusting the epilog - see below).
3967    Return a vector variable, initialized according to the operation that STMT
3968         performs. This vector will be used as the initial value of the
3969         vector of partial results.
3970
3971    Option1 (adjust in epilog): Initialize the vector as follows:
3972      add/bit or/xor:    [0,0,...,0,0]
3973      mult/bit and:      [1,1,...,1,1]
3974      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3975    and when necessary (e.g. add/mult case) let the caller know
3976    that it needs to adjust the result by init_val.
3977
3978    Option2: Initialize the vector as follows:
3979      add/bit or/xor:    [init_val,0,0,...,0]
3980      mult/bit and:      [init_val,1,1,...,1]
3981      min/max/cond_expr: [init_val,init_val,...,init_val]
3982    and no adjustments are needed.
3983
3984    For example, for the following code:
3985
3986    s = init_val;
3987    for (i=0;i<n;i++)
3988      s = s + a[i];
3989
3990    STMT is 's = s + a[i]', and the reduction variable is 's'.
3991    For a vector of 4 units, we want to return either [0,0,0,init_val],
3992    or [0,0,0,0] and let the caller know that it needs to adjust
3993    the result at the end by 'init_val'.
3994
3995    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3996    initialization vector is simpler (same element in all entries), if
3997    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3998
3999    A cost model should help decide between these two schemes.  */
4000
4001 tree
4002 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4003                                tree *adjustment_def)
4004 {
4005   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4006   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4007   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4008   tree scalar_type = TREE_TYPE (init_val);
4009   tree vectype = get_vectype_for_scalar_type (scalar_type);
4010   enum tree_code code = gimple_assign_rhs_code (stmt);
4011   tree def_for_init;
4012   tree init_def;
4013   bool nested_in_vect_loop = false;
4014   REAL_VALUE_TYPE real_init_val = dconst0;
4015   int int_init_val = 0;
4016   gimple *def_stmt = NULL;
4017   gimple_seq stmts = NULL;
4018
4019   gcc_assert (vectype);
4020
4021   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4022               || SCALAR_FLOAT_TYPE_P (scalar_type));
4023
4024   if (nested_in_vect_loop_p (loop, stmt))
4025     nested_in_vect_loop = true;
4026   else
4027     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4028
4029   /* In case of double reduction we only create a vector variable to be put
4030      in the reduction phi node.  The actual statement creation is done in
4031      vect_create_epilog_for_reduction.  */
4032   if (adjustment_def && nested_in_vect_loop
4033       && TREE_CODE (init_val) == SSA_NAME
4034       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4035       && gimple_code (def_stmt) == GIMPLE_PHI
4036       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4037       && vinfo_for_stmt (def_stmt)
4038       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4039           == vect_double_reduction_def)
4040     {
4041       *adjustment_def = NULL;
4042       return vect_create_destination_var (init_val, vectype);
4043     }
4044
4045   /* In case of a nested reduction do not use an adjustment def as
4046      that case is not supported by the epilogue generation correctly
4047      if ncopies is not one.  */
4048   if (adjustment_def && nested_in_vect_loop)
4049     {
4050       *adjustment_def = NULL;
4051       return vect_get_vec_def_for_operand (init_val, stmt);
4052     }
4053
4054   switch (code)
4055     {
4056     case WIDEN_SUM_EXPR:
4057     case DOT_PROD_EXPR:
4058     case SAD_EXPR:
4059     case PLUS_EXPR:
4060     case MINUS_EXPR:
4061     case BIT_IOR_EXPR:
4062     case BIT_XOR_EXPR:
4063     case MULT_EXPR:
4064     case BIT_AND_EXPR:
4065       {
4066         /* ADJUSTMENT_DEF is NULL when called from
4067            vect_create_epilog_for_reduction to vectorize double reduction.  */
4068         if (adjustment_def)
4069           *adjustment_def = init_val;
4070
4071         if (code == MULT_EXPR)
4072           {
4073             real_init_val = dconst1;
4074             int_init_val = 1;
4075           }
4076
4077         if (code == BIT_AND_EXPR)
4078           int_init_val = -1;
4079
4080         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4081           def_for_init = build_real (scalar_type, real_init_val);
4082         else
4083           def_for_init = build_int_cst (scalar_type, int_init_val);
4084
4085         if (adjustment_def)
4086           /* Option1: the first element is '0' or '1' as well.  */
4087           init_def = gimple_build_vector_from_val (&stmts, vectype,
4088                                                    def_for_init);
4089         else
4090           {
4091             /* Option2: the first element is INIT_VAL.  */
4092             tree_vector_builder elts (vectype, 1, 2);
4093             elts.quick_push (init_val);
4094             elts.quick_push (def_for_init);
4095             init_def = gimple_build_vector (&stmts, &elts);
4096           }
4097       }
4098       break;
4099
4100     case MIN_EXPR:
4101     case MAX_EXPR:
4102     case COND_EXPR:
4103       {
4104         if (adjustment_def)
4105           {
4106             *adjustment_def = NULL_TREE;
4107             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4108               {
4109                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4110                 break;
4111               }
4112           }
4113         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4114         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4115       }
4116       break;
4117
4118     default:
4119       gcc_unreachable ();
4120     }
4121
4122   if (stmts)
4123     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4124   return init_def;
4125 }
4126
4127 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4128    NUMBER_OF_VECTORS is the number of vector defs to create.  */
4129
4130 static void
4131 get_initial_defs_for_reduction (slp_tree slp_node,
4132                                 vec<tree> *vec_oprnds,
4133                                 unsigned int number_of_vectors,
4134                                 enum tree_code code, bool reduc_chain)
4135 {
4136   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4137   gimple *stmt = stmts[0];
4138   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4139   unsigned nunits;
4140   unsigned j, number_of_places_left_in_vector;
4141   tree vector_type, scalar_type;
4142   tree vop;
4143   int group_size = stmts.length ();
4144   unsigned int vec_num, i;
4145   unsigned number_of_copies = 1;
4146   vec<tree> voprnds;
4147   voprnds.create (number_of_vectors);
4148   tree neutral_op = NULL;
4149   struct loop *loop;
4150
4151   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4152   scalar_type = TREE_TYPE (vector_type);
4153   /* vectorizable_reduction has already rejected SLP reductions on
4154      variable-length vectors.  */
4155   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4156
4157   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4158
4159   loop = (gimple_bb (stmt))->loop_father;
4160   gcc_assert (loop);
4161   edge pe = loop_preheader_edge (loop);
4162
4163   /* op is the reduction operand of the first stmt already.  */
4164   /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4165      we need either neutral operands or the original operands.  See
4166      get_initial_def_for_reduction() for details.  */
4167   switch (code)
4168     {
4169     case WIDEN_SUM_EXPR:
4170     case DOT_PROD_EXPR:
4171     case SAD_EXPR:
4172     case PLUS_EXPR:
4173     case MINUS_EXPR:
4174     case BIT_IOR_EXPR:
4175     case BIT_XOR_EXPR:
4176       neutral_op = build_zero_cst (scalar_type);
4177       break;
4178
4179     case MULT_EXPR:
4180       neutral_op = build_one_cst (scalar_type);
4181       break;
4182
4183     case BIT_AND_EXPR:
4184       neutral_op = build_all_ones_cst (scalar_type);
4185       break;
4186
4187     /* For MIN/MAX we don't have an easy neutral operand but
4188        the initial values can be used fine here.  Only for
4189        a reduction chain we have to force a neutral element.  */
4190     case MAX_EXPR:
4191     case MIN_EXPR:
4192       if (! reduc_chain)
4193         neutral_op = NULL;
4194       else
4195         neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4196       break;
4197
4198     default:
4199       gcc_assert (! reduc_chain);
4200       neutral_op = NULL;
4201     }
4202
4203   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4204      created vectors. It is greater than 1 if unrolling is performed.
4205
4206      For example, we have two scalar operands, s1 and s2 (e.g., group of
4207      strided accesses of size two), while NUNITS is four (i.e., four scalars
4208      of this type can be packed in a vector).  The output vector will contain
4209      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4210      will be 2).
4211
4212      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4213      containing the operands.
4214
4215      For example, NUNITS is four as before, and the group size is 8
4216      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4217      {s5, s6, s7, s8}.  */
4218
4219   number_of_copies = nunits * number_of_vectors / group_size;
4220
4221   number_of_places_left_in_vector = nunits;
4222   tree_vector_builder elts (vector_type, nunits, 1);
4223   elts.quick_grow (nunits);
4224   for (j = 0; j < number_of_copies; j++)
4225     {
4226       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4227         {
4228           tree op;
4229           /* Get the def before the loop.  In reduction chain we have only
4230              one initial value.  */
4231           if ((j != (number_of_copies - 1)
4232                || (reduc_chain && i != 0))
4233               && neutral_op)
4234             op = neutral_op;
4235           else
4236             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4237
4238           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4239           number_of_places_left_in_vector--;
4240           elts[number_of_places_left_in_vector] = op;
4241
4242           if (number_of_places_left_in_vector == 0)
4243             {
4244               gimple_seq ctor_seq = NULL;
4245               tree init = gimple_build_vector (&ctor_seq, &elts);
4246               if (ctor_seq != NULL)
4247                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4248               voprnds.quick_push (init);
4249
4250               number_of_places_left_in_vector = nunits;
4251               elts.new_vector (vector_type, nunits, 1);
4252               elts.quick_grow (nunits);
4253             }
4254         }
4255     }
4256
4257   /* Since the vectors are created in the reverse order, we should invert
4258      them.  */
4259   vec_num = voprnds.length ();
4260   for (j = vec_num; j != 0; j--)
4261     {
4262       vop = voprnds[j - 1];
4263       vec_oprnds->quick_push (vop);
4264     }
4265
4266   voprnds.release ();
4267
4268   /* In case that VF is greater than the unrolling factor needed for the SLP
4269      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4270      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4271      to replicate the vectors.  */
4272   tree neutral_vec = NULL;
4273   while (number_of_vectors > vec_oprnds->length ())
4274     {
4275       if (neutral_op)
4276         {
4277           if (!neutral_vec)
4278             {
4279               gimple_seq ctor_seq = NULL;
4280               neutral_vec = gimple_build_vector_from_val
4281                 (&ctor_seq, vector_type, neutral_op);
4282               if (ctor_seq != NULL)
4283                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4284             }
4285           vec_oprnds->quick_push (neutral_vec);
4286         }
4287       else
4288         {
4289           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4290             vec_oprnds->quick_push (vop);
4291         }
4292     }
4293 }
4294
4295
4296 /* Function vect_create_epilog_for_reduction
4297
4298    Create code at the loop-epilog to finalize the result of a reduction
4299    computation.
4300
4301    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4302      reduction statements.
4303    STMT is the scalar reduction stmt that is being vectorized.
4304    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4305      number of elements that we can fit in a vectype (nunits).  In this case
4306      we have to generate more than one vector stmt - i.e - we need to "unroll"
4307      the vector stmt by a factor VF/nunits.  For more details see documentation
4308      in vectorizable_operation.
4309    REDUC_FN is the internal function for the epilog reduction.
4310    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4311      computation.
4312    REDUC_INDEX is the index of the operand in the right hand side of the
4313      statement that is defined by REDUCTION_PHI.
4314    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4315    SLP_NODE is an SLP node containing a group of reduction statements. The
4316      first one in this group is STMT.
4317    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4318      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4319      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4320      any value of the IV in the loop.
4321    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4322
4323    This function:
4324    1. Creates the reduction def-use cycles: sets the arguments for
4325       REDUCTION_PHIS:
4326       The loop-entry argument is the vectorized initial-value of the reduction.
4327       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4328       sums.
4329    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4330       by calling the function specified by REDUC_FN if available, or by
4331       other means (whole-vector shifts or a scalar loop).
4332       The function also creates a new phi node at the loop exit to preserve
4333       loop-closed form, as illustrated below.
4334
4335      The flow at the entry to this function:
4336
4337         loop:
4338           vec_def = phi <null, null>            # REDUCTION_PHI
4339           VECT_DEF = vector_stmt                # vectorized form of STMT
4340           s_loop = scalar_stmt                  # (scalar) STMT
4341         loop_exit:
4342           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4343           use <s_out0>
4344           use <s_out0>
4345
4346      The above is transformed by this function into:
4347
4348         loop:
4349           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4350           VECT_DEF = vector_stmt                # vectorized form of STMT
4351           s_loop = scalar_stmt                  # (scalar) STMT
4352         loop_exit:
4353           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4354           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4355           v_out2 = reduce <v_out1>
4356           s_out3 = extract_field <v_out2, 0>
4357           s_out4 = adjust_result <s_out3>
4358           use <s_out4>
4359           use <s_out4>
4360 */
4361
4362 static void
4363 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4364                                   gimple *reduc_def_stmt,
4365                                   int ncopies, internal_fn reduc_fn,
4366                                   vec<gimple *> reduction_phis,
4367                                   bool double_reduc,
4368                                   slp_tree slp_node,
4369                                   slp_instance slp_node_instance,
4370                                   tree induc_val, enum tree_code induc_code)
4371 {
4372   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4373   stmt_vec_info prev_phi_info;
4374   tree vectype;
4375   machine_mode mode;
4376   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4377   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4378   basic_block exit_bb;
4379   tree scalar_dest;
4380   tree scalar_type;
4381   gimple *new_phi = NULL, *phi;
4382   gimple_stmt_iterator exit_gsi;
4383   tree vec_dest;
4384   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4385   gimple *epilog_stmt = NULL;
4386   enum tree_code code = gimple_assign_rhs_code (stmt);
4387   gimple *exit_phi;
4388   tree bitsize;
4389   tree adjustment_def = NULL;
4390   tree vec_initial_def = NULL;
4391   tree expr, def, initial_def = NULL;
4392   tree orig_name, scalar_result;
4393   imm_use_iterator imm_iter, phi_imm_iter;
4394   use_operand_p use_p, phi_use_p;
4395   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4396   bool nested_in_vect_loop = false;
4397   auto_vec<gimple *> new_phis;
4398   auto_vec<gimple *> inner_phis;
4399   enum vect_def_type dt = vect_unknown_def_type;
4400   int j, i;
4401   auto_vec<tree> scalar_results;
4402   unsigned int group_size = 1, k, ratio;
4403   auto_vec<tree> vec_initial_defs;
4404   auto_vec<gimple *> phis;
4405   bool slp_reduc = false;
4406   tree new_phi_result;
4407   gimple *inner_phi = NULL;
4408   tree induction_index = NULL_TREE;
4409
4410   if (slp_node)
4411     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4412
4413   if (nested_in_vect_loop_p (loop, stmt))
4414     {
4415       outer_loop = loop;
4416       loop = loop->inner;
4417       nested_in_vect_loop = true;
4418       gcc_assert (!slp_node);
4419     }
4420
4421   vectype = STMT_VINFO_VECTYPE (stmt_info);
4422   gcc_assert (vectype);
4423   mode = TYPE_MODE (vectype);
4424
4425   /* 1. Create the reduction def-use cycle:
4426      Set the arguments of REDUCTION_PHIS, i.e., transform
4427
4428         loop:
4429           vec_def = phi <null, null>            # REDUCTION_PHI
4430           VECT_DEF = vector_stmt                # vectorized form of STMT
4431           ...
4432
4433      into:
4434
4435         loop:
4436           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4437           VECT_DEF = vector_stmt                # vectorized form of STMT
4438           ...
4439
4440      (in case of SLP, do it for all the phis). */
4441
4442   /* Get the loop-entry arguments.  */
4443   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4444   if (slp_node)
4445     {
4446       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4447       vec_initial_defs.reserve (vec_num);
4448       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4449                                       &vec_initial_defs, vec_num, code,
4450                                       GROUP_FIRST_ELEMENT (stmt_info));
4451     }
4452   else
4453     {
4454       /* Get at the scalar def before the loop, that defines the initial value
4455          of the reduction variable.  */
4456       gimple *def_stmt;
4457       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4458                                            loop_preheader_edge (loop));
4459       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4460          and we can't use zero for induc_val, use initial_def.  Similarly
4461          for REDUC_MIN and initial_def larger than the base.  */
4462       if (TREE_CODE (initial_def) == INTEGER_CST
4463           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4464               == INTEGER_INDUC_COND_REDUCTION)
4465           && !integer_zerop (induc_val)
4466           && ((induc_code == MAX_EXPR
4467                && tree_int_cst_lt (initial_def, induc_val))
4468               || (induc_code == MIN_EXPR
4469                   && tree_int_cst_lt (induc_val, initial_def))))
4470         induc_val = initial_def;
4471       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4472       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4473                                                        &adjustment_def);
4474       vec_initial_defs.create (1);
4475       vec_initial_defs.quick_push (vec_initial_def);
4476     }
4477
4478   /* Set phi nodes arguments.  */
4479   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4480     {
4481       tree vec_init_def = vec_initial_defs[i];
4482       tree def = vect_defs[i];
4483       for (j = 0; j < ncopies; j++)
4484         {
4485           if (j != 0)
4486             {
4487               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4488               if (nested_in_vect_loop)
4489                 vec_init_def
4490                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4491                                                     vec_init_def);
4492             }
4493
4494           /* Set the loop-entry arg of the reduction-phi.  */
4495
4496           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4497               == INTEGER_INDUC_COND_REDUCTION)
4498             {
4499               /* Initialise the reduction phi to zero.  This prevents initial
4500                  values of non-zero interferring with the reduction op.  */
4501               gcc_assert (ncopies == 1);
4502               gcc_assert (i == 0);
4503
4504               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4505               tree induc_val_vec
4506                 = build_vector_from_val (vec_init_def_type, induc_val);
4507
4508               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4509                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4510             }
4511           else
4512             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4513                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4514
4515           /* Set the loop-latch arg for the reduction-phi.  */
4516           if (j > 0)
4517             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4518
4519           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4520                        UNKNOWN_LOCATION);
4521
4522           if (dump_enabled_p ())
4523             {
4524               dump_printf_loc (MSG_NOTE, vect_location,
4525                                "transform reduction: created def-use cycle: ");
4526               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4527               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4528             }
4529         }
4530     }
4531
4532   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4533      which is updated with the current index of the loop for every match of
4534      the original loop's cond_expr (VEC_STMT).  This results in a vector
4535      containing the last time the condition passed for that vector lane.
4536      The first match will be a 1 to allow 0 to be used for non-matching
4537      indexes.  If there are no matches at all then the vector will be all
4538      zeroes.  */
4539   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4540     {
4541       tree indx_before_incr, indx_after_incr;
4542       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4543
4544       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4545       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4546
4547       int scalar_precision
4548         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4549       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4550       tree cr_index_vector_type = build_vector_type
4551         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4552
4553       /* First we create a simple vector induction variable which starts
4554          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4555          vector size (STEP).  */
4556
4557       /* Create a {1,2,3,...} vector.  */
4558       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4559
4560       /* Create a vector of the step value.  */
4561       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4562       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4563
4564       /* Create an induction variable.  */
4565       gimple_stmt_iterator incr_gsi;
4566       bool insert_after;
4567       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4568       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4569                  insert_after, &indx_before_incr, &indx_after_incr);
4570
4571       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4572          filled with zeros (VEC_ZERO).  */
4573
4574       /* Create a vector of 0s.  */
4575       tree zero = build_zero_cst (cr_index_scalar_type);
4576       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4577
4578       /* Create a vector phi node.  */
4579       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4580       new_phi = create_phi_node (new_phi_tree, loop->header);
4581       set_vinfo_for_stmt (new_phi,
4582                           new_stmt_vec_info (new_phi, loop_vinfo));
4583       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4584                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4585
4586       /* Now take the condition from the loops original cond_expr
4587          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4588          every match uses values from the induction variable
4589          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4590          (NEW_PHI_TREE).
4591          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4592          the new cond_expr (INDEX_COND_EXPR).  */
4593
4594       /* Duplicate the condition from vec_stmt.  */
4595       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4596
4597       /* Create a conditional, where the condition is taken from vec_stmt
4598          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4599          else is the phi (NEW_PHI_TREE).  */
4600       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4601                                      ccompare, indx_before_incr,
4602                                      new_phi_tree);
4603       induction_index = make_ssa_name (cr_index_vector_type);
4604       gimple *index_condition = gimple_build_assign (induction_index,
4605                                                      index_cond_expr);
4606       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4607       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4608                                                         loop_vinfo);
4609       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4610       set_vinfo_for_stmt (index_condition, index_vec_info);
4611
4612       /* Update the phi with the vec cond.  */
4613       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4614                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4615     }
4616
4617   /* 2. Create epilog code.
4618         The reduction epilog code operates across the elements of the vector
4619         of partial results computed by the vectorized loop.
4620         The reduction epilog code consists of:
4621
4622         step 1: compute the scalar result in a vector (v_out2)
4623         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4624         step 3: adjust the scalar result (s_out3) if needed.
4625
4626         Step 1 can be accomplished using one the following three schemes:
4627           (scheme 1) using reduc_fn, if available.
4628           (scheme 2) using whole-vector shifts, if available.
4629           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4630                      combined.
4631
4632           The overall epilog code looks like this:
4633
4634           s_out0 = phi <s_loop>         # original EXIT_PHI
4635           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4636           v_out2 = reduce <v_out1>              # step 1
4637           s_out3 = extract_field <v_out2, 0>    # step 2
4638           s_out4 = adjust_result <s_out3>       # step 3
4639
4640           (step 3 is optional, and steps 1 and 2 may be combined).
4641           Lastly, the uses of s_out0 are replaced by s_out4.  */
4642
4643
4644   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4645          v_out1 = phi <VECT_DEF>
4646          Store them in NEW_PHIS.  */
4647
4648   exit_bb = single_exit (loop)->dest;
4649   prev_phi_info = NULL;
4650   new_phis.create (vect_defs.length ());
4651   FOR_EACH_VEC_ELT (vect_defs, i, def)
4652     {
4653       for (j = 0; j < ncopies; j++)
4654         {
4655           tree new_def = copy_ssa_name (def);
4656           phi = create_phi_node (new_def, exit_bb);
4657           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4658           if (j == 0)
4659             new_phis.quick_push (phi);
4660           else
4661             {
4662               def = vect_get_vec_def_for_stmt_copy (dt, def);
4663               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4664             }
4665
4666           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4667           prev_phi_info = vinfo_for_stmt (phi);
4668         }
4669     }
4670
4671   /* The epilogue is created for the outer-loop, i.e., for the loop being
4672      vectorized.  Create exit phis for the outer loop.  */
4673   if (double_reduc)
4674     {
4675       loop = outer_loop;
4676       exit_bb = single_exit (loop)->dest;
4677       inner_phis.create (vect_defs.length ());
4678       FOR_EACH_VEC_ELT (new_phis, i, phi)
4679         {
4680           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4681           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4682           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4683                            PHI_RESULT (phi));
4684           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4685                                                             loop_vinfo));
4686           inner_phis.quick_push (phi);
4687           new_phis[i] = outer_phi;
4688           prev_phi_info = vinfo_for_stmt (outer_phi);
4689           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4690             {
4691               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4692               new_result = copy_ssa_name (PHI_RESULT (phi));
4693               outer_phi = create_phi_node (new_result, exit_bb);
4694               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4695                                PHI_RESULT (phi));
4696               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4697                                                                 loop_vinfo));
4698               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4699               prev_phi_info = vinfo_for_stmt (outer_phi);
4700             }
4701         }
4702     }
4703
4704   exit_gsi = gsi_after_labels (exit_bb);
4705
4706   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4707          (i.e. when reduc_fn is not available) and in the final adjustment
4708          code (if needed).  Also get the original scalar reduction variable as
4709          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4710          represents a reduction pattern), the tree-code and scalar-def are
4711          taken from the original stmt that the pattern-stmt (STMT) replaces.
4712          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4713          are taken from STMT.  */
4714
4715   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4716   if (!orig_stmt)
4717     {
4718       /* Regular reduction  */
4719       orig_stmt = stmt;
4720     }
4721   else
4722     {
4723       /* Reduction pattern  */
4724       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4725       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4726       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4727     }
4728
4729   code = gimple_assign_rhs_code (orig_stmt);
4730   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4731      partial results are added and not subtracted.  */
4732   if (code == MINUS_EXPR)
4733     code = PLUS_EXPR;
4734
4735   scalar_dest = gimple_assign_lhs (orig_stmt);
4736   scalar_type = TREE_TYPE (scalar_dest);
4737   scalar_results.create (group_size);
4738   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4739   bitsize = TYPE_SIZE (scalar_type);
4740
4741   /* In case this is a reduction in an inner-loop while vectorizing an outer
4742      loop - we don't need to extract a single scalar result at the end of the
4743      inner-loop (unless it is double reduction, i.e., the use of reduction is
4744      outside the outer-loop).  The final vector of partial results will be used
4745      in the vectorized outer-loop, or reduced to a scalar result at the end of
4746      the outer-loop.  */
4747   if (nested_in_vect_loop && !double_reduc)
4748     goto vect_finalize_reduction;
4749
4750   /* SLP reduction without reduction chain, e.g.,
4751      # a1 = phi <a2, a0>
4752      # b1 = phi <b2, b0>
4753      a2 = operation (a1)
4754      b2 = operation (b1)  */
4755   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4756
4757   /* In case of reduction chain, e.g.,
4758      # a1 = phi <a3, a0>
4759      a2 = operation (a1)
4760      a3 = operation (a2),
4761
4762      we may end up with more than one vector result.  Here we reduce them to
4763      one vector.  */
4764   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4765     {
4766       tree first_vect = PHI_RESULT (new_phis[0]);
4767       gassign *new_vec_stmt = NULL;
4768       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4769       for (k = 1; k < new_phis.length (); k++)
4770         {
4771           gimple *next_phi = new_phis[k];
4772           tree second_vect = PHI_RESULT (next_phi);
4773           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4774           new_vec_stmt = gimple_build_assign (tem, code,
4775                                               first_vect, second_vect);
4776           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4777           first_vect = tem;
4778         }
4779
4780       new_phi_result = first_vect;
4781       if (new_vec_stmt)
4782         {
4783           new_phis.truncate (0);
4784           new_phis.safe_push (new_vec_stmt);
4785         }
4786     }
4787   /* Likewise if we couldn't use a single defuse cycle.  */
4788   else if (ncopies > 1)
4789     {
4790       gcc_assert (new_phis.length () == 1);
4791       tree first_vect = PHI_RESULT (new_phis[0]);
4792       gassign *new_vec_stmt = NULL;
4793       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4794       gimple *next_phi = new_phis[0];
4795       for (int k = 1; k < ncopies; ++k)
4796         {
4797           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4798           tree second_vect = PHI_RESULT (next_phi);
4799           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4800           new_vec_stmt = gimple_build_assign (tem, code,
4801                                               first_vect, second_vect);
4802           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4803           first_vect = tem;
4804         }
4805       new_phi_result = first_vect;
4806       new_phis.truncate (0);
4807       new_phis.safe_push (new_vec_stmt);
4808     }
4809   else
4810     new_phi_result = PHI_RESULT (new_phis[0]);
4811
4812   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4813       && reduc_fn != IFN_LAST)
4814     {
4815       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4816          various data values where the condition matched and another vector
4817          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4818          need to extract the last matching index (which will be the index with
4819          highest value) and use this to index into the data vector.
4820          For the case where there were no matches, the data vector will contain
4821          all default values and the index vector will be all zeros.  */
4822
4823       /* Get various versions of the type of the vector of indexes.  */
4824       tree index_vec_type = TREE_TYPE (induction_index);
4825       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4826       tree index_scalar_type = TREE_TYPE (index_vec_type);
4827       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4828         (index_vec_type);
4829
4830       /* Get an unsigned integer version of the type of the data vector.  */
4831       int scalar_precision
4832         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4833       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4834       tree vectype_unsigned = build_vector_type
4835         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4836
4837       /* First we need to create a vector (ZERO_VEC) of zeros and another
4838          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4839          can create using a MAX reduction and then expanding.
4840          In the case where the loop never made any matches, the max index will
4841          be zero.  */
4842
4843       /* Vector of {0, 0, 0,...}.  */
4844       tree zero_vec = make_ssa_name (vectype);
4845       tree zero_vec_rhs = build_zero_cst (vectype);
4846       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4847       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4848
4849       /* Find maximum value from the vector of found indexes.  */
4850       tree max_index = make_ssa_name (index_scalar_type);
4851       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4852                                                           1, induction_index);
4853       gimple_call_set_lhs (max_index_stmt, max_index);
4854       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4855
4856       /* Vector of {max_index, max_index, max_index,...}.  */
4857       tree max_index_vec = make_ssa_name (index_vec_type);
4858       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4859                                                       max_index);
4860       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4861                                                         max_index_vec_rhs);
4862       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4863
4864       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4865          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4866          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4867          otherwise.  Only one value should match, resulting in a vector
4868          (VEC_COND) with one data value and the rest zeros.
4869          In the case where the loop never made any matches, every index will
4870          match, resulting in a vector with all data values (which will all be
4871          the default value).  */
4872
4873       /* Compare the max index vector to the vector of found indexes to find
4874          the position of the max value.  */
4875       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4876       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4877                                                       induction_index,
4878                                                       max_index_vec);
4879       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4880
4881       /* Use the compare to choose either values from the data vector or
4882          zero.  */
4883       tree vec_cond = make_ssa_name (vectype);
4884       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4885                                                    vec_compare, new_phi_result,
4886                                                    zero_vec);
4887       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4888
4889       /* Finally we need to extract the data value from the vector (VEC_COND)
4890          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
4891          reduction, but because this doesn't exist, we can use a MAX reduction
4892          instead.  The data value might be signed or a float so we need to cast
4893          it first.
4894          In the case where the loop never made any matches, the data values are
4895          all identical, and so will reduce down correctly.  */
4896
4897       /* Make the matched data values unsigned.  */
4898       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4899       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4900                                        vec_cond);
4901       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4902                                                         VIEW_CONVERT_EXPR,
4903                                                         vec_cond_cast_rhs);
4904       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4905
4906       /* Reduce down to a scalar value.  */
4907       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4908       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4909                                                            1, vec_cond_cast);
4910       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4911       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4912
4913       /* Convert the reduced value back to the result type and set as the
4914          result.  */
4915       gimple_seq stmts = NULL;
4916       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4917                                data_reduc);
4918       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4919       scalar_results.safe_push (new_temp);
4920     }
4921   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4922            && reduc_fn == IFN_LAST)
4923     {
4924       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
4925          idx = 0;
4926          idx_val = induction_index[0];
4927          val = data_reduc[0];
4928          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4929            if (induction_index[i] > idx_val)
4930              val = data_reduc[i], idx_val = induction_index[i];
4931          return val;  */
4932
4933       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4934       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4935       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4936       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4937       /* Enforced by vectorizable_reduction, which ensures we have target
4938          support before allowing a conditional reduction on variable-length
4939          vectors.  */
4940       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
4941       tree idx_val = NULL_TREE, val = NULL_TREE;
4942       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4943         {
4944           tree old_idx_val = idx_val;
4945           tree old_val = val;
4946           idx_val = make_ssa_name (idx_eltype);
4947           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4948                                              build3 (BIT_FIELD_REF, idx_eltype,
4949                                                      induction_index,
4950                                                      bitsize_int (el_size),
4951                                                      bitsize_int (off)));
4952           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4953           val = make_ssa_name (data_eltype);
4954           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4955                                              build3 (BIT_FIELD_REF,
4956                                                      data_eltype,
4957                                                      new_phi_result,
4958                                                      bitsize_int (el_size),
4959                                                      bitsize_int (off)));
4960           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4961           if (off != 0)
4962             {
4963               tree new_idx_val = idx_val;
4964               tree new_val = val;
4965               if (off != v_size - el_size)
4966                 {
4967                   new_idx_val = make_ssa_name (idx_eltype);
4968                   epilog_stmt = gimple_build_assign (new_idx_val,
4969                                                      MAX_EXPR, idx_val,
4970                                                      old_idx_val);
4971                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4972                 }
4973               new_val = make_ssa_name (data_eltype);
4974               epilog_stmt = gimple_build_assign (new_val,
4975                                                  COND_EXPR,
4976                                                  build2 (GT_EXPR,
4977                                                          boolean_type_node,
4978                                                          idx_val,
4979                                                          old_idx_val),
4980                                                  val, old_val);
4981               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4982               idx_val = new_idx_val;
4983               val = new_val;
4984             }
4985         }
4986       /* Convert the reduced value back to the result type and set as the
4987          result.  */
4988       gimple_seq stmts = NULL;
4989       val = gimple_convert (&stmts, scalar_type, val);
4990       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4991       scalar_results.safe_push (val);
4992     }
4993
4994   /* 2.3 Create the reduction code, using one of the three schemes described
4995          above. In SLP we simply need to extract all the elements from the
4996          vector (without reducing them), so we use scalar shifts.  */
4997   else if (reduc_fn != IFN_LAST && !slp_reduc)
4998     {
4999       tree tmp;
5000       tree vec_elem_type;
5001
5002       /* Case 1:  Create:
5003          v_out2 = reduc_expr <v_out1>  */
5004
5005       if (dump_enabled_p ())
5006         dump_printf_loc (MSG_NOTE, vect_location,
5007                          "Reduce using direct vector reduction.\n");
5008
5009       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5010       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5011         {
5012           tree tmp_dest
5013             = vect_create_destination_var (scalar_dest, vec_elem_type);
5014           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5015                                                     new_phi_result);
5016           gimple_set_lhs (epilog_stmt, tmp_dest);
5017           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5018           gimple_set_lhs (epilog_stmt, new_temp);
5019           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5020
5021           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5022                                              new_temp);
5023         }
5024       else
5025         {
5026           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5027                                                     new_phi_result);
5028           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5029         }
5030
5031       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5032       gimple_set_lhs (epilog_stmt, new_temp);
5033       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5034
5035       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5036            == INTEGER_INDUC_COND_REDUCTION)
5037           && !operand_equal_p (initial_def, induc_val, 0))
5038         {
5039           /* Earlier we set the initial value to be a vector if induc_val
5040              values.  Check the result and if it is induc_val then replace
5041              with the original initial value, unless induc_val is
5042              the same as initial_def already.  */
5043           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5044                                   induc_val);
5045
5046           tmp = make_ssa_name (new_scalar_dest);
5047           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5048                                              initial_def, new_temp);
5049           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5050           new_temp = tmp;
5051         }
5052
5053       scalar_results.safe_push (new_temp);
5054     }
5055   else
5056     {
5057       bool reduce_with_shift = have_whole_vector_shift (mode);
5058       int element_bitsize = tree_to_uhwi (bitsize);
5059       /* Enforced by vectorizable_reduction, which disallows SLP reductions
5060          for variable-length vectors and also requires direct target support
5061          for loop reductions.  */
5062       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5063       tree vec_temp;
5064
5065       /* COND reductions all do the final reduction with MAX_EXPR
5066          or MIN_EXPR.  */
5067       if (code == COND_EXPR)
5068         {
5069           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5070               == INTEGER_INDUC_COND_REDUCTION)
5071             code = induc_code;
5072           else
5073             code = MAX_EXPR;
5074         }
5075
5076       /* Regardless of whether we have a whole vector shift, if we're
5077          emulating the operation via tree-vect-generic, we don't want
5078          to use it.  Only the first round of the reduction is likely
5079          to still be profitable via emulation.  */
5080       /* ??? It might be better to emit a reduction tree code here, so that
5081          tree-vect-generic can expand the first round via bit tricks.  */
5082       if (!VECTOR_MODE_P (mode))
5083         reduce_with_shift = false;
5084       else
5085         {
5086           optab optab = optab_for_tree_code (code, vectype, optab_default);
5087           if (optab_handler (optab, mode) == CODE_FOR_nothing)
5088             reduce_with_shift = false;
5089         }
5090
5091       if (reduce_with_shift && !slp_reduc)
5092         {
5093           int nelements = vec_size_in_bits / element_bitsize;
5094           vec_perm_builder sel;
5095           vec_perm_indices indices;
5096
5097           int elt_offset;
5098
5099           tree zero_vec = build_zero_cst (vectype);
5100           /* Case 2: Create:
5101              for (offset = nelements/2; offset >= 1; offset/=2)
5102                 {
5103                   Create:  va' = vec_shift <va, offset>
5104                   Create:  va = vop <va, va'>
5105                 }  */
5106
5107           tree rhs;
5108
5109           if (dump_enabled_p ())
5110             dump_printf_loc (MSG_NOTE, vect_location,
5111                              "Reduce using vector shifts\n");
5112
5113           vec_dest = vect_create_destination_var (scalar_dest, vectype);
5114           new_temp = new_phi_result;
5115           for (elt_offset = nelements / 2;
5116                elt_offset >= 1;
5117                elt_offset /= 2)
5118             {
5119               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5120               indices.new_vector (sel, 2, nelements);
5121               tree mask = vect_gen_perm_mask_any (vectype, indices);
5122               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5123                                                  new_temp, zero_vec, mask);
5124               new_name = make_ssa_name (vec_dest, epilog_stmt);
5125               gimple_assign_set_lhs (epilog_stmt, new_name);
5126               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5127
5128               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5129                                                  new_temp);
5130               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5131               gimple_assign_set_lhs (epilog_stmt, new_temp);
5132               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5133             }
5134
5135           /* 2.4  Extract the final scalar result.  Create:
5136              s_out3 = extract_field <v_out2, bitpos>  */
5137
5138           if (dump_enabled_p ())
5139             dump_printf_loc (MSG_NOTE, vect_location,
5140                              "extract scalar result\n");
5141
5142           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5143                         bitsize, bitsize_zero_node);
5144           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5145           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5146           gimple_assign_set_lhs (epilog_stmt, new_temp);
5147           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5148           scalar_results.safe_push (new_temp);
5149         }
5150       else
5151         {
5152           /* Case 3: Create:
5153              s = extract_field <v_out2, 0>
5154              for (offset = element_size;
5155                   offset < vector_size;
5156                   offset += element_size;)
5157                {
5158                  Create:  s' = extract_field <v_out2, offset>
5159                  Create:  s = op <s, s'>  // For non SLP cases
5160                }  */
5161
5162           if (dump_enabled_p ())
5163             dump_printf_loc (MSG_NOTE, vect_location,
5164                              "Reduce using scalar code.\n");
5165
5166           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5167           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5168             {
5169               int bit_offset;
5170               if (gimple_code (new_phi) == GIMPLE_PHI)
5171                 vec_temp = PHI_RESULT (new_phi);
5172               else
5173                 vec_temp = gimple_assign_lhs (new_phi);
5174               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5175                                  bitsize_zero_node);
5176               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5177               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5178               gimple_assign_set_lhs (epilog_stmt, new_temp);
5179               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5180
5181               /* In SLP we don't need to apply reduction operation, so we just
5182                  collect s' values in SCALAR_RESULTS.  */
5183               if (slp_reduc)
5184                 scalar_results.safe_push (new_temp);
5185
5186               for (bit_offset = element_bitsize;
5187                    bit_offset < vec_size_in_bits;
5188                    bit_offset += element_bitsize)
5189                 {
5190                   tree bitpos = bitsize_int (bit_offset);
5191                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5192                                      bitsize, bitpos);
5193
5194                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5195                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5196                   gimple_assign_set_lhs (epilog_stmt, new_name);
5197                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5198
5199                   if (slp_reduc)
5200                     {
5201                       /* In SLP we don't need to apply reduction operation, so
5202                          we just collect s' values in SCALAR_RESULTS.  */
5203                       new_temp = new_name;
5204                       scalar_results.safe_push (new_name);
5205                     }
5206                   else
5207                     {
5208                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5209                                                          new_name, new_temp);
5210                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5211                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5212                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5213                     }
5214                 }
5215             }
5216
5217           /* The only case where we need to reduce scalar results in SLP, is
5218              unrolling.  If the size of SCALAR_RESULTS is greater than
5219              GROUP_SIZE, we reduce them combining elements modulo
5220              GROUP_SIZE.  */
5221           if (slp_reduc)
5222             {
5223               tree res, first_res, new_res;
5224               gimple *new_stmt;
5225
5226               /* Reduce multiple scalar results in case of SLP unrolling.  */
5227               for (j = group_size; scalar_results.iterate (j, &res);
5228                    j++)
5229                 {
5230                   first_res = scalar_results[j % group_size];
5231                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5232                                                   first_res, res);
5233                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5234                   gimple_assign_set_lhs (new_stmt, new_res);
5235                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5236                   scalar_results[j % group_size] = new_res;
5237                 }
5238             }
5239           else
5240             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5241             scalar_results.safe_push (new_temp);
5242         }
5243
5244       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5245            == INTEGER_INDUC_COND_REDUCTION)
5246           && !operand_equal_p (initial_def, induc_val, 0))
5247         {
5248           /* Earlier we set the initial value to be a vector if induc_val
5249              values.  Check the result and if it is induc_val then replace
5250              with the original initial value, unless induc_val is
5251              the same as initial_def already.  */
5252           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5253                                   induc_val);
5254
5255           tree tmp = make_ssa_name (new_scalar_dest);
5256           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5257                                              initial_def, new_temp);
5258           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5259           scalar_results[0] = tmp;
5260         }
5261     }
5262
5263 vect_finalize_reduction:
5264
5265   if (double_reduc)
5266     loop = loop->inner;
5267
5268   /* 2.5 Adjust the final result by the initial value of the reduction
5269          variable. (When such adjustment is not needed, then
5270          'adjustment_def' is zero).  For example, if code is PLUS we create:
5271          new_temp = loop_exit_def + adjustment_def  */
5272
5273   if (adjustment_def)
5274     {
5275       gcc_assert (!slp_reduc);
5276       if (nested_in_vect_loop)
5277         {
5278           new_phi = new_phis[0];
5279           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5280           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5281           new_dest = vect_create_destination_var (scalar_dest, vectype);
5282         }
5283       else
5284         {
5285           new_temp = scalar_results[0];
5286           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5287           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5288           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5289         }
5290
5291       epilog_stmt = gimple_build_assign (new_dest, expr);
5292       new_temp = make_ssa_name (new_dest, epilog_stmt);
5293       gimple_assign_set_lhs (epilog_stmt, new_temp);
5294       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5295       if (nested_in_vect_loop)
5296         {
5297           set_vinfo_for_stmt (epilog_stmt,
5298                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5299           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5300                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5301
5302           if (!double_reduc)
5303             scalar_results.quick_push (new_temp);
5304           else
5305             scalar_results[0] = new_temp;
5306         }
5307       else
5308         scalar_results[0] = new_temp;
5309
5310       new_phis[0] = epilog_stmt;
5311     }
5312
5313   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5314           phis with new adjusted scalar results, i.e., replace use <s_out0>
5315           with use <s_out4>.
5316
5317      Transform:
5318         loop_exit:
5319           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5320           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5321           v_out2 = reduce <v_out1>
5322           s_out3 = extract_field <v_out2, 0>
5323           s_out4 = adjust_result <s_out3>
5324           use <s_out0>
5325           use <s_out0>
5326
5327      into:
5328
5329         loop_exit:
5330           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5331           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5332           v_out2 = reduce <v_out1>
5333           s_out3 = extract_field <v_out2, 0>
5334           s_out4 = adjust_result <s_out3>
5335           use <s_out4>
5336           use <s_out4> */
5337
5338
5339   /* In SLP reduction chain we reduce vector results into one vector if
5340      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5341      the last stmt in the reduction chain, since we are looking for the loop
5342      exit phi node.  */
5343   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5344     {
5345       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5346       /* Handle reduction patterns.  */
5347       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5348         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5349
5350       scalar_dest = gimple_assign_lhs (dest_stmt);
5351       group_size = 1;
5352     }
5353
5354   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5355      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5356      need to match SCALAR_RESULTS with corresponding statements.  The first
5357      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5358      the first vector stmt, etc.
5359      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5360   if (group_size > new_phis.length ())
5361     {
5362       ratio = group_size / new_phis.length ();
5363       gcc_assert (!(group_size % new_phis.length ()));
5364     }
5365   else
5366     ratio = 1;
5367
5368   for (k = 0; k < group_size; k++)
5369     {
5370       if (k % ratio == 0)
5371         {
5372           epilog_stmt = new_phis[k / ratio];
5373           reduction_phi = reduction_phis[k / ratio];
5374           if (double_reduc)
5375             inner_phi = inner_phis[k / ratio];
5376         }
5377
5378       if (slp_reduc)
5379         {
5380           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5381
5382           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5383           /* SLP statements can't participate in patterns.  */
5384           gcc_assert (!orig_stmt);
5385           scalar_dest = gimple_assign_lhs (current_stmt);
5386         }
5387
5388       phis.create (3);
5389       /* Find the loop-closed-use at the loop exit of the original scalar
5390          result.  (The reduction result is expected to have two immediate uses -
5391          one at the latch block, and one at the loop exit).  */
5392       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5393         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5394             && !is_gimple_debug (USE_STMT (use_p)))
5395           phis.safe_push (USE_STMT (use_p));
5396
5397       /* While we expect to have found an exit_phi because of loop-closed-ssa
5398          form we can end up without one if the scalar cycle is dead.  */
5399
5400       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5401         {
5402           if (outer_loop)
5403             {
5404               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5405               gphi *vect_phi;
5406
5407               /* FORNOW. Currently not supporting the case that an inner-loop
5408                  reduction is not used in the outer-loop (but only outside the
5409                  outer-loop), unless it is double reduction.  */
5410               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5411                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5412                           || double_reduc);
5413
5414               if (double_reduc)
5415                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5416               else
5417                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5418               if (!double_reduc
5419                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5420                       != vect_double_reduction_def)
5421                 continue;
5422
5423               /* Handle double reduction:
5424
5425                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5426                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5427                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5428                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5429
5430                  At that point the regular reduction (stmt2 and stmt3) is
5431                  already vectorized, as well as the exit phi node, stmt4.
5432                  Here we vectorize the phi node of double reduction, stmt1, and
5433                  update all relevant statements.  */
5434
5435               /* Go through all the uses of s2 to find double reduction phi
5436                  node, i.e., stmt1 above.  */
5437               orig_name = PHI_RESULT (exit_phi);
5438               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5439                 {
5440                   stmt_vec_info use_stmt_vinfo;
5441                   stmt_vec_info new_phi_vinfo;
5442                   tree vect_phi_init, preheader_arg, vect_phi_res;
5443                   basic_block bb = gimple_bb (use_stmt);
5444                   gimple *use;
5445
5446                   /* Check that USE_STMT is really double reduction phi
5447                      node.  */
5448                   if (gimple_code (use_stmt) != GIMPLE_PHI
5449                       || gimple_phi_num_args (use_stmt) != 2
5450                       || bb->loop_father != outer_loop)
5451                     continue;
5452                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5453                   if (!use_stmt_vinfo
5454                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5455                           != vect_double_reduction_def)
5456                     continue;
5457
5458                   /* Create vector phi node for double reduction:
5459                      vs1 = phi <vs0, vs2>
5460                      vs1 was created previously in this function by a call to
5461                        vect_get_vec_def_for_operand and is stored in
5462                        vec_initial_def;
5463                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5464                      vs0 is created here.  */
5465
5466                   /* Create vector phi node.  */
5467                   vect_phi = create_phi_node (vec_initial_def, bb);
5468                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5469                                     loop_vec_info_for_loop (outer_loop));
5470                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5471
5472                   /* Create vs0 - initial def of the double reduction phi.  */
5473                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5474                                              loop_preheader_edge (outer_loop));
5475                   vect_phi_init = get_initial_def_for_reduction
5476                     (stmt, preheader_arg, NULL);
5477
5478                   /* Update phi node arguments with vs0 and vs2.  */
5479                   add_phi_arg (vect_phi, vect_phi_init,
5480                                loop_preheader_edge (outer_loop),
5481                                UNKNOWN_LOCATION);
5482                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5483                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5484                   if (dump_enabled_p ())
5485                     {
5486                       dump_printf_loc (MSG_NOTE, vect_location,
5487                                        "created double reduction phi node: ");
5488                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5489                     }
5490
5491                   vect_phi_res = PHI_RESULT (vect_phi);
5492
5493                   /* Replace the use, i.e., set the correct vs1 in the regular
5494                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5495                      loop is redundant.  */
5496                   use = reduction_phi;
5497                   for (j = 0; j < ncopies; j++)
5498                     {
5499                       edge pr_edge = loop_preheader_edge (loop);
5500                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5501                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5502                     }
5503                 }
5504             }
5505         }
5506
5507       phis.release ();
5508       if (nested_in_vect_loop)
5509         {
5510           if (double_reduc)
5511             loop = outer_loop;
5512           else
5513             continue;
5514         }
5515
5516       phis.create (3);
5517       /* Find the loop-closed-use at the loop exit of the original scalar
5518          result.  (The reduction result is expected to have two immediate uses,
5519          one at the latch block, and one at the loop exit).  For double
5520          reductions we are looking for exit phis of the outer loop.  */
5521       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5522         {
5523           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5524             {
5525               if (!is_gimple_debug (USE_STMT (use_p)))
5526                 phis.safe_push (USE_STMT (use_p));
5527             }
5528           else
5529             {
5530               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5531                 {
5532                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5533
5534                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5535                     {
5536                       if (!flow_bb_inside_loop_p (loop,
5537                                              gimple_bb (USE_STMT (phi_use_p)))
5538                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5539                         phis.safe_push (USE_STMT (phi_use_p));
5540                     }
5541                 }
5542             }
5543         }
5544
5545       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5546         {
5547           /* Replace the uses:  */
5548           orig_name = PHI_RESULT (exit_phi);
5549           scalar_result = scalar_results[k];
5550           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5551             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5552               SET_USE (use_p, scalar_result);
5553         }
5554
5555       phis.release ();
5556     }
5557 }
5558
5559
5560 /* Function is_nonwrapping_integer_induction.
5561
5562    Check if STMT (which is part of loop LOOP) both increments and
5563    does not cause overflow.  */
5564
5565 static bool
5566 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5567 {
5568   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5569   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5570   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5571   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5572   widest_int ni, max_loop_value, lhs_max;
5573   bool overflow = false;
5574
5575   /* Make sure the loop is integer based.  */
5576   if (TREE_CODE (base) != INTEGER_CST
5577       || TREE_CODE (step) != INTEGER_CST)
5578     return false;
5579
5580   /* Check that the max size of the loop will not wrap.  */
5581
5582   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5583     return true;
5584
5585   if (! max_stmt_executions (loop, &ni))
5586     return false;
5587
5588   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5589                             &overflow);
5590   if (overflow)
5591     return false;
5592
5593   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5594                             TYPE_SIGN (lhs_type), &overflow);
5595   if (overflow)
5596     return false;
5597
5598   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5599           <= TYPE_PRECISION (lhs_type));
5600 }
5601
5602 /* Function vectorizable_reduction.
5603
5604    Check if STMT performs a reduction operation that can be vectorized.
5605    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5606    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5607    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5608
5609    This function also handles reduction idioms (patterns) that have been
5610    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5611    of this form:
5612      X = pattern_expr (arg0, arg1, ..., X)
5613    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
5614    sequence that had been detected and replaced by the pattern-stmt (STMT).
5615
5616    This function also handles reduction of condition expressions, for example:
5617      for (int i = 0; i < N; i++)
5618        if (a[i] < value)
5619          last = a[i];
5620    This is handled by vectorising the loop and creating an additional vector
5621    containing the loop indexes for which "a[i] < value" was true.  In the
5622    function epilogue this is reduced to a single max value and then used to
5623    index into the vector of results.
5624
5625    In some cases of reduction patterns, the type of the reduction variable X is
5626    different than the type of the other arguments of STMT.
5627    In such cases, the vectype that is used when transforming STMT into a vector
5628    stmt is different than the vectype that is used to determine the
5629    vectorization factor, because it consists of a different number of elements
5630    than the actual number of elements that are being operated upon in parallel.
5631
5632    For example, consider an accumulation of shorts into an int accumulator.
5633    On some targets it's possible to vectorize this pattern operating on 8
5634    shorts at a time (hence, the vectype for purposes of determining the
5635    vectorization factor should be V8HI); on the other hand, the vectype that
5636    is used to create the vector form is actually V4SI (the type of the result).
5637
5638    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5639    indicates what is the actual level of parallelism (V8HI in the example), so
5640    that the right vectorization factor would be derived.  This vectype
5641    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5642    be used to create the vectorized stmt.  The right vectype for the vectorized
5643    stmt is obtained from the type of the result X:
5644         get_vectype_for_scalar_type (TREE_TYPE (X))
5645
5646    This means that, contrary to "regular" reductions (or "regular" stmts in
5647    general), the following equation:
5648       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5649    does *NOT* necessarily hold for reduction patterns.  */
5650
5651 bool
5652 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5653                         gimple **vec_stmt, slp_tree slp_node,
5654                         slp_instance slp_node_instance)
5655 {
5656   tree vec_dest;
5657   tree scalar_dest;
5658   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5659   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5660   tree vectype_in = NULL_TREE;
5661   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5662   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5663   enum tree_code code, orig_code;
5664   internal_fn reduc_fn;
5665   machine_mode vec_mode;
5666   int op_type;
5667   optab optab;
5668   tree new_temp = NULL_TREE;
5669   gimple *def_stmt;
5670   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5671   gimple *cond_reduc_def_stmt = NULL;
5672   enum tree_code cond_reduc_op_code = ERROR_MARK;
5673   tree scalar_type;
5674   bool is_simple_use;
5675   gimple *orig_stmt;
5676   stmt_vec_info orig_stmt_info = NULL;
5677   int i;
5678   int ncopies;
5679   int epilog_copies;
5680   stmt_vec_info prev_stmt_info, prev_phi_info;
5681   bool single_defuse_cycle = false;
5682   gimple *new_stmt = NULL;
5683   int j;
5684   tree ops[3];
5685   enum vect_def_type dts[3];
5686   bool nested_cycle = false, found_nested_cycle_def = false;
5687   bool double_reduc = false;
5688   basic_block def_bb;
5689   struct loop * def_stmt_loop, *outer_loop = NULL;
5690   tree def_arg;
5691   gimple *def_arg_stmt;
5692   auto_vec<tree> vec_oprnds0;
5693   auto_vec<tree> vec_oprnds1;
5694   auto_vec<tree> vec_oprnds2;
5695   auto_vec<tree> vect_defs;
5696   auto_vec<gimple *> phis;
5697   int vec_num;
5698   tree def0, tem;
5699   bool first_p = true;
5700   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5701   tree cond_reduc_val = NULL_TREE;
5702
5703   /* Make sure it was already recognized as a reduction computation.  */
5704   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5705       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5706     return false;
5707
5708   if (nested_in_vect_loop_p (loop, stmt))
5709     {
5710       outer_loop = loop;
5711       loop = loop->inner;
5712       nested_cycle = true;
5713     }
5714
5715   /* In case of reduction chain we switch to the first stmt in the chain, but
5716      we don't update STMT_INFO, since only the last stmt is marked as reduction
5717      and has reduction properties.  */
5718   if (GROUP_FIRST_ELEMENT (stmt_info)
5719       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5720     {
5721       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5722       first_p = false;
5723     }
5724
5725   if (gimple_code (stmt) == GIMPLE_PHI)
5726     {
5727       /* Analysis is fully done on the reduction stmt invocation.  */
5728       if (! vec_stmt)
5729         {
5730           if (slp_node)
5731             slp_node_instance->reduc_phis = slp_node;
5732
5733           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5734           return true;
5735         }
5736
5737       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5738       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5739         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5740
5741       gcc_assert (is_gimple_assign (reduc_stmt));
5742       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5743         {
5744           tree op = gimple_op (reduc_stmt, k);
5745           if (op == gimple_phi_result (stmt))
5746             continue;
5747           if (k == 1
5748               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5749             continue;
5750           if (!vectype_in
5751               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
5752                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
5753             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
5754           break;
5755         }
5756       gcc_assert (vectype_in);
5757
5758       if (slp_node)
5759         ncopies = 1;
5760       else
5761         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5762
5763       use_operand_p use_p;
5764       gimple *use_stmt;
5765       if (ncopies > 1
5766           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5767               <= vect_used_only_live)
5768           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5769           && (use_stmt == reduc_stmt
5770               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5771                   == reduc_stmt)))
5772         single_defuse_cycle = true;
5773
5774       /* Create the destination vector  */
5775       scalar_dest = gimple_assign_lhs (reduc_stmt);
5776       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5777
5778       if (slp_node)
5779         /* The size vect_schedule_slp_instance computes is off for us.  */
5780         vec_num = vect_get_num_vectors
5781           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5782            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
5783            vectype_in);
5784       else
5785         vec_num = 1;
5786
5787       /* Generate the reduction PHIs upfront.  */
5788       prev_phi_info = NULL;
5789       for (j = 0; j < ncopies; j++)
5790         {
5791           if (j == 0 || !single_defuse_cycle)
5792             {
5793               for (i = 0; i < vec_num; i++)
5794                 {
5795                   /* Create the reduction-phi that defines the reduction
5796                      operand.  */
5797                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
5798                   set_vinfo_for_stmt (new_phi,
5799                                       new_stmt_vec_info (new_phi, loop_vinfo));
5800
5801                   if (slp_node)
5802                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5803                   else
5804                     {
5805                       if (j == 0)
5806                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5807                       else
5808                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5809                       prev_phi_info = vinfo_for_stmt (new_phi);
5810                     }
5811                 }
5812             }
5813         }
5814
5815       return true;
5816     }
5817
5818   /* 1. Is vectorizable reduction?  */
5819   /* Not supportable if the reduction variable is used in the loop, unless
5820      it's a reduction chain.  */
5821   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5822       && !GROUP_FIRST_ELEMENT (stmt_info))
5823     return false;
5824
5825   /* Reductions that are not used even in an enclosing outer-loop,
5826      are expected to be "live" (used out of the loop).  */
5827   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5828       && !STMT_VINFO_LIVE_P (stmt_info))
5829     return false;
5830
5831   /* 2. Has this been recognized as a reduction pattern?
5832
5833      Check if STMT represents a pattern that has been recognized
5834      in earlier analysis stages.  For stmts that represent a pattern,
5835      the STMT_VINFO_RELATED_STMT field records the last stmt in
5836      the original sequence that constitutes the pattern.  */
5837
5838   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5839   if (orig_stmt)
5840     {
5841       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5842       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5843       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5844     }
5845
5846   /* 3. Check the operands of the operation.  The first operands are defined
5847         inside the loop body. The last operand is the reduction variable,
5848         which is defined by the loop-header-phi.  */
5849
5850   gcc_assert (is_gimple_assign (stmt));
5851
5852   /* Flatten RHS.  */
5853   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5854     {
5855     case GIMPLE_BINARY_RHS:
5856       code = gimple_assign_rhs_code (stmt);
5857       op_type = TREE_CODE_LENGTH (code);
5858       gcc_assert (op_type == binary_op);
5859       ops[0] = gimple_assign_rhs1 (stmt);
5860       ops[1] = gimple_assign_rhs2 (stmt);
5861       break;
5862
5863     case GIMPLE_TERNARY_RHS:
5864       code = gimple_assign_rhs_code (stmt);
5865       op_type = TREE_CODE_LENGTH (code);
5866       gcc_assert (op_type == ternary_op);
5867       ops[0] = gimple_assign_rhs1 (stmt);
5868       ops[1] = gimple_assign_rhs2 (stmt);
5869       ops[2] = gimple_assign_rhs3 (stmt);
5870       break;
5871
5872     case GIMPLE_UNARY_RHS:
5873       return false;
5874
5875     default:
5876       gcc_unreachable ();
5877     }
5878
5879   if (code == COND_EXPR && slp_node)
5880     return false;
5881
5882   scalar_dest = gimple_assign_lhs (stmt);
5883   scalar_type = TREE_TYPE (scalar_dest);
5884   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5885       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5886     return false;
5887
5888   /* Do not try to vectorize bit-precision reductions.  */
5889   if (!type_has_mode_precision_p (scalar_type))
5890     return false;
5891
5892   /* All uses but the last are expected to be defined in the loop.
5893      The last use is the reduction variable.  In case of nested cycle this
5894      assumption is not true: we use reduc_index to record the index of the
5895      reduction variable.  */
5896   gimple *reduc_def_stmt = NULL;
5897   int reduc_index = -1;
5898   for (i = 0; i < op_type; i++)
5899     {
5900       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5901       if (i == 0 && code == COND_EXPR)
5902         continue;
5903
5904       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5905                                           &def_stmt, &dts[i], &tem);
5906       dt = dts[i];
5907       gcc_assert (is_simple_use);
5908       if (dt == vect_reduction_def)
5909         {
5910           reduc_def_stmt = def_stmt;
5911           reduc_index = i;
5912           continue;
5913         }
5914       else if (tem)
5915         {
5916           /* To properly compute ncopies we are interested in the widest
5917              input type in case we're looking at a widening accumulation.  */
5918           if (!vectype_in
5919               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
5920                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
5921             vectype_in = tem;
5922         }
5923
5924       if (dt != vect_internal_def
5925           && dt != vect_external_def
5926           && dt != vect_constant_def
5927           && dt != vect_induction_def
5928           && !(dt == vect_nested_cycle && nested_cycle))
5929         return false;
5930
5931       if (dt == vect_nested_cycle)
5932         {
5933           found_nested_cycle_def = true;
5934           reduc_def_stmt = def_stmt;
5935           reduc_index = i;
5936         }
5937
5938       if (i == 1 && code == COND_EXPR)
5939         {
5940           /* Record how value of COND_EXPR is defined.  */
5941           if (dt == vect_constant_def)
5942             {
5943               cond_reduc_dt = dt;
5944               cond_reduc_val = ops[i];
5945             }
5946           if (dt == vect_induction_def
5947               && def_stmt != NULL
5948               && is_nonwrapping_integer_induction (def_stmt, loop))
5949             {
5950               cond_reduc_dt = dt;
5951               cond_reduc_def_stmt = def_stmt;
5952             }
5953         }
5954     }
5955
5956   if (!vectype_in)
5957     vectype_in = vectype_out;
5958
5959   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5960      directy used in stmt.  */
5961   if (reduc_index == -1)
5962     {
5963       if (orig_stmt)
5964         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5965       else
5966         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5967     }
5968
5969   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5970     return false;
5971
5972   if (!(reduc_index == -1
5973         || dts[reduc_index] == vect_reduction_def
5974         || dts[reduc_index] == vect_nested_cycle
5975         || ((dts[reduc_index] == vect_internal_def
5976              || dts[reduc_index] == vect_external_def
5977              || dts[reduc_index] == vect_constant_def
5978              || dts[reduc_index] == vect_induction_def)
5979             && nested_cycle && found_nested_cycle_def)))
5980     {
5981       /* For pattern recognized stmts, orig_stmt might be a reduction,
5982          but some helper statements for the pattern might not, or
5983          might be COND_EXPRs with reduction uses in the condition.  */
5984       gcc_assert (orig_stmt);
5985       return false;
5986     }
5987
5988   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5989   enum vect_reduction_type v_reduc_type
5990     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5991   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5992
5993   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5994   /* If we have a condition reduction, see if we can simplify it further.  */
5995   if (v_reduc_type == COND_REDUCTION)
5996     {
5997       if (cond_reduc_dt == vect_induction_def)
5998         {
5999           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6000           tree base
6001             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6002           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6003
6004           gcc_assert (TREE_CODE (base) == INTEGER_CST
6005                       && TREE_CODE (step) == INTEGER_CST);
6006           cond_reduc_val = NULL_TREE;
6007           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6008              above base; punt if base is the minimum value of the type for
6009              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6010           if (tree_int_cst_sgn (step) == -1)
6011             {
6012               cond_reduc_op_code = MIN_EXPR;
6013               if (tree_int_cst_sgn (base) == -1)
6014                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6015               else if (tree_int_cst_lt (base,
6016                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6017                 cond_reduc_val
6018                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6019             }
6020           else
6021             {
6022               cond_reduc_op_code = MAX_EXPR;
6023               if (tree_int_cst_sgn (base) == 1)
6024                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6025               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6026                                         base))
6027                 cond_reduc_val
6028                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6029             }
6030           if (cond_reduc_val)
6031             {
6032               if (dump_enabled_p ())
6033                 dump_printf_loc (MSG_NOTE, vect_location,
6034                                  "condition expression based on "
6035                                  "integer induction.\n");
6036               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6037                 = INTEGER_INDUC_COND_REDUCTION;
6038             }
6039         }
6040
6041       /* Loop peeling modifies initial value of reduction PHI, which
6042          makes the reduction stmt to be transformed different to the
6043          original stmt analyzed.  We need to record reduction code for
6044          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6045          it can be used directly at transform stage.  */
6046       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6047           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6048         {
6049           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6050           gcc_assert (cond_reduc_dt == vect_constant_def);
6051           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6052         }
6053       else if (cond_reduc_dt == vect_constant_def)
6054         {
6055           enum vect_def_type cond_initial_dt;
6056           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6057           tree cond_initial_val
6058             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6059
6060           gcc_assert (cond_reduc_val != NULL_TREE);
6061           vect_is_simple_use (cond_initial_val, loop_vinfo,
6062                               &def_stmt, &cond_initial_dt);
6063           if (cond_initial_dt == vect_constant_def
6064               && types_compatible_p (TREE_TYPE (cond_initial_val),
6065                                      TREE_TYPE (cond_reduc_val)))
6066             {
6067               tree e = fold_binary (LE_EXPR, boolean_type_node,
6068                                     cond_initial_val, cond_reduc_val);
6069               if (e && (integer_onep (e) || integer_zerop (e)))
6070                 {
6071                   if (dump_enabled_p ())
6072                     dump_printf_loc (MSG_NOTE, vect_location,
6073                                      "condition expression based on "
6074                                      "compile time constant.\n");
6075                   /* Record reduction code at analysis stage.  */
6076                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6077                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6078                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6079                     = CONST_COND_REDUCTION;
6080                 }
6081             }
6082         }
6083     }
6084
6085   if (orig_stmt)
6086     gcc_assert (tmp == orig_stmt
6087                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6088   else
6089     /* We changed STMT to be the first stmt in reduction chain, hence we
6090        check that in this case the first element in the chain is STMT.  */
6091     gcc_assert (stmt == tmp
6092                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6093
6094   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6095     return false;
6096
6097   if (slp_node)
6098     ncopies = 1;
6099   else
6100     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6101
6102   gcc_assert (ncopies >= 1);
6103
6104   vec_mode = TYPE_MODE (vectype_in);
6105   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6106
6107   if (code == COND_EXPR)
6108     {
6109       /* Only call during the analysis stage, otherwise we'll lose
6110          STMT_VINFO_TYPE.  */
6111       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6112                                                 ops[reduc_index], 0, NULL))
6113         {
6114           if (dump_enabled_p ())
6115             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6116                              "unsupported condition in reduction\n");
6117           return false;
6118         }
6119     }
6120   else
6121     {
6122       /* 4. Supportable by target?  */
6123
6124       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6125           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6126         {
6127           /* Shifts and rotates are only supported by vectorizable_shifts,
6128              not vectorizable_reduction.  */
6129           if (dump_enabled_p ())
6130             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6131                              "unsupported shift or rotation.\n");
6132           return false;
6133         }
6134
6135       /* 4.1. check support for the operation in the loop  */
6136       optab = optab_for_tree_code (code, vectype_in, optab_default);
6137       if (!optab)
6138         {
6139           if (dump_enabled_p ())
6140             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6141                              "no optab.\n");
6142
6143           return false;
6144         }
6145
6146       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6147         {
6148           if (dump_enabled_p ())
6149             dump_printf (MSG_NOTE, "op not supported by target.\n");
6150
6151           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6152               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6153             return false;
6154
6155           if (dump_enabled_p ())
6156             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6157         }
6158
6159       /* Worthwhile without SIMD support?  */
6160       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6161           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6162         {
6163           if (dump_enabled_p ())
6164             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6165                              "not worthwhile without SIMD support.\n");
6166
6167           return false;
6168         }
6169     }
6170
6171   /* 4.2. Check support for the epilog operation.
6172
6173           If STMT represents a reduction pattern, then the type of the
6174           reduction variable may be different than the type of the rest
6175           of the arguments.  For example, consider the case of accumulation
6176           of shorts into an int accumulator; The original code:
6177                         S1: int_a = (int) short_a;
6178           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6179
6180           was replaced with:
6181                         STMT: int_acc = widen_sum <short_a, int_acc>
6182
6183           This means that:
6184           1. The tree-code that is used to create the vector operation in the
6185              epilog code (that reduces the partial results) is not the
6186              tree-code of STMT, but is rather the tree-code of the original
6187              stmt from the pattern that STMT is replacing.  I.e, in the example
6188              above we want to use 'widen_sum' in the loop, but 'plus' in the
6189              epilog.
6190           2. The type (mode) we use to check available target support
6191              for the vector operation to be created in the *epilog*, is
6192              determined by the type of the reduction variable (in the example
6193              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6194              However the type (mode) we use to check available target support
6195              for the vector operation to be created *inside the loop*, is
6196              determined by the type of the other arguments to STMT (in the
6197              example we'd check this: optab_handler (widen_sum_optab,
6198              vect_short_mode)).
6199
6200           This is contrary to "regular" reductions, in which the types of all
6201           the arguments are the same as the type of the reduction variable.
6202           For "regular" reductions we can therefore use the same vector type
6203           (and also the same tree-code) when generating the epilog code and
6204           when generating the code inside the loop.  */
6205
6206   if (orig_stmt)
6207     {
6208       /* This is a reduction pattern: get the vectype from the type of the
6209          reduction variable, and get the tree-code from orig_stmt.  */
6210       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6211                   == TREE_CODE_REDUCTION);
6212       orig_code = gimple_assign_rhs_code (orig_stmt);
6213       gcc_assert (vectype_out);
6214       vec_mode = TYPE_MODE (vectype_out);
6215     }
6216   else
6217     {
6218       /* Regular reduction: use the same vectype and tree-code as used for
6219          the vector code inside the loop can be used for the epilog code. */
6220       orig_code = code;
6221
6222       if (code == MINUS_EXPR)
6223         orig_code = PLUS_EXPR;
6224
6225       /* For simple condition reductions, replace with the actual expression
6226          we want to base our reduction around.  */
6227       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6228         {
6229           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6230           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6231         }
6232       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6233                == INTEGER_INDUC_COND_REDUCTION)
6234         orig_code = cond_reduc_op_code;
6235     }
6236
6237   if (nested_cycle)
6238     {
6239       def_bb = gimple_bb (reduc_def_stmt);
6240       def_stmt_loop = def_bb->loop_father;
6241       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6242                                        loop_preheader_edge (def_stmt_loop));
6243       if (TREE_CODE (def_arg) == SSA_NAME
6244           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6245           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6246           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6247           && vinfo_for_stmt (def_arg_stmt)
6248           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6249               == vect_double_reduction_def)
6250         double_reduc = true;
6251     }
6252
6253   reduc_fn = IFN_LAST;
6254
6255   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6256     {
6257       if (reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6258         {
6259           if (reduc_fn != IFN_LAST
6260               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6261                                                   OPTIMIZE_FOR_SPEED))
6262             {
6263               if (dump_enabled_p ())
6264                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6265                                  "reduc op not supported by target.\n");
6266
6267               reduc_fn = IFN_LAST;
6268             }
6269         }
6270       else
6271         {
6272           if (!nested_cycle || double_reduc)
6273             {
6274               if (dump_enabled_p ())
6275                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6276                                  "no reduc code for scalar code.\n");
6277
6278               return false;
6279             }
6280         }
6281     }
6282   else
6283     {
6284       int scalar_precision
6285         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6286       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6287       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
6288                                                 nunits_out);
6289
6290       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6291                                           OPTIMIZE_FOR_SPEED))
6292         reduc_fn = IFN_REDUC_MAX;
6293     }
6294
6295   if (reduc_fn == IFN_LAST && !nunits_out.is_constant ())
6296     {
6297       if (dump_enabled_p ())
6298         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6299                          "missing target support for reduction on"
6300                          " variable-length vectors.\n");
6301       return false;
6302     }
6303
6304   if ((double_reduc
6305        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6306       && ncopies > 1)
6307     {
6308       if (dump_enabled_p ())
6309         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6310                          "multiple types in double reduction or condition "
6311                          "reduction.\n");
6312       return false;
6313     }
6314
6315   if (double_reduc && !nunits_out.is_constant ())
6316     {
6317       /* The current double-reduction code creates the initial value
6318          element-by-element.  */
6319       if (dump_enabled_p ())
6320         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6321                          "double reduction not supported for variable-length"
6322                          " vectors.\n");
6323       return false;
6324     }
6325
6326   if (slp_node && !nunits_out.is_constant ())
6327     {
6328       /* The current SLP code creates the initial value element-by-element.  */
6329       if (dump_enabled_p ())
6330         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6331                          "SLP reduction not supported for variable-length"
6332                          " vectors.\n");
6333       return false;
6334     }
6335
6336   /* In case of widenning multiplication by a constant, we update the type
6337      of the constant to be the type of the other operand.  We check that the
6338      constant fits the type in the pattern recognition pass.  */
6339   if (code == DOT_PROD_EXPR
6340       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6341     {
6342       if (TREE_CODE (ops[0]) == INTEGER_CST)
6343         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6344       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6345         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6346       else
6347         {
6348           if (dump_enabled_p ())
6349             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6350                              "invalid types in dot-prod\n");
6351
6352           return false;
6353         }
6354     }
6355
6356   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6357     {
6358       widest_int ni;
6359
6360       if (! max_loop_iterations (loop, &ni))
6361         {
6362           if (dump_enabled_p ())
6363             dump_printf_loc (MSG_NOTE, vect_location,
6364                              "loop count not known, cannot create cond "
6365                              "reduction.\n");
6366           return false;
6367         }
6368       /* Convert backedges to iterations.  */
6369       ni += 1;
6370
6371       /* The additional index will be the same type as the condition.  Check
6372          that the loop can fit into this less one (because we'll use up the
6373          zero slot for when there are no matches).  */
6374       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6375       if (wi::geu_p (ni, wi::to_widest (max_index)))
6376         {
6377           if (dump_enabled_p ())
6378             dump_printf_loc (MSG_NOTE, vect_location,
6379                              "loop size is greater than data size.\n");
6380           return false;
6381         }
6382     }
6383
6384   /* In case the vectorization factor (VF) is bigger than the number
6385      of elements that we can fit in a vectype (nunits), we have to generate
6386      more than one vector stmt - i.e - we need to "unroll" the
6387      vector stmt by a factor VF/nunits.  For more details see documentation
6388      in vectorizable_operation.  */
6389
6390   /* If the reduction is used in an outer loop we need to generate
6391      VF intermediate results, like so (e.g. for ncopies=2):
6392         r0 = phi (init, r0)
6393         r1 = phi (init, r1)
6394         r0 = x0 + r0;
6395         r1 = x1 + r1;
6396     (i.e. we generate VF results in 2 registers).
6397     In this case we have a separate def-use cycle for each copy, and therefore
6398     for each copy we get the vector def for the reduction variable from the
6399     respective phi node created for this copy.
6400
6401     Otherwise (the reduction is unused in the loop nest), we can combine
6402     together intermediate results, like so (e.g. for ncopies=2):
6403         r = phi (init, r)
6404         r = x0 + r;
6405         r = x1 + r;
6406    (i.e. we generate VF/2 results in a single register).
6407    In this case for each copy we get the vector def for the reduction variable
6408    from the vectorized reduction operation generated in the previous iteration.
6409
6410    This only works when we see both the reduction PHI and its only consumer
6411    in vectorizable_reduction and there are no intermediate stmts
6412    participating.  */
6413   use_operand_p use_p;
6414   gimple *use_stmt;
6415   if (ncopies > 1
6416       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6417       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6418       && (use_stmt == stmt
6419           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6420     {
6421       single_defuse_cycle = true;
6422       epilog_copies = 1;
6423     }
6424   else
6425     epilog_copies = ncopies;
6426
6427   /* If the reduction stmt is one of the patterns that have lane
6428      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6429   if ((ncopies > 1
6430        && ! single_defuse_cycle)
6431       && (code == DOT_PROD_EXPR
6432           || code == WIDEN_SUM_EXPR
6433           || code == SAD_EXPR))
6434     {
6435       if (dump_enabled_p ())
6436         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6437                          "multi def-use cycle not possible for lane-reducing "
6438                          "reduction operation\n");
6439       return false;
6440     }
6441
6442   if (!vec_stmt) /* transformation not required.  */
6443     {
6444       if (first_p)
6445         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
6446       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6447       return true;
6448     }
6449
6450   /* Transform.  */
6451
6452   if (dump_enabled_p ())
6453     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6454
6455   /* FORNOW: Multiple types are not supported for condition.  */
6456   if (code == COND_EXPR)
6457     gcc_assert (ncopies == 1);
6458
6459   /* Create the destination vector  */
6460   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6461
6462   prev_stmt_info = NULL;
6463   prev_phi_info = NULL;
6464   if (slp_node)
6465     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6466   else
6467     {
6468       vec_num = 1;
6469       vec_oprnds0.create (1);
6470       vec_oprnds1.create (1);
6471       if (op_type == ternary_op)
6472         vec_oprnds2.create (1);
6473     }
6474
6475   phis.create (vec_num);
6476   vect_defs.create (vec_num);
6477   if (!slp_node)
6478     vect_defs.quick_push (NULL_TREE);
6479
6480   if (slp_node)
6481     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6482   else
6483     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6484
6485   for (j = 0; j < ncopies; j++)
6486     {
6487       if (code == COND_EXPR)
6488         {
6489           gcc_assert (!slp_node);
6490           vectorizable_condition (stmt, gsi, vec_stmt,
6491                                   PHI_RESULT (phis[0]),
6492                                   reduc_index, NULL);
6493           /* Multiple types are not supported for condition.  */
6494           break;
6495         }
6496
6497       /* Handle uses.  */
6498       if (j == 0)
6499         {
6500           if (slp_node)
6501             {
6502               /* Get vec defs for all the operands except the reduction index,
6503                  ensuring the ordering of the ops in the vector is kept.  */
6504               auto_vec<tree, 3> slp_ops;
6505               auto_vec<vec<tree>, 3> vec_defs;
6506
6507               slp_ops.quick_push (ops[0]);
6508               slp_ops.quick_push (ops[1]);
6509               if (op_type == ternary_op)
6510                 slp_ops.quick_push (ops[2]);
6511
6512               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6513
6514               vec_oprnds0.safe_splice (vec_defs[0]);
6515               vec_defs[0].release ();
6516               vec_oprnds1.safe_splice (vec_defs[1]);
6517               vec_defs[1].release ();
6518               if (op_type == ternary_op)
6519                 {
6520                   vec_oprnds2.safe_splice (vec_defs[2]);
6521                   vec_defs[2].release ();
6522                 }
6523             }
6524           else
6525             {
6526               vec_oprnds0.quick_push
6527                 (vect_get_vec_def_for_operand (ops[0], stmt));
6528               vec_oprnds1.quick_push
6529                 (vect_get_vec_def_for_operand (ops[1], stmt));
6530               if (op_type == ternary_op)
6531                 vec_oprnds2.quick_push
6532                   (vect_get_vec_def_for_operand (ops[2], stmt));
6533             }
6534         }
6535       else
6536         {
6537           if (!slp_node)
6538             {
6539               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6540
6541               if (single_defuse_cycle && reduc_index == 0)
6542                 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6543               else
6544                 vec_oprnds0[0]
6545                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6546               if (single_defuse_cycle && reduc_index == 1)
6547                 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6548               else
6549                 vec_oprnds1[0]
6550                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6551               if (op_type == ternary_op)
6552                 {
6553                   if (single_defuse_cycle && reduc_index == 2)
6554                     vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6555                   else
6556                     vec_oprnds2[0]
6557                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6558                 }
6559             }
6560         }
6561
6562       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6563         {
6564           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6565           if (op_type == ternary_op)
6566             vop[2] = vec_oprnds2[i];
6567
6568           new_temp = make_ssa_name (vec_dest, new_stmt);
6569           new_stmt = gimple_build_assign (new_temp, code,
6570                                           vop[0], vop[1], vop[2]);
6571           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6572
6573           if (slp_node)
6574             {
6575               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6576               vect_defs.quick_push (new_temp);
6577             }
6578           else
6579             vect_defs[0] = new_temp;
6580         }
6581
6582       if (slp_node)
6583         continue;
6584
6585       if (j == 0)
6586         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6587       else
6588         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6589
6590       prev_stmt_info = vinfo_for_stmt (new_stmt);
6591     }
6592
6593   /* Finalize the reduction-phi (set its arguments) and create the
6594      epilog reduction code.  */
6595   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6596     vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6597
6598   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6599                                     epilog_copies, reduc_fn, phis,
6600                                     double_reduc, slp_node, slp_node_instance,
6601                                     cond_reduc_val, cond_reduc_op_code);
6602
6603   return true;
6604 }
6605
6606 /* Function vect_min_worthwhile_factor.
6607
6608    For a loop where we could vectorize the operation indicated by CODE,
6609    return the minimum vectorization factor that makes it worthwhile
6610    to use generic vectors.  */
6611 static unsigned int
6612 vect_min_worthwhile_factor (enum tree_code code)
6613 {
6614   switch (code)
6615     {
6616     case PLUS_EXPR:
6617     case MINUS_EXPR:
6618     case NEGATE_EXPR:
6619       return 4;
6620
6621     case BIT_AND_EXPR:
6622     case BIT_IOR_EXPR:
6623     case BIT_XOR_EXPR:
6624     case BIT_NOT_EXPR:
6625       return 2;
6626
6627     default:
6628       return INT_MAX;
6629     }
6630 }
6631
6632 /* Return true if VINFO indicates we are doing loop vectorization and if
6633    it is worth decomposing CODE operations into scalar operations for
6634    that loop's vectorization factor.  */
6635
6636 bool
6637 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6638 {
6639   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6640   unsigned HOST_WIDE_INT value;
6641   return (loop_vinfo
6642           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
6643           && value >= vect_min_worthwhile_factor (code));
6644 }
6645
6646 /* Function vectorizable_induction
6647
6648    Check if PHI performs an induction computation that can be vectorized.
6649    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6650    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6651    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6652
6653 bool
6654 vectorizable_induction (gimple *phi,
6655                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6656                         gimple **vec_stmt, slp_tree slp_node)
6657 {
6658   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6659   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6660   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6661   unsigned ncopies;
6662   bool nested_in_vect_loop = false;
6663   struct loop *iv_loop;
6664   tree vec_def;
6665   edge pe = loop_preheader_edge (loop);
6666   basic_block new_bb;
6667   tree new_vec, vec_init, vec_step, t;
6668   tree new_name;
6669   gimple *new_stmt;
6670   gphi *induction_phi;
6671   tree induc_def, vec_dest;
6672   tree init_expr, step_expr;
6673   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6674   unsigned i;
6675   tree expr;
6676   gimple_seq stmts;
6677   imm_use_iterator imm_iter;
6678   use_operand_p use_p;
6679   gimple *exit_phi;
6680   edge latch_e;
6681   tree loop_arg;
6682   gimple_stmt_iterator si;
6683   basic_block bb = gimple_bb (phi);
6684
6685   if (gimple_code (phi) != GIMPLE_PHI)
6686     return false;
6687
6688   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6689     return false;
6690
6691   /* Make sure it was recognized as induction computation.  */
6692   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6693     return false;
6694
6695   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6696   unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6697
6698   if (slp_node)
6699     ncopies = 1;
6700   else
6701     ncopies = vect_get_num_copies (loop_vinfo, vectype);
6702   gcc_assert (ncopies >= 1);
6703
6704   /* FORNOW. These restrictions should be relaxed.  */
6705   if (nested_in_vect_loop_p (loop, phi))
6706     {
6707       imm_use_iterator imm_iter;
6708       use_operand_p use_p;
6709       gimple *exit_phi;
6710       edge latch_e;
6711       tree loop_arg;
6712
6713       if (ncopies > 1)
6714         {
6715           if (dump_enabled_p ())
6716             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6717                              "multiple types in nested loop.\n");
6718           return false;
6719         }
6720
6721       /* FORNOW: outer loop induction with SLP not supported.  */
6722       if (STMT_SLP_TYPE (stmt_info))
6723         return false;
6724
6725       exit_phi = NULL;
6726       latch_e = loop_latch_edge (loop->inner);
6727       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6728       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6729         {
6730           gimple *use_stmt = USE_STMT (use_p);
6731           if (is_gimple_debug (use_stmt))
6732             continue;
6733
6734           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6735             {
6736               exit_phi = use_stmt;
6737               break;
6738             }
6739         }
6740       if (exit_phi)
6741         {
6742           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6743           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6744                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6745             {
6746               if (dump_enabled_p ())
6747                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6748                                  "inner-loop induction only used outside "
6749                                  "of the outer vectorized loop.\n");
6750               return false;
6751             }
6752         }
6753
6754       nested_in_vect_loop = true;
6755       iv_loop = loop->inner;
6756     }
6757   else
6758     iv_loop = loop;
6759   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6760
6761   if (!vec_stmt) /* transformation not required.  */
6762     {
6763       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6764       if (dump_enabled_p ())
6765         dump_printf_loc (MSG_NOTE, vect_location,
6766                          "=== vectorizable_induction ===\n");
6767       vect_model_induction_cost (stmt_info, ncopies);
6768       return true;
6769     }
6770
6771   /* Transform.  */
6772
6773   /* Compute a vector variable, initialized with the first VF values of
6774      the induction variable.  E.g., for an iv with IV_PHI='X' and
6775      evolution S, for a vector of 4 units, we want to compute:
6776      [X, X + S, X + 2*S, X + 3*S].  */
6777
6778   if (dump_enabled_p ())
6779     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6780
6781   latch_e = loop_latch_edge (iv_loop);
6782   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6783
6784   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6785   gcc_assert (step_expr != NULL_TREE);
6786
6787   pe = loop_preheader_edge (iv_loop);
6788   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6789                                      loop_preheader_edge (iv_loop));
6790
6791   /* Convert the step to the desired type.  */
6792   stmts = NULL;
6793   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6794   if (stmts)
6795     {
6796       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6797       gcc_assert (!new_bb);
6798     }
6799
6800   /* Find the first insertion point in the BB.  */
6801   si = gsi_after_labels (bb);
6802
6803   /* For SLP induction we have to generate several IVs as for example
6804      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6805      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
6806      [VF*S, VF*S, VF*S, VF*S] for all.  */
6807   if (slp_node)
6808     {
6809       /* Convert the init to the desired type.  */
6810       stmts = NULL;
6811       init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6812       if (stmts)
6813         {
6814           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6815           gcc_assert (!new_bb);
6816         }
6817
6818       /* Generate [VF*S, VF*S, ... ].  */
6819       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6820         {
6821           expr = build_int_cst (integer_type_node, vf);
6822           expr = fold_convert (TREE_TYPE (step_expr), expr);
6823         }
6824       else
6825         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6826       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6827                               expr, step_expr);
6828       if (! CONSTANT_CLASS_P (new_name))
6829         new_name = vect_init_vector (phi, new_name,
6830                                      TREE_TYPE (step_expr), NULL);
6831       new_vec = build_vector_from_val (vectype, new_name);
6832       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6833
6834       /* Now generate the IVs.  */
6835       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6836       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6837       unsigned elts = nunits * nvects;
6838       unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6839       gcc_assert (elts % group_size == 0);
6840       tree elt = init_expr;
6841       unsigned ivn;
6842       for (ivn = 0; ivn < nivs; ++ivn)
6843         {
6844           tree_vector_builder elts (vectype, nunits, 1);
6845           stmts = NULL;
6846           for (unsigned eltn = 0; eltn < nunits; ++eltn)
6847             {
6848               if (ivn*nunits + eltn >= group_size
6849                   && (ivn*nunits + eltn) % group_size == 0)
6850                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6851                                     elt, step_expr);
6852               elts.quick_push (elt);
6853             }
6854           vec_init = gimple_build_vector (&stmts, &elts);
6855           if (stmts)
6856             {
6857               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6858               gcc_assert (!new_bb);
6859             }
6860
6861           /* Create the induction-phi that defines the induction-operand.  */
6862           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6863           induction_phi = create_phi_node (vec_dest, iv_loop->header);
6864           set_vinfo_for_stmt (induction_phi,
6865                               new_stmt_vec_info (induction_phi, loop_vinfo));
6866           induc_def = PHI_RESULT (induction_phi);
6867
6868           /* Create the iv update inside the loop  */
6869           vec_def = make_ssa_name (vec_dest);
6870           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6871           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6872           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6873
6874           /* Set the arguments of the phi node:  */
6875           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6876           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6877                        UNKNOWN_LOCATION);
6878
6879           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6880         }
6881
6882       /* Re-use IVs when we can.  */
6883       if (ivn < nvects)
6884         {
6885           unsigned vfp
6886             = least_common_multiple (group_size, nunits) / group_size;
6887           /* Generate [VF'*S, VF'*S, ... ].  */
6888           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6889             {
6890               expr = build_int_cst (integer_type_node, vfp);
6891               expr = fold_convert (TREE_TYPE (step_expr), expr);
6892             }
6893           else
6894             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6895           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6896                                   expr, step_expr);
6897           if (! CONSTANT_CLASS_P (new_name))
6898             new_name = vect_init_vector (phi, new_name,
6899                                          TREE_TYPE (step_expr), NULL);
6900           new_vec = build_vector_from_val (vectype, new_name);
6901           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6902           for (; ivn < nvects; ++ivn)
6903             {
6904               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6905               tree def;
6906               if (gimple_code (iv) == GIMPLE_PHI)
6907                 def = gimple_phi_result (iv);
6908               else
6909                 def = gimple_assign_lhs (iv);
6910               new_stmt = gimple_build_assign (make_ssa_name (vectype),
6911                                               PLUS_EXPR,
6912                                               def, vec_step);
6913               if (gimple_code (iv) == GIMPLE_PHI)
6914                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6915               else
6916                 {
6917                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6918                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6919                 }
6920               set_vinfo_for_stmt (new_stmt,
6921                                   new_stmt_vec_info (new_stmt, loop_vinfo));
6922               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6923             }
6924         }
6925
6926       return true;
6927     }
6928
6929   /* Create the vector that holds the initial_value of the induction.  */
6930   if (nested_in_vect_loop)
6931     {
6932       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
6933          been created during vectorization of previous stmts.  We obtain it
6934          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
6935       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6936       /* If the initial value is not of proper type, convert it.  */
6937       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6938         {
6939           new_stmt
6940             = gimple_build_assign (vect_get_new_ssa_name (vectype,
6941                                                           vect_simple_var,
6942                                                           "vec_iv_"),
6943                                    VIEW_CONVERT_EXPR,
6944                                    build1 (VIEW_CONVERT_EXPR, vectype,
6945                                            vec_init));
6946           vec_init = gimple_assign_lhs (new_stmt);
6947           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6948                                                  new_stmt);
6949           gcc_assert (!new_bb);
6950           set_vinfo_for_stmt (new_stmt,
6951                               new_stmt_vec_info (new_stmt, loop_vinfo));
6952         }
6953     }
6954   else
6955     {
6956       /* iv_loop is the loop to be vectorized. Create:
6957          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
6958       stmts = NULL;
6959       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6960
6961       tree_vector_builder elts (vectype, nunits, 1);
6962       elts.quick_push (new_name);
6963       for (i = 1; i < nunits; i++)
6964         {
6965           /* Create: new_name_i = new_name + step_expr  */
6966           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6967                                    new_name, step_expr);
6968           elts.quick_push (new_name);
6969         }
6970       /* Create a vector from [new_name_0, new_name_1, ...,
6971          new_name_nunits-1]  */
6972       vec_init = gimple_build_vector (&stmts, &elts);
6973       if (stmts)
6974         {
6975           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6976           gcc_assert (!new_bb);
6977         }
6978     }
6979
6980
6981   /* Create the vector that holds the step of the induction.  */
6982   if (nested_in_vect_loop)
6983     /* iv_loop is nested in the loop to be vectorized. Generate:
6984        vec_step = [S, S, S, S]  */
6985     new_name = step_expr;
6986   else
6987     {
6988       /* iv_loop is the loop to be vectorized. Generate:
6989           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
6990       gimple_seq seq = NULL;
6991       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6992         {
6993           expr = build_int_cst (integer_type_node, vf);
6994           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6995         }
6996       else
6997         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6998       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6999                                expr, step_expr);
7000       if (seq)
7001         {
7002           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7003           gcc_assert (!new_bb);
7004         }
7005     }
7006
7007   t = unshare_expr (new_name);
7008   gcc_assert (CONSTANT_CLASS_P (new_name)
7009               || TREE_CODE (new_name) == SSA_NAME);
7010   new_vec = build_vector_from_val (vectype, t);
7011   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7012
7013
7014   /* Create the following def-use cycle:
7015      loop prolog:
7016          vec_init = ...
7017          vec_step = ...
7018      loop:
7019          vec_iv = PHI <vec_init, vec_loop>
7020          ...
7021          STMT
7022          ...
7023          vec_loop = vec_iv + vec_step;  */
7024
7025   /* Create the induction-phi that defines the induction-operand.  */
7026   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7027   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7028   set_vinfo_for_stmt (induction_phi,
7029                       new_stmt_vec_info (induction_phi, loop_vinfo));
7030   induc_def = PHI_RESULT (induction_phi);
7031
7032   /* Create the iv update inside the loop  */
7033   vec_def = make_ssa_name (vec_dest);
7034   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7035   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7036   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7037
7038   /* Set the arguments of the phi node:  */
7039   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7040   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7041                UNKNOWN_LOCATION);
7042
7043   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
7044
7045   /* In case that vectorization factor (VF) is bigger than the number
7046      of elements that we can fit in a vectype (nunits), we have to generate
7047      more than one vector stmt - i.e - we need to "unroll" the
7048      vector stmt by a factor VF/nunits.  For more details see documentation
7049      in vectorizable_operation.  */
7050
7051   if (ncopies > 1)
7052     {
7053       gimple_seq seq = NULL;
7054       stmt_vec_info prev_stmt_vinfo;
7055       /* FORNOW. This restriction should be relaxed.  */
7056       gcc_assert (!nested_in_vect_loop);
7057
7058       /* Create the vector that holds the step of the induction.  */
7059       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7060         {
7061           expr = build_int_cst (integer_type_node, nunits);
7062           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7063         }
7064       else
7065         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
7066       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7067                                expr, step_expr);
7068       if (seq)
7069         {
7070           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7071           gcc_assert (!new_bb);
7072         }
7073
7074       t = unshare_expr (new_name);
7075       gcc_assert (CONSTANT_CLASS_P (new_name)
7076                   || TREE_CODE (new_name) == SSA_NAME);
7077       new_vec = build_vector_from_val (vectype, t);
7078       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7079
7080       vec_def = induc_def;
7081       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7082       for (i = 1; i < ncopies; i++)
7083         {
7084           /* vec_i = vec_prev + vec_step  */
7085           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7086                                           vec_def, vec_step);
7087           vec_def = make_ssa_name (vec_dest, new_stmt);
7088           gimple_assign_set_lhs (new_stmt, vec_def);
7089
7090           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7091           set_vinfo_for_stmt (new_stmt,
7092                               new_stmt_vec_info (new_stmt, loop_vinfo));
7093           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7094           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7095         }
7096     }
7097
7098   if (nested_in_vect_loop)
7099     {
7100       /* Find the loop-closed exit-phi of the induction, and record
7101          the final vector of induction results:  */
7102       exit_phi = NULL;
7103       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7104         {
7105           gimple *use_stmt = USE_STMT (use_p);
7106           if (is_gimple_debug (use_stmt))
7107             continue;
7108
7109           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7110             {
7111               exit_phi = use_stmt;
7112               break;
7113             }
7114         }
7115       if (exit_phi)
7116         {
7117           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7118           /* FORNOW. Currently not supporting the case that an inner-loop induction
7119              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7120           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7121                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7122
7123           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7124           if (dump_enabled_p ())
7125             {
7126               dump_printf_loc (MSG_NOTE, vect_location,
7127                                "vector of inductions after inner-loop:");
7128               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7129             }
7130         }
7131     }
7132
7133
7134   if (dump_enabled_p ())
7135     {
7136       dump_printf_loc (MSG_NOTE, vect_location,
7137                        "transform induction: created def-use cycle: ");
7138       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7139       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7140                         SSA_NAME_DEF_STMT (vec_def), 0);
7141     }
7142
7143   return true;
7144 }
7145
7146 /* Function vectorizable_live_operation.
7147
7148    STMT computes a value that is used outside the loop.  Check if
7149    it can be supported.  */
7150
7151 bool
7152 vectorizable_live_operation (gimple *stmt,
7153                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7154                              slp_tree slp_node, int slp_index,
7155                              gimple **vec_stmt)
7156 {
7157   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7158   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7159   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7160   imm_use_iterator imm_iter;
7161   tree lhs, lhs_type, bitsize, vec_bitsize;
7162   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7163   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
7164   int ncopies;
7165   gimple *use_stmt;
7166   auto_vec<tree> vec_oprnds;
7167
7168   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7169
7170   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7171     return false;
7172
7173   /* FORNOW.  CHECKME.  */
7174   if (nested_in_vect_loop_p (loop, stmt))
7175     return false;
7176
7177   /* If STMT is not relevant and it is a simple assignment and its inputs are
7178      invariant then it can remain in place, unvectorized.  The original last
7179      scalar value that it computes will be used.  */
7180   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7181     {
7182       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7183       if (dump_enabled_p ())
7184         dump_printf_loc (MSG_NOTE, vect_location,
7185                          "statement is simple and uses invariant.  Leaving in "
7186                          "place.\n");
7187       return true;
7188     }
7189
7190   if (slp_node)
7191     ncopies = 1;
7192   else
7193     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7194
7195   if (!vec_stmt)
7196     /* No transformation required.  */
7197     return true;
7198
7199   /* If stmt has a related stmt, then use that for getting the lhs.  */
7200   if (is_pattern_stmt_p (stmt_info))
7201     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7202
7203   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7204         : gimple_get_lhs (stmt);
7205   lhs_type = TREE_TYPE (lhs);
7206
7207   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7208              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7209              : TYPE_SIZE (TREE_TYPE (vectype)));
7210   vec_bitsize = TYPE_SIZE (vectype);
7211
7212   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7213   tree vec_lhs, bitstart;
7214   if (slp_node)
7215     {
7216       gcc_assert (slp_index >= 0);
7217
7218       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7219       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7220
7221       /* Get the last occurrence of the scalar index from the concatenation of
7222          all the slp vectors. Calculate which slp vector it is and the index
7223          within.  */
7224       int pos = (num_vec * nunits) - num_scalar + slp_index;
7225       int vec_entry = pos / nunits;
7226       int vec_index = pos % nunits;
7227
7228       /* Get the correct slp vectorized stmt.  */
7229       vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7230
7231       /* Get entry to use.  */
7232       bitstart = bitsize_int (vec_index);
7233       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7234     }
7235   else
7236     {
7237       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7238       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7239
7240       /* For multiple copies, get the last copy.  */
7241       for (int i = 1; i < ncopies; ++i)
7242         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7243                                                   vec_lhs);
7244
7245       /* Get the last lane in the vector.  */
7246       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7247     }
7248
7249   /* Create a new vectorized stmt for the uses of STMT and insert outside the
7250      loop.  */
7251   gimple_seq stmts = NULL;
7252   tree bftype = TREE_TYPE (vectype);
7253   if (VECTOR_BOOLEAN_TYPE_P (vectype))
7254     bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7255   tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7256   new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7257                                    true, NULL_TREE);
7258   if (stmts)
7259     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7260
7261   /* Replace use of lhs with newly computed result.  If the use stmt is a
7262      single arg PHI, just replace all uses of PHI result.  It's necessary
7263      because lcssa PHI defining lhs may be before newly inserted stmt.  */
7264   use_operand_p use_p;
7265   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7266     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7267         && !is_gimple_debug (use_stmt))
7268     {
7269       if (gimple_code (use_stmt) == GIMPLE_PHI
7270           && gimple_phi_num_args (use_stmt) == 1)
7271         {
7272           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7273         }
7274       else
7275         {
7276           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7277             SET_USE (use_p, new_tree);
7278         }
7279       update_stmt (use_stmt);
7280     }
7281
7282   return true;
7283 }
7284
7285 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
7286
7287 static void
7288 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7289 {
7290   ssa_op_iter op_iter;
7291   imm_use_iterator imm_iter;
7292   def_operand_p def_p;
7293   gimple *ustmt;
7294
7295   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7296     {
7297       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7298         {
7299           basic_block bb;
7300
7301           if (!is_gimple_debug (ustmt))
7302             continue;
7303
7304           bb = gimple_bb (ustmt);
7305
7306           if (!flow_bb_inside_loop_p (loop, bb))
7307             {
7308               if (gimple_debug_bind_p (ustmt))
7309                 {
7310                   if (dump_enabled_p ())
7311                     dump_printf_loc (MSG_NOTE, vect_location,
7312                                      "killing debug use\n");
7313
7314                   gimple_debug_bind_reset_value (ustmt);
7315                   update_stmt (ustmt);
7316                 }
7317               else
7318                 gcc_unreachable ();
7319             }
7320         }
7321     }
7322 }
7323
7324 /* Given loop represented by LOOP_VINFO, return true if computation of
7325    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7326    otherwise.  */
7327
7328 static bool
7329 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7330 {
7331   /* Constant case.  */
7332   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7333     {
7334       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7335       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7336
7337       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7338       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7339       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7340         return true;
7341     }
7342
7343   widest_int max;
7344   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7345   /* Check the upper bound of loop niters.  */
7346   if (get_max_loop_iterations (loop, &max))
7347     {
7348       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7349       signop sgn = TYPE_SIGN (type);
7350       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7351       if (max < type_max)
7352         return true;
7353     }
7354   return false;
7355 }
7356
7357 /* Scale profiling counters by estimation for LOOP which is vectorized
7358    by factor VF.  */
7359
7360 static void
7361 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7362 {
7363   edge preheader = loop_preheader_edge (loop);
7364   /* Reduce loop iterations by the vectorization factor.  */
7365   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7366   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7367
7368   if (freq_h.nonzero_p ())
7369     {
7370       profile_probability p;
7371
7372       /* Avoid dropping loop body profile counter to 0 because of zero count
7373          in loop's preheader.  */
7374       if (!(freq_e == profile_count::zero ()))
7375         freq_e = freq_e.force_nonzero ();
7376       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7377       scale_loop_frequencies (loop, p);
7378     }
7379
7380   edge exit_e = single_exit (loop);
7381   exit_e->probability = profile_probability::always ()
7382                                  .apply_scale (1, new_est_niter + 1);
7383
7384   edge exit_l = single_pred_edge (loop->latch);
7385   profile_probability prob = exit_l->probability;
7386   exit_l->probability = exit_e->probability.invert ();
7387   if (prob.initialized_p () && exit_l->probability.initialized_p ())
7388     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7389 }
7390
7391 /* Function vect_transform_loop.
7392
7393    The analysis phase has determined that the loop is vectorizable.
7394    Vectorize the loop - created vectorized stmts to replace the scalar
7395    stmts in the loop, and update the loop exit condition.
7396    Returns scalar epilogue loop if any.  */
7397
7398 struct loop *
7399 vect_transform_loop (loop_vec_info loop_vinfo)
7400 {
7401   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7402   struct loop *epilogue = NULL;
7403   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7404   int nbbs = loop->num_nodes;
7405   int i;
7406   tree niters_vector = NULL_TREE;
7407   tree step_vector = NULL_TREE;
7408   tree niters_vector_mult_vf = NULL_TREE;
7409   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7410   unsigned int lowest_vf = constant_lower_bound (vf);
7411   bool grouped_store;
7412   bool slp_scheduled = false;
7413   gimple *stmt, *pattern_stmt;
7414   gimple_seq pattern_def_seq = NULL;
7415   gimple_stmt_iterator pattern_def_si = gsi_none ();
7416   bool transform_pattern_stmt = false;
7417   bool check_profitability = false;
7418   unsigned int th;
7419
7420   if (dump_enabled_p ())
7421     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
7422
7423   /* Use the more conservative vectorization threshold.  If the number
7424      of iterations is constant assume the cost check has been performed
7425      by our caller.  If the threshold makes all loops profitable that
7426      run at least the (estimated) vectorization factor number of times
7427      checking is pointless, too.  */
7428   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7429   if (th >= vect_vf_for_cost (loop_vinfo)
7430       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7431     {
7432       if (dump_enabled_p ())
7433         dump_printf_loc (MSG_NOTE, vect_location,
7434                          "Profitability threshold is %d loop iterations.\n",
7435                          th);
7436       check_profitability = true;
7437     }
7438
7439   /* Make sure there exists a single-predecessor exit bb.  Do this before
7440      versioning.   */
7441   edge e = single_exit (loop);
7442   if (! single_pred_p (e->dest))
7443     {
7444       split_loop_exit_edge (e);
7445       if (dump_enabled_p ())
7446         dump_printf (MSG_NOTE, "split exit edge\n");
7447     }
7448
7449   /* Version the loop first, if required, so the profitability check
7450      comes first.  */
7451
7452   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7453     {
7454       poly_uint64 versioning_threshold
7455         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
7456       if (check_profitability
7457           && ordered_p (poly_uint64 (th), versioning_threshold))
7458         {
7459           versioning_threshold = ordered_max (poly_uint64 (th),
7460                                               versioning_threshold);
7461           check_profitability = false;
7462         }
7463       vect_loop_versioning (loop_vinfo, th, check_profitability,
7464                             versioning_threshold);
7465       check_profitability = false;
7466     }
7467
7468   /* Make sure there exists a single-predecessor exit bb also on the
7469      scalar loop copy.  Do this after versioning but before peeling
7470      so CFG structure is fine for both scalar and if-converted loop
7471      to make slpeel_duplicate_current_defs_from_edges face matched
7472      loop closed PHI nodes on the exit.  */
7473   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7474     {
7475       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7476       if (! single_pred_p (e->dest))
7477         {
7478           split_loop_exit_edge (e);
7479           if (dump_enabled_p ())
7480             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7481         }
7482     }
7483
7484   tree niters = vect_build_loop_niters (loop_vinfo);
7485   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7486   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7487   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7488   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
7489                               &step_vector, &niters_vector_mult_vf, th,
7490                               check_profitability, niters_no_overflow);
7491   if (niters_vector == NULL_TREE)
7492     {
7493       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && known_eq (lowest_vf, vf))
7494         {
7495           niters_vector
7496             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7497                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
7498           step_vector = build_one_cst (TREE_TYPE (niters));
7499         }
7500       else
7501         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7502                                      &step_vector, niters_no_overflow);
7503     }
7504
7505   /* 1) Make sure the loop header has exactly two entries
7506      2) Make sure we have a preheader basic block.  */
7507
7508   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7509
7510   split_edge (loop_preheader_edge (loop));
7511
7512   /* FORNOW: the vectorizer supports only loops which body consist
7513      of one basic block (header + empty latch). When the vectorizer will
7514      support more involved loop forms, the order by which the BBs are
7515      traversed need to be reconsidered.  */
7516
7517   for (i = 0; i < nbbs; i++)
7518     {
7519       basic_block bb = bbs[i];
7520       stmt_vec_info stmt_info;
7521
7522       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7523            gsi_next (&si))
7524         {
7525           gphi *phi = si.phi ();
7526           if (dump_enabled_p ())
7527             {
7528               dump_printf_loc (MSG_NOTE, vect_location,
7529                                "------>vectorizing phi: ");
7530               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7531             }
7532           stmt_info = vinfo_for_stmt (phi);
7533           if (!stmt_info)
7534             continue;
7535
7536           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7537             vect_loop_kill_debug_uses (loop, phi);
7538
7539           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7540               && !STMT_VINFO_LIVE_P (stmt_info))
7541             continue;
7542
7543           if (STMT_VINFO_VECTYPE (stmt_info)
7544               && (maybe_ne
7545                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
7546               && dump_enabled_p ())
7547             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7548
7549           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7550                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7551                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7552               && ! PURE_SLP_STMT (stmt_info))
7553             {
7554               if (dump_enabled_p ())
7555                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7556               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7557             }
7558         }
7559
7560       pattern_stmt = NULL;
7561       for (gimple_stmt_iterator si = gsi_start_bb (bb);
7562            !gsi_end_p (si) || transform_pattern_stmt;)
7563         {
7564           bool is_store;
7565
7566           if (transform_pattern_stmt)
7567             stmt = pattern_stmt;
7568           else
7569             {
7570               stmt = gsi_stmt (si);
7571               /* During vectorization remove existing clobber stmts.  */
7572               if (gimple_clobber_p (stmt))
7573                 {
7574                   unlink_stmt_vdef (stmt);
7575                   gsi_remove (&si, true);
7576                   release_defs (stmt);
7577                   continue;
7578                 }
7579             }
7580
7581           if (dump_enabled_p ())
7582             {
7583               dump_printf_loc (MSG_NOTE, vect_location,
7584                                "------>vectorizing statement: ");
7585               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7586             }
7587
7588           stmt_info = vinfo_for_stmt (stmt);
7589
7590           /* vector stmts created in the outer-loop during vectorization of
7591              stmts in an inner-loop may not have a stmt_info, and do not
7592              need to be vectorized.  */
7593           if (!stmt_info)
7594             {
7595               gsi_next (&si);
7596               continue;
7597             }
7598
7599           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7600             vect_loop_kill_debug_uses (loop, stmt);
7601
7602           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7603               && !STMT_VINFO_LIVE_P (stmt_info))
7604             {
7605               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7606                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7607                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7608                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7609                 {
7610                   stmt = pattern_stmt;
7611                   stmt_info = vinfo_for_stmt (stmt);
7612                 }
7613               else
7614                 {
7615                   gsi_next (&si);
7616                   continue;
7617                 }
7618             }
7619           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7620                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7621                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7622                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7623             transform_pattern_stmt = true;
7624
7625           /* If pattern statement has def stmts, vectorize them too.  */
7626           if (is_pattern_stmt_p (stmt_info))
7627             {
7628               if (pattern_def_seq == NULL)
7629                 {
7630                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7631                   pattern_def_si = gsi_start (pattern_def_seq);
7632                 }
7633               else if (!gsi_end_p (pattern_def_si))
7634                 gsi_next (&pattern_def_si);
7635               if (pattern_def_seq != NULL)
7636                 {
7637                   gimple *pattern_def_stmt = NULL;
7638                   stmt_vec_info pattern_def_stmt_info = NULL;
7639
7640                   while (!gsi_end_p (pattern_def_si))
7641                     {
7642                       pattern_def_stmt = gsi_stmt (pattern_def_si);
7643                       pattern_def_stmt_info
7644                         = vinfo_for_stmt (pattern_def_stmt);
7645                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7646                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7647                         break;
7648                       gsi_next (&pattern_def_si);
7649                     }
7650
7651                   if (!gsi_end_p (pattern_def_si))
7652                     {
7653                       if (dump_enabled_p ())
7654                         {
7655                           dump_printf_loc (MSG_NOTE, vect_location,
7656                                            "==> vectorizing pattern def "
7657                                            "stmt: ");
7658                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7659                                             pattern_def_stmt, 0);
7660                         }
7661
7662                       stmt = pattern_def_stmt;
7663                       stmt_info = pattern_def_stmt_info;
7664                     }
7665                   else
7666                     {
7667                       pattern_def_si = gsi_none ();
7668                       transform_pattern_stmt = false;
7669                     }
7670                 }
7671               else
7672                 transform_pattern_stmt = false;
7673             }
7674
7675           if (STMT_VINFO_VECTYPE (stmt_info))
7676             {
7677               unsigned int nunits
7678                 = (unsigned int)
7679                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7680               if (!STMT_SLP_TYPE (stmt_info)
7681                   && maybe_ne (nunits, vf)
7682                   && dump_enabled_p ())
7683                   /* For SLP VF is set according to unrolling factor, and not
7684                      to vector size, hence for SLP this print is not valid.  */
7685                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7686             }
7687
7688           /* SLP. Schedule all the SLP instances when the first SLP stmt is
7689              reached.  */
7690           if (STMT_SLP_TYPE (stmt_info))
7691             {
7692               if (!slp_scheduled)
7693                 {
7694                   slp_scheduled = true;
7695
7696                   if (dump_enabled_p ())
7697                     dump_printf_loc (MSG_NOTE, vect_location,
7698                                      "=== scheduling SLP instances ===\n");
7699
7700                   vect_schedule_slp (loop_vinfo);
7701                 }
7702
7703               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
7704               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7705                 {
7706                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7707                     {
7708                       pattern_def_seq = NULL;
7709                       gsi_next (&si);
7710                     }
7711                   continue;
7712                 }
7713             }
7714
7715           /* -------- vectorize statement ------------ */
7716           if (dump_enabled_p ())
7717             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7718
7719           grouped_store = false;
7720           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7721           if (is_store)
7722             {
7723               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7724                 {
7725                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7726                      interleaving chain was completed - free all the stores in
7727                      the chain.  */
7728                   gsi_next (&si);
7729                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7730                 }
7731               else
7732                 {
7733                   /* Free the attached stmt_vec_info and remove the stmt.  */
7734                   gimple *store = gsi_stmt (si);
7735                   free_stmt_vec_info (store);
7736                   unlink_stmt_vdef (store);
7737                   gsi_remove (&si, true);
7738                   release_defs (store);
7739                 }
7740
7741               /* Stores can only appear at the end of pattern statements.  */
7742               gcc_assert (!transform_pattern_stmt);
7743               pattern_def_seq = NULL;
7744             }
7745           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7746             {
7747               pattern_def_seq = NULL;
7748               gsi_next (&si);
7749             }
7750         }                       /* stmts in BB */
7751     }                           /* BBs in loop */
7752
7753   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
7754      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
7755   if (integer_onep (step_vector))
7756     niters_no_overflow = true;
7757   slpeel_make_loop_iterate_ntimes (loop, niters_vector, step_vector,
7758                                    niters_vector_mult_vf,
7759                                    !niters_no_overflow);
7760
7761   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
7762   scale_profile_for_vect_loop (loop, assumed_vf);
7763
7764   /* The minimum number of iterations performed by the epilogue.  This
7765      is 1 when peeling for gaps because we always need a final scalar
7766      iteration.  */
7767   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7768   /* +1 to convert latch counts to loop iteration counts,
7769      -min_epilogue_iters to remove iterations that cannot be performed
7770        by the vector code.  */
7771   int bias = 1 - min_epilogue_iters;
7772   /* In these calculations the "- 1" converts loop iteration counts
7773      back to latch counts.  */
7774   if (loop->any_upper_bound)
7775     loop->nb_iterations_upper_bound
7776       = wi::udiv_floor (loop->nb_iterations_upper_bound + bias,
7777                         lowest_vf) - 1;
7778   if (loop->any_likely_upper_bound)
7779     loop->nb_iterations_likely_upper_bound
7780       = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias,
7781                         lowest_vf) - 1;
7782   if (loop->any_estimate)
7783     loop->nb_iterations_estimate
7784       = wi::udiv_floor (loop->nb_iterations_estimate + bias,
7785                         assumed_vf) - 1;
7786
7787   if (dump_enabled_p ())
7788     {
7789       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7790         {
7791           dump_printf_loc (MSG_NOTE, vect_location,
7792                            "LOOP VECTORIZED\n");
7793           if (loop->inner)
7794             dump_printf_loc (MSG_NOTE, vect_location,
7795                              "OUTER LOOP VECTORIZED\n");
7796           dump_printf (MSG_NOTE, "\n");
7797         }
7798       else
7799         {
7800           dump_printf_loc (MSG_NOTE, vect_location,
7801                            "LOOP EPILOGUE VECTORIZED (VS=");
7802           dump_dec (MSG_NOTE, current_vector_size);
7803           dump_printf (MSG_NOTE, ")\n");
7804         }
7805     }
7806
7807   /* Free SLP instances here because otherwise stmt reference counting
7808      won't work.  */
7809   slp_instance instance;
7810   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7811     vect_free_slp_instance (instance);
7812   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7813   /* Clear-up safelen field since its value is invalid after vectorization
7814      since vectorized loop can have loop-carried dependencies.  */
7815   loop->safelen = 0;
7816
7817   /* Don't vectorize epilogue for epilogue.  */
7818   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7819     epilogue = NULL;
7820
7821   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7822     epilogue = NULL;
7823
7824   if (epilogue)
7825     {
7826       auto_vector_sizes vector_sizes;
7827       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
7828       unsigned int next_size = 0;
7829
7830       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7831           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
7832           && known_eq (vf, lowest_vf))
7833         {
7834           unsigned int eiters
7835             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
7836                - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
7837           eiters = eiters % lowest_vf;
7838           epilogue->nb_iterations_upper_bound = eiters - 1;
7839
7840           unsigned int ratio;
7841           while (next_size < vector_sizes.length ()
7842                  && !(constant_multiple_p (current_vector_size,
7843                                            vector_sizes[next_size], &ratio)
7844                       && eiters >= lowest_vf / ratio))
7845             next_size += 1;
7846         }
7847       else
7848         while (next_size < vector_sizes.length ()
7849                && maybe_lt (current_vector_size, vector_sizes[next_size]))
7850           next_size += 1;
7851
7852       if (next_size == vector_sizes.length ())
7853         epilogue = NULL;
7854     }
7855
7856   if (epilogue)
7857     {
7858       epilogue->force_vectorize = loop->force_vectorize;
7859       epilogue->safelen = loop->safelen;
7860       epilogue->dont_vectorize = false;
7861
7862       /* We may need to if-convert epilogue to vectorize it.  */
7863       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7864         tree_if_conversion (epilogue);
7865     }
7866
7867   return epilogue;
7868 }
7869
7870 /* The code below is trying to perform simple optimization - revert
7871    if-conversion for masked stores, i.e. if the mask of a store is zero
7872    do not perform it and all stored value producers also if possible.
7873    For example,
7874      for (i=0; i<n; i++)
7875        if (c[i])
7876         {
7877           p1[i] += 1;
7878           p2[i] = p3[i] +2;
7879         }
7880    this transformation will produce the following semi-hammock:
7881
7882    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7883      {
7884        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7885        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7886        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7887        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7888        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7889        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7890      }
7891 */
7892
7893 void
7894 optimize_mask_stores (struct loop *loop)
7895 {
7896   basic_block *bbs = get_loop_body (loop);
7897   unsigned nbbs = loop->num_nodes;
7898   unsigned i;
7899   basic_block bb;
7900   struct loop *bb_loop;
7901   gimple_stmt_iterator gsi;
7902   gimple *stmt;
7903   auto_vec<gimple *> worklist;
7904
7905   vect_location = find_loop_location (loop);
7906   /* Pick up all masked stores in loop if any.  */
7907   for (i = 0; i < nbbs; i++)
7908     {
7909       bb = bbs[i];
7910       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7911            gsi_next (&gsi))
7912         {
7913           stmt = gsi_stmt (gsi);
7914           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7915             worklist.safe_push (stmt);
7916         }
7917     }
7918
7919   free (bbs);
7920   if (worklist.is_empty ())
7921     return;
7922
7923   /* Loop has masked stores.  */
7924   while (!worklist.is_empty ())
7925     {
7926       gimple *last, *last_store;
7927       edge e, efalse;
7928       tree mask;
7929       basic_block store_bb, join_bb;
7930       gimple_stmt_iterator gsi_to;
7931       tree vdef, new_vdef;
7932       gphi *phi;
7933       tree vectype;
7934       tree zero;
7935
7936       last = worklist.pop ();
7937       mask = gimple_call_arg (last, 2);
7938       bb = gimple_bb (last);
7939       /* Create then_bb and if-then structure in CFG, then_bb belongs to
7940          the same loop as if_bb.  It could be different to LOOP when two
7941          level loop-nest is vectorized and mask_store belongs to the inner
7942          one.  */
7943       e = split_block (bb, last);
7944       bb_loop = bb->loop_father;
7945       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7946       join_bb = e->dest;
7947       store_bb = create_empty_bb (bb);
7948       add_bb_to_loop (store_bb, bb_loop);
7949       e->flags = EDGE_TRUE_VALUE;
7950       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7951       /* Put STORE_BB to likely part.  */
7952       efalse->probability = profile_probability::unlikely ();
7953       store_bb->count = efalse->count ();
7954       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7955       if (dom_info_available_p (CDI_DOMINATORS))
7956         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7957       if (dump_enabled_p ())
7958         dump_printf_loc (MSG_NOTE, vect_location,
7959                          "Create new block %d to sink mask stores.",
7960                          store_bb->index);
7961       /* Create vector comparison with boolean result.  */
7962       vectype = TREE_TYPE (mask);
7963       zero = build_zero_cst (vectype);
7964       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7965       gsi = gsi_last_bb (bb);
7966       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7967       /* Create new PHI node for vdef of the last masked store:
7968          .MEM_2 = VDEF <.MEM_1>
7969          will be converted to
7970          .MEM.3 = VDEF <.MEM_1>
7971          and new PHI node will be created in join bb
7972          .MEM_2 = PHI <.MEM_1, .MEM_3>
7973       */
7974       vdef = gimple_vdef (last);
7975       new_vdef = make_ssa_name (gimple_vop (cfun), last);
7976       gimple_set_vdef (last, new_vdef);
7977       phi = create_phi_node (vdef, join_bb);
7978       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7979
7980       /* Put all masked stores with the same mask to STORE_BB if possible.  */
7981       while (true)
7982         {
7983           gimple_stmt_iterator gsi_from;
7984           gimple *stmt1 = NULL;
7985
7986           /* Move masked store to STORE_BB.  */
7987           last_store = last;
7988           gsi = gsi_for_stmt (last);
7989           gsi_from = gsi;
7990           /* Shift GSI to the previous stmt for further traversal.  */
7991           gsi_prev (&gsi);
7992           gsi_to = gsi_start_bb (store_bb);
7993           gsi_move_before (&gsi_from, &gsi_to);
7994           /* Setup GSI_TO to the non-empty block start.  */
7995           gsi_to = gsi_start_bb (store_bb);
7996           if (dump_enabled_p ())
7997             {
7998               dump_printf_loc (MSG_NOTE, vect_location,
7999                                "Move stmt to created bb\n");
8000               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
8001             }
8002           /* Move all stored value producers if possible.  */
8003           while (!gsi_end_p (gsi))
8004             {
8005               tree lhs;
8006               imm_use_iterator imm_iter;
8007               use_operand_p use_p;
8008               bool res;
8009
8010               /* Skip debug statements.  */
8011               if (is_gimple_debug (gsi_stmt (gsi)))
8012                 {
8013                   gsi_prev (&gsi);
8014                   continue;
8015                 }
8016               stmt1 = gsi_stmt (gsi);
8017               /* Do not consider statements writing to memory or having
8018                  volatile operand.  */
8019               if (gimple_vdef (stmt1)
8020                   || gimple_has_volatile_ops (stmt1))
8021                 break;
8022               gsi_from = gsi;
8023               gsi_prev (&gsi);
8024               lhs = gimple_get_lhs (stmt1);
8025               if (!lhs)
8026                 break;
8027
8028               /* LHS of vectorized stmt must be SSA_NAME.  */
8029               if (TREE_CODE (lhs) != SSA_NAME)
8030                 break;
8031
8032               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8033                 {
8034                   /* Remove dead scalar statement.  */
8035                   if (has_zero_uses (lhs))
8036                     {
8037                       gsi_remove (&gsi_from, true);
8038                       continue;
8039                     }
8040                 }
8041
8042               /* Check that LHS does not have uses outside of STORE_BB.  */
8043               res = true;
8044               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
8045                 {
8046                   gimple *use_stmt;
8047                   use_stmt = USE_STMT (use_p);
8048                   if (is_gimple_debug (use_stmt))
8049                     continue;
8050                   if (gimple_bb (use_stmt) != store_bb)
8051                     {
8052                       res = false;
8053                       break;
8054                     }
8055                 }
8056               if (!res)
8057                 break;
8058
8059               if (gimple_vuse (stmt1)
8060                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
8061                 break;
8062
8063               /* Can move STMT1 to STORE_BB.  */
8064               if (dump_enabled_p ())
8065                 {
8066                   dump_printf_loc (MSG_NOTE, vect_location,
8067                                    "Move stmt to created bb\n");
8068                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
8069                 }
8070               gsi_move_before (&gsi_from, &gsi_to);
8071               /* Shift GSI_TO for further insertion.  */
8072               gsi_prev (&gsi_to);
8073             }
8074           /* Put other masked stores with the same mask to STORE_BB.  */
8075           if (worklist.is_empty ()
8076               || gimple_call_arg (worklist.last (), 2) != mask
8077               || worklist.last () != stmt1)
8078             break;
8079           last = worklist.pop ();
8080         }
8081       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
8082     }
8083 }