gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55
  56 /* Loop Vectorization Pass.
  57
  58    This pass tries to vectorize loops.
  59
  60    For example, the vectorizer transforms the following simple loop:
  61
  62         short a[N]; short b[N]; short c[N]; int i;
  63
  64         for (i=0; i<N; i++){
  65           a[i] = b[i] + c[i];
  66         }
  67
  68    as if it was manually vectorized by rewriting the source code into:
  69
  70         typedef int __attribute__((mode(V8HI))) v8hi;
  71         short a[N];  short b[N]; short c[N];   int i;
  72         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  73         v8hi va, vb, vc;
  74
  75         for (i=0; i<N/8; i++){
  76           vb = pb[i];
  77           vc = pc[i];
  78           va = vb + vc;
  79           pa[i] = va;
  80         }
  81
  82         The main entry to this pass is vectorize_loops(), in which
  83    the vectorizer applies a set of analyses on a given set of loops,
  84    followed by the actual vectorization transformation for the loops that
  85    had successfully passed the analysis phase.
  86         Throughout this pass we make a distinction between two types of
  87    data: scalars (which are represented by SSA_NAMES), and memory references
  88    ("data-refs").  These two types of data require different handling both
  89    during analysis and transformation. The types of data-refs that the
  90    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  91    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  92    accesses are required to have a simple (consecutive) access pattern.
  93
  94    Analysis phase:
  95    ===============
  96         The driver for the analysis phase is vect_analyze_loop().
  97    It applies a set of analyses, some of which rely on the scalar evolution
  98    analyzer (scev) developed by Sebastian Pop.
  99
 100         During the analysis phase the vectorizer records some information
 101    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 102    loop, as well as general information about the loop as a whole, which is
 103    recorded in a "loop_vec_info" struct attached to each loop.
 104
 105    Transformation phase:
 106    =====================
 107         The loop transformation phase scans all the stmts in the loop, and
 108    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 109    the loop that needs to be vectorized.  It inserts the vector code sequence
 110    just before the scalar stmt S, and records a pointer to the vector code
 111    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 112    attached to S).  This pointer will be used for the vectorization of following
 113    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 114    otherwise, we rely on dead code elimination for removing it.
 115
 116         For example, say stmt S1 was vectorized into stmt VS1:
 117
 118    VS1: vb = px[i];
 119    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 120    S2:  a = b;
 121
 122    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 123    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 124    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 125    resulting sequence would be:
 126
 127    VS1: vb = px[i];
 128    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 129    VS2: va = vb;
 130    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 131
 132         Operands that are not SSA_NAMEs, are data-refs that appear in
 133    load/store operations (like 'x[i]' in S1), and are handled differently.
 134
 135    Target modeling:
 136    =================
 137         Currently the only target specific information that is used is the
 138    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 139    Targets that can support different sizes of vectors, for now will need
 140    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 141    flexibility will be added in the future.
 142
 143         Since we only vectorize operations which vector form can be
 144    expressed using existing tree codes, to verify that an operation is
 145    supported, the vectorizer checks the relevant optab at the relevant
 146    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 147    the value found is CODE_FOR_nothing, then there's no target support, and
 148    we can't vectorize the stmt.
 149
 150    For additional information on this project see:
 151    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 152 */
 153
 154 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 155
 156 /* Function vect_determine_vectorization_factor
 157
 158    Determine the vectorization factor (VF).  VF is the number of data elements
 159    that are operated upon in parallel in a single iteration of the vectorized
 160    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 161    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 162    elements can fit in a single vector register.
 163
 164    We currently support vectorization of loops in which all types operated upon
 165    are of the same size.  Therefore this function currently sets VF according to
 166    the size of the types operated upon, and fails if there are multiple sizes
 167    in the loop.
 168
 169    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 170    original loop:
 171         for (i=0; i<N; i++){
 172           a[i] = b[i] + c[i];
 173         }
 174
 175    vectorized loop:
 176         for (i=0; i<N; i+=VF){
 177           a[i:VF] = b[i:VF] + c[i:VF];
 178         }
 179 */
 180
 181 static bool
 182 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 183 {
 184   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 185   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 186   unsigned nbbs = loop->num_nodes;
 187   unsigned int vectorization_factor = 0;
 188   tree scalar_type = NULL_TREE;
 189   gphi *phi;
 190   tree vectype;
 191   unsigned int nunits;
 192   stmt_vec_info stmt_info;
 193   unsigned i;
 194   HOST_WIDE_INT dummy;
 195   gimple *stmt, *pattern_stmt = NULL;
 196   gimple_seq pattern_def_seq = NULL;
 197   gimple_stmt_iterator pattern_def_si = gsi_none ();
 198   bool analyze_pattern_stmt = false;
 199   bool bool_result;
 200   auto_vec<stmt_vec_info> mask_producers;
 201
 202   if (dump_enabled_p ())
 203     dump_printf_loc (MSG_NOTE, vect_location,
 204                      "=== vect_determine_vectorization_factor ===\n");
 205
 206   for (i = 0; i < nbbs; i++)
 207     {
 208       basic_block bb = bbs[i];
 209
 210       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 211            gsi_next (&si))
 212         {
 213           phi = si.phi ();
 214           stmt_info = vinfo_for_stmt (phi);
 215           if (dump_enabled_p ())
 216             {
 217               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 218               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 219             }
 220
 221           gcc_assert (stmt_info);
 222
 223           if (STMT_VINFO_RELEVANT_P (stmt_info)
 224               || STMT_VINFO_LIVE_P (stmt_info))
 225             {
 226               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 227               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 228
 229               if (dump_enabled_p ())
 230                 {
 231                   dump_printf_loc (MSG_NOTE, vect_location,
 232                                    "get vectype for scalar type:  ");
 233                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 234                   dump_printf (MSG_NOTE, "\n");
 235                 }
 236
 237               vectype = get_vectype_for_scalar_type (scalar_type);
 238               if (!vectype)
 239                 {
 240                   if (dump_enabled_p ())
 241                     {
 242                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 243                                        "not vectorized: unsupported "
 244                                        "data-type ");
 245                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 246                                          scalar_type);
 247                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 248                     }
 249                   return false;
 250                 }
 251               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 252
 253               if (dump_enabled_p ())
 254                 {
 255                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 256                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 257                   dump_printf (MSG_NOTE, "\n");
 258                 }
 259
 260               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 261               if (dump_enabled_p ())
 262                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 263                                  nunits);
 264
 265               if (!vectorization_factor
 266                   || (nunits > vectorization_factor))
 267                 vectorization_factor = nunits;
 268             }
 269         }
 270
 271       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 272            !gsi_end_p (si) || analyze_pattern_stmt;)
 273         {
 274           tree vf_vectype;
 275
 276           if (analyze_pattern_stmt)
 277             stmt = pattern_stmt;
 278           else
 279             stmt = gsi_stmt (si);
 280
 281           stmt_info = vinfo_for_stmt (stmt);
 282
 283           if (dump_enabled_p ())
 284             {
 285               dump_printf_loc (MSG_NOTE, vect_location,
 286                                "==> examining statement: ");
 287               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 288             }
 289
 290           gcc_assert (stmt_info);
 291
 292           /* Skip stmts which do not need to be vectorized.  */
 293           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 294                && !STMT_VINFO_LIVE_P (stmt_info))
 295               || gimple_clobber_p (stmt))
 296             {
 297               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 298                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 299                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 300                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 301                 {
 302                   stmt = pattern_stmt;
 303                   stmt_info = vinfo_for_stmt (pattern_stmt);
 304                   if (dump_enabled_p ())
 305                     {
 306                       dump_printf_loc (MSG_NOTE, vect_location,
 307                                        "==> examining pattern statement: ");
 308                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 309                     }
 310                 }
 311               else
 312                 {
 313                   if (dump_enabled_p ())
 314                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 315                   gsi_next (&si);
 316                   continue;
 317                 }
 318             }
 319           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 320                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 321                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 322                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 323             analyze_pattern_stmt = true;
 324
 325           /* If a pattern statement has def stmts, analyze them too.  */
 326           if (is_pattern_stmt_p (stmt_info))
 327             {
 328               if (pattern_def_seq == NULL)
 329                 {
 330                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 331                   pattern_def_si = gsi_start (pattern_def_seq);
 332                 }
 333               else if (!gsi_end_p (pattern_def_si))
 334                 gsi_next (&pattern_def_si);
 335               if (pattern_def_seq != NULL)
 336                 {
 337                   gimple *pattern_def_stmt = NULL;
 338                   stmt_vec_info pattern_def_stmt_info = NULL;
 339
 340                   while (!gsi_end_p (pattern_def_si))
 341                     {
 342                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 343                       pattern_def_stmt_info
 344                         = vinfo_for_stmt (pattern_def_stmt);
 345                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 346                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 347                         break;
 348                       gsi_next (&pattern_def_si);
 349                     }
 350
 351                   if (!gsi_end_p (pattern_def_si))
 352                     {
 353                       if (dump_enabled_p ())
 354                         {
 355                           dump_printf_loc (MSG_NOTE, vect_location,
 356                                            "==> examining pattern def stmt: ");
 357                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 358                                             pattern_def_stmt, 0);
 359                         }
 360
 361                       stmt = pattern_def_stmt;
 362                       stmt_info = pattern_def_stmt_info;
 363                     }
 364                   else
 365                     {
 366                       pattern_def_si = gsi_none ();
 367                       analyze_pattern_stmt = false;
 368                     }
 369                 }
 370               else
 371                 analyze_pattern_stmt = false;
 372             }
 373
 374           if (gimple_get_lhs (stmt) == NULL_TREE
 375               /* MASK_STORE has no lhs, but is ok.  */
 376               && (!is_gimple_call (stmt)
 377                   || !gimple_call_internal_p (stmt)
 378                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 379             {
 380               if (is_gimple_call (stmt))
 381                 {
 382                   /* Ignore calls with no lhs.  These must be calls to
 383                      #pragma omp simd functions, and what vectorization factor
 384                      it really needs can't be determined until
 385                      vectorizable_simd_clone_call.  */
 386                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 387                     {
 388                       pattern_def_seq = NULL;
 389                       gsi_next (&si);
 390                     }
 391                   continue;
 392                 }
 393               if (dump_enabled_p ())
 394                 {
 395                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 396                                    "not vectorized: irregular stmt.");
 397                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 398                                     0);
 399                 }
 400               return false;
 401             }
 402
 403           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 404             {
 405               if (dump_enabled_p ())
 406                 {
 407                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 408                                    "not vectorized: vector stmt in loop:");
 409                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 410                 }
 411               return false;
 412             }
 413
 414           bool_result = false;
 415
 416           if (STMT_VINFO_VECTYPE (stmt_info))
 417             {
 418               /* The only case when a vectype had been already set is for stmts
 419                  that contain a dataref, or for "pattern-stmts" (stmts
 420                  generated by the vectorizer to represent/replace a certain
 421                  idiom).  */
 422               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 423                           || is_pattern_stmt_p (stmt_info)
 424                           || !gsi_end_p (pattern_def_si));
 425               vectype = STMT_VINFO_VECTYPE (stmt_info);
 426             }
 427           else
 428             {
 429               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 430               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 431                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 432               else
 433                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 434
 435               /* Bool ops don't participate in vectorization factor
 436                  computation.  For comparison use compared types to
 437                  compute a factor.  */
 438               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
 439                   && is_gimple_assign (stmt)
 440                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 441                 {
 442                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 443                       || STMT_VINFO_LIVE_P (stmt_info))
 444                     mask_producers.safe_push (stmt_info);
 445                   bool_result = true;
 446
 447                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 448                       == tcc_comparison
 449                       && !VECT_SCALAR_BOOLEAN_TYPE_P
 450                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 451                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 452                   else
 453                     {
 454                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 455                         {
 456                           pattern_def_seq = NULL;
 457                           gsi_next (&si);
 458                         }
 459                       continue;
 460                     }
 461                 }
 462
 463               if (dump_enabled_p ())
 464                 {
 465                   dump_printf_loc (MSG_NOTE, vect_location,
 466                                    "get vectype for scalar type:  ");
 467                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 468                   dump_printf (MSG_NOTE, "\n");
 469                 }
 470               vectype = get_vectype_for_scalar_type (scalar_type);
 471               if (!vectype)
 472                 {
 473                   if (dump_enabled_p ())
 474                     {
 475                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 476                                        "not vectorized: unsupported "
 477                                        "data-type ");
 478                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 479                                          scalar_type);
 480                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 481                     }
 482                   return false;
 483                 }
 484
 485               if (!bool_result)
 486                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 487
 488               if (dump_enabled_p ())
 489                 {
 490                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 491                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 492                   dump_printf (MSG_NOTE, "\n");
 493                 }
 494             }
 495
 496           /* Don't try to compute VF out scalar types if we stmt
 497              produces boolean vector.  Use result vectype instead.  */
 498           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 499             vf_vectype = vectype;
 500           else
 501             {
 502               /* The vectorization factor is according to the smallest
 503                  scalar type (or the largest vector size, but we only
 504                  support one vector size per loop).  */
 505               if (!bool_result)
 506                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 507                                                              &dummy);
 508               if (dump_enabled_p ())
 509                 {
 510                   dump_printf_loc (MSG_NOTE, vect_location,
 511                                    "get vectype for scalar type:  ");
 512                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 513                   dump_printf (MSG_NOTE, "\n");
 514                 }
 515               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 516             }
 517           if (!vf_vectype)
 518             {
 519               if (dump_enabled_p ())
 520                 {
 521                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 522                                    "not vectorized: unsupported data-type ");
 523                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 524                                      scalar_type);
 525                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 526                 }
 527               return false;
 528             }
 529
 530           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 531                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 532             {
 533               if (dump_enabled_p ())
 534                 {
 535                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 536                                    "not vectorized: different sized vector "
 537                                    "types in statement, ");
 538                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 539                                      vectype);
 540                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 541                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 542                                      vf_vectype);
 543                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 544                 }
 545               return false;
 546             }
 547
 548           if (dump_enabled_p ())
 549             {
 550               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 551               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 552               dump_printf (MSG_NOTE, "\n");
 553             }
 554
 555           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 556           if (dump_enabled_p ())
 557             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 558           if (!vectorization_factor
 559               || (nunits > vectorization_factor))
 560             vectorization_factor = nunits;
 561
 562           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 563             {
 564               pattern_def_seq = NULL;
 565               gsi_next (&si);
 566             }
 567         }
 568     }
 569
 570   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 571   if (dump_enabled_p ())
 572     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 573                      vectorization_factor);
 574   if (vectorization_factor <= 1)
 575     {
 576       if (dump_enabled_p ())
 577         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 578                          "not vectorized: unsupported data-type\n");
 579       return false;
 580     }
 581   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 582
 583   for (i = 0; i < mask_producers.length (); i++)
 584     {
 585       tree mask_type = NULL;
 586
 587       stmt = STMT_VINFO_STMT (mask_producers[i]);
 588
 589       if (is_gimple_assign (stmt)
 590           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 591           && !VECT_SCALAR_BOOLEAN_TYPE_P
 592                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 593         {
 594           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 595           mask_type = get_mask_type_for_scalar_type (scalar_type);
 596
 597           if (!mask_type)
 598             {
 599               if (dump_enabled_p ())
 600                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 601                                  "not vectorized: unsupported mask\n");
 602               return false;
 603             }
 604         }
 605       else
 606         {
 607           tree rhs;
 608           ssa_op_iter iter;
 609           gimple *def_stmt;
 610           enum vect_def_type dt;
 611
 612           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 613             {
 614               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 615                                        &def_stmt, &dt, &vectype))
 616                 {
 617                   if (dump_enabled_p ())
 618                     {
 619                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 620                                        "not vectorized: can't compute mask type "
 621                                        "for statement, ");
 622                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 623                                         0);
 624                     }
 625                   return false;
 626                 }
 627
 628               /* No vectype probably means external definition.
 629                  Allow it in case there is another operand which
 630                  allows to determine mask type.  */
 631               if (!vectype)
 632                 continue;
 633
 634               if (!mask_type)
 635                 mask_type = vectype;
 636               else if (TYPE_VECTOR_SUBPARTS (mask_type)
 637                        != TYPE_VECTOR_SUBPARTS (vectype))
 638                 {
 639                   if (dump_enabled_p ())
 640                     {
 641                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 642                                        "not vectorized: different sized masks "
 643                                        "types in statement, ");
 644                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 645                                          mask_type);
 646                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 647                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 648                                          vectype);
 649                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 650                     }
 651                   return false;
 652                 }
 653               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 654                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 655                 {
 656                   if (dump_enabled_p ())
 657                     {
 658                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 659                                        "not vectorized: mixed mask and "
 660                                        "nonmask vector types in statement, ");
 661                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 662                                          mask_type);
 663                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 664                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 665                                          vectype);
 666                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 667                     }
 668                   return false;
 669                 }
 670             }
 671
 672           /* We may compare boolean value loaded as vector of integers.
 673              Fix mask_type in such case.  */
 674           if (mask_type
 675               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 676               && gimple_code (stmt) == GIMPLE_ASSIGN
 677               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 678             mask_type = build_same_sized_truth_vector_type (mask_type);
 679         }
 680
 681       /* No mask_type should mean loop invariant predicate.
 682          This is probably a subject for optimization in
 683          if-conversion.  */
 684       if (!mask_type)
 685         {
 686           if (dump_enabled_p ())
 687             {
 688               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 689                                "not vectorized: can't compute mask type "
 690                                "for statement, ");
 691               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 692                                 0);
 693             }
 694           return false;
 695         }
 696
 697       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 698     }
 699
 700   return true;
 701 }
 702
 703
 704 /* Function vect_is_simple_iv_evolution.
 705
 706    FORNOW: A simple evolution of an induction variables in the loop is
 707    considered a polynomial evolution.  */
 708
 709 static bool
 710 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 711                              tree * step)
 712 {
 713   tree init_expr;
 714   tree step_expr;
 715   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 716   basic_block bb;
 717
 718   /* When there is no evolution in this loop, the evolution function
 719      is not "simple".  */
 720   if (evolution_part == NULL_TREE)
 721     return false;
 722
 723   /* When the evolution is a polynomial of degree >= 2
 724      the evolution function is not "simple".  */
 725   if (tree_is_chrec (evolution_part))
 726     return false;
 727
 728   step_expr = evolution_part;
 729   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 730
 731   if (dump_enabled_p ())
 732     {
 733       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 734       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 735       dump_printf (MSG_NOTE, ",  init: ");
 736       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 737       dump_printf (MSG_NOTE, "\n");
 738     }
 739
 740   *init = init_expr;
 741   *step = step_expr;
 742
 743   if (TREE_CODE (step_expr) != INTEGER_CST
 744       && (TREE_CODE (step_expr) != SSA_NAME
 745           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 746               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 747           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 748               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 749                   || !flag_associative_math)))
 750       && (TREE_CODE (step_expr) != REAL_CST
 751           || !flag_associative_math))
 752     {
 753       if (dump_enabled_p ())
 754         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 755                          "step unknown.\n");
 756       return false;
 757     }
 758
 759   return true;
 760 }
 761
 762 /* Function vect_analyze_scalar_cycles_1.
 763
 764    Examine the cross iteration def-use cycles of scalar variables
 765    in LOOP.  LOOP_VINFO represents the loop that is now being
 766    considered for vectorization (can be LOOP, or an outer-loop
 767    enclosing LOOP).  */
 768
 769 static void
 770 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 771 {
 772   basic_block bb = loop->header;
 773   tree init, step;
 774   auto_vec<gimple *, 64> worklist;
 775   gphi_iterator gsi;
 776   bool double_reduc;
 777
 778   if (dump_enabled_p ())
 779     dump_printf_loc (MSG_NOTE, vect_location,
 780                      "=== vect_analyze_scalar_cycles ===\n");
 781
 782   /* First - identify all inductions.  Reduction detection assumes that all the
 783      inductions have been identified, therefore, this order must not be
 784      changed.  */
 785   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 786     {
 787       gphi *phi = gsi.phi ();
 788       tree access_fn = NULL;
 789       tree def = PHI_RESULT (phi);
 790       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 791
 792       if (dump_enabled_p ())
 793         {
 794           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 795           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 796         }
 797
 798       /* Skip virtual phi's.  The data dependences that are associated with
 799          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 800       if (virtual_operand_p (def))
 801         continue;
 802
 803       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 804
 805       /* Analyze the evolution function.  */
 806       access_fn = analyze_scalar_evolution (loop, def);
 807       if (access_fn)
 808         {
 809           STRIP_NOPS (access_fn);
 810           if (dump_enabled_p ())
 811             {
 812               dump_printf_loc (MSG_NOTE, vect_location,
 813                                "Access function of PHI: ");
 814               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 815               dump_printf (MSG_NOTE, "\n");
 816             }
 817           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 818             = initial_condition_in_loop_num (access_fn, loop->num);
 819           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 820             = evolution_part_in_loop_num (access_fn, loop->num);
 821         }
 822
 823       if (!access_fn
 824           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 825           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 826               && TREE_CODE (step) != INTEGER_CST))
 827         {
 828           worklist.safe_push (phi);
 829           continue;
 830         }
 831
 832       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 833                   != NULL_TREE);
 834       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 835
 836       if (dump_enabled_p ())
 837         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 838       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 839     }
 840
 841
 842   /* Second - identify all reductions and nested cycles.  */
 843   while (worklist.length () > 0)
 844     {
 845       gimple *phi = worklist.pop ();
 846       tree def = PHI_RESULT (phi);
 847       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 848       gimple *reduc_stmt;
 849
 850       if (dump_enabled_p ())
 851         {
 852           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 853           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 854         }
 855
 856       gcc_assert (!virtual_operand_p (def)
 857                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 858
 859       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 860                                                 &double_reduc, false);
 861       if (reduc_stmt)
 862         {
 863           if (double_reduc)
 864             {
 865               if (dump_enabled_p ())
 866                 dump_printf_loc (MSG_NOTE, vect_location,
 867                                  "Detected double reduction.\n");
 868
 869               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 870               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 871                                                     vect_double_reduction_def;
 872             }
 873           else
 874             {
 875               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 876                 {
 877                   if (dump_enabled_p ())
 878                     dump_printf_loc (MSG_NOTE, vect_location,
 879                                      "Detected vectorizable nested cycle.\n");
 880
 881                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 882                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 883                                                              vect_nested_cycle;
 884                 }
 885               else
 886                 {
 887                   if (dump_enabled_p ())
 888                     dump_printf_loc (MSG_NOTE, vect_location,
 889                                      "Detected reduction.\n");
 890
 891                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 892                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 893                                                            vect_reduction_def;
 894                   /* Store the reduction cycles for possible vectorization in
 895                      loop-aware SLP if it was not detected as reduction
 896                      chain.  */
 897                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 898                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 899                 }
 900             }
 901         }
 902       else
 903         if (dump_enabled_p ())
 904           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 905                            "Unknown def-use cycle pattern.\n");
 906     }
 907 }
 908
 909
 910 /* Function vect_analyze_scalar_cycles.
 911
 912    Examine the cross iteration def-use cycles of scalar variables, by
 913    analyzing the loop-header PHIs of scalar variables.  Classify each
 914    cycle as one of the following: invariant, induction, reduction, unknown.
 915    We do that for the loop represented by LOOP_VINFO, and also to its
 916    inner-loop, if exists.
 917    Examples for scalar cycles:
 918
 919    Example1: reduction:
 920
 921               loop1:
 922               for (i=0; i<N; i++)
 923                  sum += a[i];
 924
 925    Example2: induction:
 926
 927               loop2:
 928               for (i=0; i<N; i++)
 929                  a[i] = i;  */
 930
 931 static void
 932 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 933 {
 934   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 935
 936   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 937
 938   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 939      Reductions in such inner-loop therefore have different properties than
 940      the reductions in the nest that gets vectorized:
 941      1. When vectorized, they are executed in the same order as in the original
 942         scalar loop, so we can't change the order of computation when
 943         vectorizing them.
 944      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 945         current checks are too strict.  */
 946
 947   if (loop->inner)
 948     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 949 }
 950
 951 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 952
 953 static void
 954 vect_fixup_reduc_chain (gimple *stmt)
 955 {
 956   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 957   gimple *stmtp;
 958   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 959               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 960   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 961   do
 962     {
 963       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 964       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 965       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 966       if (stmt)
 967         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 968           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 969     }
 970   while (stmt);
 971   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 972 }
 973
 974 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 975
 976 static void
 977 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 978 {
 979   gimple *first;
 980   unsigned i;
 981
 982   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 983     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 984       {
 985         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 986         while (next)
 987           {
 988             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 989               break;
 990             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 991           }
 992         /* If not all stmt in the chain are patterns try to handle
 993            the chain without patterns.  */
 994         if (! next)
 995           {
 996             vect_fixup_reduc_chain (first);
 997             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 998               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 999           }
1000       }
1001 }
1002
1003 /* Function vect_get_loop_niters.
1004
1005    Determine how many iterations the loop is executed and place it
1006    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1007    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1008    niter information holds in ASSUMPTIONS.
1009
1010    Return the loop exit condition.  */
1011
1012
1013 static gcond *
1014 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1015                       tree *number_of_iterations, tree *number_of_iterationsm1)
1016 {
1017   edge exit = single_exit (loop);
1018   struct tree_niter_desc niter_desc;
1019   tree niter_assumptions, niter, may_be_zero;
1020   gcond *cond = get_loop_exit_condition (loop);
1021
1022   *assumptions = boolean_true_node;
1023   *number_of_iterationsm1 = chrec_dont_know;
1024   *number_of_iterations = chrec_dont_know;
1025   if (dump_enabled_p ())
1026     dump_printf_loc (MSG_NOTE, vect_location,
1027                      "=== get_loop_niters ===\n");
1028
1029   if (!exit)
1030     return cond;
1031
1032   niter = chrec_dont_know;
1033   may_be_zero = NULL_TREE;
1034   niter_assumptions = boolean_true_node;
1035   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1036       || chrec_contains_undetermined (niter_desc.niter))
1037     return cond;
1038
1039   niter_assumptions = niter_desc.assumptions;
1040   may_be_zero = niter_desc.may_be_zero;
1041   niter = niter_desc.niter;
1042
1043   if (may_be_zero && integer_zerop (may_be_zero))
1044     may_be_zero = NULL_TREE;
1045
1046   if (may_be_zero)
1047     {
1048       if (COMPARISON_CLASS_P (may_be_zero))
1049         {
1050           /* Try to combine may_be_zero with assumptions, this can simplify
1051              computation of niter expression.  */
1052           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1053             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1054                                              niter_assumptions,
1055                                              fold_build1 (TRUTH_NOT_EXPR,
1056                                                           boolean_type_node,
1057                                                           may_be_zero));
1058           else
1059             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1060                                  build_int_cst (TREE_TYPE (niter), 0), niter);
1061
1062           may_be_zero = NULL_TREE;
1063         }
1064       else if (integer_nonzerop (may_be_zero))
1065         {
1066           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1067           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1068           return cond;
1069         }
1070       else
1071         return cond;
1072     }
1073
1074   *assumptions = niter_assumptions;
1075   *number_of_iterationsm1 = niter;
1076
1077   /* We want the number of loop header executions which is the number
1078      of latch executions plus one.
1079      ???  For UINT_MAX latch executions this number overflows to zero
1080      for loops like do { n++; } while (n != 0);  */
1081   if (niter && !chrec_contains_undetermined (niter))
1082     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1083                           build_int_cst (TREE_TYPE (niter), 1));
1084   *number_of_iterations = niter;
1085
1086   return cond;
1087 }
1088
1089 /* Function bb_in_loop_p
1090
1091    Used as predicate for dfs order traversal of the loop bbs.  */
1092
1093 static bool
1094 bb_in_loop_p (const_basic_block bb, const void *data)
1095 {
1096   const struct loop *const loop = (const struct loop *)data;
1097   if (flow_bb_inside_loop_p (loop, bb))
1098     return true;
1099   return false;
1100 }
1101
1102
1103 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1104    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1105
1106 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1107   : vec_info (vec_info::loop, init_cost (loop_in)),
1108     loop (loop_in),
1109     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1110     num_itersm1 (NULL_TREE),
1111     num_iters (NULL_TREE),
1112     num_iters_unchanged (NULL_TREE),
1113     num_iters_assumptions (NULL_TREE),
1114     th (0),
1115     versioning_threshold (0),
1116     vectorization_factor (0),
1117     max_vectorization_factor (0),
1118     unaligned_dr (NULL),
1119     peeling_for_alignment (0),
1120     ptr_mask (0),
1121     slp_unrolling_factor (1),
1122     single_scalar_iteration_cost (0),
1123     vectorizable (false),
1124     peeling_for_gaps (false),
1125     peeling_for_niter (false),
1126     operands_swapped (false),
1127     no_data_dependencies (false),
1128     has_mask_store (false),
1129     scalar_loop (NULL),
1130     orig_loop_info (NULL)
1131 {
1132   /* Create/Update stmt_info for all stmts in the loop.  */
1133   basic_block *body = get_loop_body (loop);
1134   for (unsigned int i = 0; i < loop->num_nodes; i++)
1135     {
1136       basic_block bb = body[i];
1137       gimple_stmt_iterator si;
1138
1139       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1140         {
1141           gimple *phi = gsi_stmt (si);
1142           gimple_set_uid (phi, 0);
1143           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1144         }
1145
1146       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1147         {
1148           gimple *stmt = gsi_stmt (si);
1149           gimple_set_uid (stmt, 0);
1150           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1151         }
1152     }
1153   free (body);
1154
1155   /* CHECKME: We want to visit all BBs before their successors (except for
1156      latch blocks, for which this assertion wouldn't hold).  In the simple
1157      case of the loop forms we allow, a dfs order of the BBs would the same
1158      as reversed postorder traversal, so we are safe.  */
1159
1160   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1161                                           bbs, loop->num_nodes, loop);
1162   gcc_assert (nbbs == loop->num_nodes);
1163 }
1164
1165
1166 /* Free all memory used by the _loop_vec_info, as well as all the
1167    stmt_vec_info structs of all the stmts in the loop.  */
1168
1169 _loop_vec_info::~_loop_vec_info ()
1170 {
1171   int nbbs;
1172   gimple_stmt_iterator si;
1173   int j;
1174
1175   nbbs = loop->num_nodes;
1176   for (j = 0; j < nbbs; j++)
1177     {
1178       basic_block bb = bbs[j];
1179       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1180         free_stmt_vec_info (gsi_stmt (si));
1181
1182       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1183         {
1184           gimple *stmt = gsi_stmt (si);
1185
1186           /* We may have broken canonical form by moving a constant
1187              into RHS1 of a commutative op.  Fix such occurrences.  */
1188           if (operands_swapped && is_gimple_assign (stmt))
1189             {
1190               enum tree_code code = gimple_assign_rhs_code (stmt);
1191
1192               if ((code == PLUS_EXPR
1193                    || code == POINTER_PLUS_EXPR
1194                    || code == MULT_EXPR)
1195                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1196                 swap_ssa_operands (stmt,
1197                                    gimple_assign_rhs1_ptr (stmt),
1198                                    gimple_assign_rhs2_ptr (stmt));
1199               else if (code == COND_EXPR
1200                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1201                 {
1202                   tree cond_expr = gimple_assign_rhs1 (stmt);
1203                   enum tree_code cond_code = TREE_CODE (cond_expr);
1204
1205                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1206                     {
1207                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1208                                                                   0));
1209                       cond_code = invert_tree_comparison (cond_code,
1210                                                           honor_nans);
1211                       if (cond_code != ERROR_MARK)
1212                         {
1213                           TREE_SET_CODE (cond_expr, cond_code);
1214                           swap_ssa_operands (stmt,
1215                                              gimple_assign_rhs2_ptr (stmt),
1216                                              gimple_assign_rhs3_ptr (stmt));
1217                         }
1218                     }
1219                 }
1220             }
1221
1222           /* Free stmt_vec_info.  */
1223           free_stmt_vec_info (stmt);
1224           gsi_next (&si);
1225         }
1226     }
1227
1228   free (bbs);
1229
1230   loop->aux = NULL;
1231 }
1232
1233
1234 /* Calculate the cost of one scalar iteration of the loop.  */
1235 static void
1236 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1237 {
1238   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1239   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1240   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1241   int innerloop_iters, i;
1242
1243   /* Count statements in scalar loop.  Using this as scalar cost for a single
1244      iteration for now.
1245
1246      TODO: Add outer loop support.
1247
1248      TODO: Consider assigning different costs to different scalar
1249      statements.  */
1250
1251   /* FORNOW.  */
1252   innerloop_iters = 1;
1253   if (loop->inner)
1254     innerloop_iters = 50; /* FIXME */
1255
1256   for (i = 0; i < nbbs; i++)
1257     {
1258       gimple_stmt_iterator si;
1259       basic_block bb = bbs[i];
1260
1261       if (bb->loop_father == loop->inner)
1262         factor = innerloop_iters;
1263       else
1264         factor = 1;
1265
1266       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1267         {
1268           gimple *stmt = gsi_stmt (si);
1269           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1270
1271           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1272             continue;
1273
1274           /* Skip stmts that are not vectorized inside the loop.  */
1275           if (stmt_info
1276               && !STMT_VINFO_RELEVANT_P (stmt_info)
1277               && (!STMT_VINFO_LIVE_P (stmt_info)
1278                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1279               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1280             continue;
1281
1282           vect_cost_for_stmt kind;
1283           if (STMT_VINFO_DATA_REF (stmt_info))
1284             {
1285               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1286                kind = scalar_load;
1287              else
1288                kind = scalar_store;
1289             }
1290           else
1291             kind = scalar_stmt;
1292
1293           scalar_single_iter_cost
1294             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1295                                  factor, kind, stmt_info, 0, vect_prologue);
1296         }
1297     }
1298   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1299     = scalar_single_iter_cost;
1300 }
1301
1302
1303 /* Function vect_analyze_loop_form_1.
1304
1305    Verify that certain CFG restrictions hold, including:
1306    - the loop has a pre-header
1307    - the loop has a single entry and exit
1308    - the loop exit condition is simple enough
1309    - the number of iterations can be analyzed, i.e, a countable loop.  The
1310      niter could be analyzed under some assumptions.  */
1311
1312 bool
1313 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1314                           tree *assumptions, tree *number_of_iterationsm1,
1315                           tree *number_of_iterations, gcond **inner_loop_cond)
1316 {
1317   if (dump_enabled_p ())
1318     dump_printf_loc (MSG_NOTE, vect_location,
1319                      "=== vect_analyze_loop_form ===\n");
1320
1321   /* Different restrictions apply when we are considering an inner-most loop,
1322      vs. an outer (nested) loop.
1323      (FORNOW. May want to relax some of these restrictions in the future).  */
1324
1325   if (!loop->inner)
1326     {
1327       /* Inner-most loop.  We currently require that the number of BBs is
1328          exactly 2 (the header and latch).  Vectorizable inner-most loops
1329          look like this:
1330
1331                         (pre-header)
1332                            |
1333                           header <--------+
1334                            | |            |
1335                            | +--> latch --+
1336                            |
1337                         (exit-bb)  */
1338
1339       if (loop->num_nodes != 2)
1340         {
1341           if (dump_enabled_p ())
1342             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1343                              "not vectorized: control flow in loop.\n");
1344           return false;
1345         }
1346
1347       if (empty_block_p (loop->header))
1348         {
1349           if (dump_enabled_p ())
1350             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1351                              "not vectorized: empty loop.\n");
1352           return false;
1353         }
1354     }
1355   else
1356     {
1357       struct loop *innerloop = loop->inner;
1358       edge entryedge;
1359
1360       /* Nested loop. We currently require that the loop is doubly-nested,
1361          contains a single inner loop, and the number of BBs is exactly 5.
1362          Vectorizable outer-loops look like this:
1363
1364                         (pre-header)
1365                            |
1366                           header <---+
1367                            |         |
1368                           inner-loop |
1369                            |         |
1370                           tail ------+
1371                            |
1372                         (exit-bb)
1373
1374          The inner-loop has the properties expected of inner-most loops
1375          as described above.  */
1376
1377       if ((loop->inner)->inner || (loop->inner)->next)
1378         {
1379           if (dump_enabled_p ())
1380             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1381                              "not vectorized: multiple nested loops.\n");
1382           return false;
1383         }
1384
1385       if (loop->num_nodes != 5)
1386         {
1387           if (dump_enabled_p ())
1388             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1389                              "not vectorized: control flow in loop.\n");
1390           return false;
1391         }
1392
1393       entryedge = loop_preheader_edge (innerloop);
1394       if (entryedge->src != loop->header
1395           || !single_exit (innerloop)
1396           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1397         {
1398           if (dump_enabled_p ())
1399             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1400                              "not vectorized: unsupported outerloop form.\n");
1401           return false;
1402         }
1403
1404       /* Analyze the inner-loop.  */
1405       tree inner_niterm1, inner_niter, inner_assumptions;
1406       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1407                                       &inner_assumptions, &inner_niterm1,
1408                                       &inner_niter, NULL)
1409           /* Don't support analyzing niter under assumptions for inner
1410              loop.  */
1411           || !integer_onep (inner_assumptions))
1412         {
1413           if (dump_enabled_p ())
1414             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1415                              "not vectorized: Bad inner loop.\n");
1416           return false;
1417         }
1418
1419       if (!expr_invariant_in_loop_p (loop, inner_niter))
1420         {
1421           if (dump_enabled_p ())
1422             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1423                              "not vectorized: inner-loop count not"
1424                              " invariant.\n");
1425           return false;
1426         }
1427
1428       if (dump_enabled_p ())
1429         dump_printf_loc (MSG_NOTE, vect_location,
1430                          "Considering outer-loop vectorization.\n");
1431     }
1432
1433   if (!single_exit (loop)
1434       || EDGE_COUNT (loop->header->preds) != 2)
1435     {
1436       if (dump_enabled_p ())
1437         {
1438           if (!single_exit (loop))
1439             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1440                              "not vectorized: multiple exits.\n");
1441           else if (EDGE_COUNT (loop->header->preds) != 2)
1442             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1443                              "not vectorized: too many incoming edges.\n");
1444         }
1445       return false;
1446     }
1447
1448   /* We assume that the loop exit condition is at the end of the loop. i.e,
1449      that the loop is represented as a do-while (with a proper if-guard
1450      before the loop if needed), where the loop header contains all the
1451      executable statements, and the latch is empty.  */
1452   if (!empty_block_p (loop->latch)
1453       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1454     {
1455       if (dump_enabled_p ())
1456         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1457                          "not vectorized: latch block not empty.\n");
1458       return false;
1459     }
1460
1461   /* Make sure the exit is not abnormal.  */
1462   edge e = single_exit (loop);
1463   if (e->flags & EDGE_ABNORMAL)
1464     {
1465       if (dump_enabled_p ())
1466         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1467                          "not vectorized: abnormal loop exit edge.\n");
1468       return false;
1469     }
1470
1471   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1472                                      number_of_iterationsm1);
1473   if (!*loop_cond)
1474     {
1475       if (dump_enabled_p ())
1476         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1477                          "not vectorized: complicated exit condition.\n");
1478       return false;
1479     }
1480
1481   if (integer_zerop (*assumptions)
1482       || !*number_of_iterations
1483       || chrec_contains_undetermined (*number_of_iterations))
1484     {
1485       if (dump_enabled_p ())
1486         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1487                          "not vectorized: number of iterations cannot be "
1488                          "computed.\n");
1489       return false;
1490     }
1491
1492   if (integer_zerop (*number_of_iterations))
1493     {
1494       if (dump_enabled_p ())
1495         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1496                          "not vectorized: number of iterations = 0.\n");
1497       return false;
1498     }
1499
1500   return true;
1501 }
1502
1503 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1504
1505 loop_vec_info
1506 vect_analyze_loop_form (struct loop *loop)
1507 {
1508   tree assumptions, number_of_iterations, number_of_iterationsm1;
1509   gcond *loop_cond, *inner_loop_cond = NULL;
1510
1511   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1512                                   &assumptions, &number_of_iterationsm1,
1513                                   &number_of_iterations, &inner_loop_cond))
1514     return NULL;
1515
1516   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1517   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1518   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1519   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1520   if (!integer_onep (assumptions))
1521     {
1522       /* We consider to vectorize this loop by versioning it under
1523          some assumptions.  In order to do this, we need to clear
1524          existing information computed by scev and niter analyzer.  */
1525       scev_reset_htab ();
1526       free_numbers_of_iterations_estimates (loop);
1527       /* Also set flag for this loop so that following scev and niter
1528          analysis are done under the assumptions.  */
1529       loop_constraint_set (loop, LOOP_C_FINITE);
1530       /* Also record the assumptions for versioning.  */
1531       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1532     }
1533
1534   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1535     {
1536       if (dump_enabled_p ())
1537         {
1538           dump_printf_loc (MSG_NOTE, vect_location,
1539                            "Symbolic number of iterations is ");
1540           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1541           dump_printf (MSG_NOTE, "\n");
1542         }
1543     }
1544
1545   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1546   if (inner_loop_cond)
1547     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1548       = loop_exit_ctrl_vec_info_type;
1549
1550   gcc_assert (!loop->aux);
1551   loop->aux = loop_vinfo;
1552   return loop_vinfo;
1553 }
1554
1555
1556
1557 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1558    statements update the vectorization factor.  */
1559
1560 static void
1561 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1562 {
1563   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1564   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1565   int nbbs = loop->num_nodes;
1566   unsigned int vectorization_factor;
1567   int i;
1568
1569   if (dump_enabled_p ())
1570     dump_printf_loc (MSG_NOTE, vect_location,
1571                      "=== vect_update_vf_for_slp ===\n");
1572
1573   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1574   gcc_assert (vectorization_factor != 0);
1575
1576   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1577      vectorization factor of the loop is the unrolling factor required by
1578      the SLP instances.  If that unrolling factor is 1, we say, that we
1579      perform pure SLP on loop - cross iteration parallelism is not
1580      exploited.  */
1581   bool only_slp_in_loop = true;
1582   for (i = 0; i < nbbs; i++)
1583     {
1584       basic_block bb = bbs[i];
1585       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1586            gsi_next (&si))
1587         {
1588           gimple *stmt = gsi_stmt (si);
1589           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1590           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1591               && STMT_VINFO_RELATED_STMT (stmt_info))
1592             {
1593               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1594               stmt_info = vinfo_for_stmt (stmt);
1595             }
1596           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1597                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1598               && !PURE_SLP_STMT (stmt_info))
1599             /* STMT needs both SLP and loop-based vectorization.  */
1600             only_slp_in_loop = false;
1601         }
1602     }
1603
1604   if (only_slp_in_loop)
1605     {
1606       dump_printf_loc (MSG_NOTE, vect_location,
1607                        "Loop contains only SLP stmts\n");
1608       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1609     }
1610   else
1611     {
1612       dump_printf_loc (MSG_NOTE, vect_location,
1613                        "Loop contains SLP and non-SLP stmts\n");
1614       vectorization_factor
1615         = least_common_multiple (vectorization_factor,
1616                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1617     }
1618
1619   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1620   if (dump_enabled_p ())
1621     dump_printf_loc (MSG_NOTE, vect_location,
1622                      "Updating vectorization factor to %d\n",
1623                      vectorization_factor);
1624 }
1625
1626 /* Function vect_analyze_loop_operations.
1627
1628    Scan the loop stmts and make sure they are all vectorizable.  */
1629
1630 static bool
1631 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1632 {
1633   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1634   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1635   int nbbs = loop->num_nodes;
1636   int i;
1637   stmt_vec_info stmt_info;
1638   bool need_to_vectorize = false;
1639   bool ok;
1640
1641   if (dump_enabled_p ())
1642     dump_printf_loc (MSG_NOTE, vect_location,
1643                      "=== vect_analyze_loop_operations ===\n");
1644
1645   for (i = 0; i < nbbs; i++)
1646     {
1647       basic_block bb = bbs[i];
1648
1649       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1650            gsi_next (&si))
1651         {
1652           gphi *phi = si.phi ();
1653           ok = true;
1654
1655           stmt_info = vinfo_for_stmt (phi);
1656           if (dump_enabled_p ())
1657             {
1658               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1659               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1660             }
1661           if (virtual_operand_p (gimple_phi_result (phi)))
1662             continue;
1663
1664           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1665              (i.e., a phi in the tail of the outer-loop).  */
1666           if (! is_loop_header_bb_p (bb))
1667             {
1668               /* FORNOW: we currently don't support the case that these phis
1669                  are not used in the outerloop (unless it is double reduction,
1670                  i.e., this phi is vect_reduction_def), cause this case
1671                  requires to actually do something here.  */
1672               if (STMT_VINFO_LIVE_P (stmt_info)
1673                   && STMT_VINFO_DEF_TYPE (stmt_info)
1674                      != vect_double_reduction_def)
1675                 {
1676                   if (dump_enabled_p ())
1677                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1678                                      "Unsupported loop-closed phi in "
1679                                      "outer-loop.\n");
1680                   return false;
1681                 }
1682
1683               /* If PHI is used in the outer loop, we check that its operand
1684                  is defined in the inner loop.  */
1685               if (STMT_VINFO_RELEVANT_P (stmt_info))
1686                 {
1687                   tree phi_op;
1688                   gimple *op_def_stmt;
1689
1690                   if (gimple_phi_num_args (phi) != 1)
1691                     return false;
1692
1693                   phi_op = PHI_ARG_DEF (phi, 0);
1694                   if (TREE_CODE (phi_op) != SSA_NAME)
1695                     return false;
1696
1697                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1698                   if (gimple_nop_p (op_def_stmt)
1699                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1700                       || !vinfo_for_stmt (op_def_stmt))
1701                     return false;
1702
1703                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1704                         != vect_used_in_outer
1705                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1706                            != vect_used_in_outer_by_reduction)
1707                     return false;
1708                 }
1709
1710               continue;
1711             }
1712
1713           gcc_assert (stmt_info);
1714
1715           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1716                || STMT_VINFO_LIVE_P (stmt_info))
1717               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1718             {
1719               /* A scalar-dependence cycle that we don't support.  */
1720               if (dump_enabled_p ())
1721                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1722                                  "not vectorized: scalar dependence cycle.\n");
1723               return false;
1724             }
1725
1726           if (STMT_VINFO_RELEVANT_P (stmt_info))
1727             {
1728               need_to_vectorize = true;
1729               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1730                   && ! PURE_SLP_STMT (stmt_info))
1731                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1732               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1733                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1734                        && ! PURE_SLP_STMT (stmt_info))
1735                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1736             }
1737
1738           if (ok && STMT_VINFO_LIVE_P (stmt_info))
1739             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1740
1741           if (!ok)
1742             {
1743               if (dump_enabled_p ())
1744                 {
1745                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1746                                    "not vectorized: relevant phi not "
1747                                    "supported: ");
1748                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1749                 }
1750               return false;
1751             }
1752         }
1753
1754       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1755            gsi_next (&si))
1756         {
1757           gimple *stmt = gsi_stmt (si);
1758           if (!gimple_clobber_p (stmt)
1759               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1760             return false;
1761         }
1762     } /* bbs */
1763
1764   /* All operations in the loop are either irrelevant (deal with loop
1765      control, or dead), or only used outside the loop and can be moved
1766      out of the loop (e.g. invariants, inductions).  The loop can be
1767      optimized away by scalar optimizations.  We're better off not
1768      touching this loop.  */
1769   if (!need_to_vectorize)
1770     {
1771       if (dump_enabled_p ())
1772         dump_printf_loc (MSG_NOTE, vect_location,
1773                          "All the computation can be taken out of the loop.\n");
1774       if (dump_enabled_p ())
1775         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1776                          "not vectorized: redundant loop. no profit to "
1777                          "vectorize.\n");
1778       return false;
1779     }
1780
1781   return true;
1782 }
1783
1784
1785 /* Function vect_analyze_loop_2.
1786
1787    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1788    for it.  The different analyses will record information in the
1789    loop_vec_info struct.  */
1790 static bool
1791 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1792 {
1793   bool ok;
1794   int max_vf = MAX_VECTORIZATION_FACTOR;
1795   int min_vf = 2;
1796   unsigned int n_stmts = 0;
1797
1798   /* The first group of checks is independent of the vector size.  */
1799   fatal = true;
1800
1801   /* Find all data references in the loop (which correspond to vdefs/vuses)
1802      and analyze their evolution in the loop.  */
1803
1804   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1805
1806   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1807   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1808     {
1809       if (dump_enabled_p ())
1810         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1811                          "not vectorized: loop nest containing two "
1812                          "or more consecutive inner loops cannot be "
1813                          "vectorized\n");
1814       return false;
1815     }
1816
1817   for (unsigned i = 0; i < loop->num_nodes; i++)
1818     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1819          !gsi_end_p (gsi); gsi_next (&gsi))
1820       {
1821         gimple *stmt = gsi_stmt (gsi);
1822         if (is_gimple_debug (stmt))
1823           continue;
1824         ++n_stmts;
1825         if (!find_data_references_in_stmt (loop, stmt,
1826                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1827           {
1828             if (is_gimple_call (stmt) && loop->safelen)
1829               {
1830                 tree fndecl = gimple_call_fndecl (stmt), op;
1831                 if (fndecl != NULL_TREE)
1832                   {
1833                     cgraph_node *node = cgraph_node::get (fndecl);
1834                     if (node != NULL && node->simd_clones != NULL)
1835                       {
1836                         unsigned int j, n = gimple_call_num_args (stmt);
1837                         for (j = 0; j < n; j++)
1838                           {
1839                             op = gimple_call_arg (stmt, j);
1840                             if (DECL_P (op)
1841                                 || (REFERENCE_CLASS_P (op)
1842                                     && get_base_address (op)))
1843                               break;
1844                           }
1845                         op = gimple_call_lhs (stmt);
1846                         /* Ignore #pragma omp declare simd functions
1847                            if they don't have data references in the
1848                            call stmt itself.  */
1849                         if (j == n
1850                             && !(op
1851                                  && (DECL_P (op)
1852                                      || (REFERENCE_CLASS_P (op)
1853                                          && get_base_address (op)))))
1854                           continue;
1855                       }
1856                   }
1857               }
1858             if (dump_enabled_p ())
1859               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1860                                "not vectorized: loop contains function "
1861                                "calls or data references that cannot "
1862                                "be analyzed\n");
1863             return false;
1864           }
1865       }
1866
1867   /* Analyze the data references and also adjust the minimal
1868      vectorization factor according to the loads and stores.  */
1869
1870   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1871   if (!ok)
1872     {
1873       if (dump_enabled_p ())
1874         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1875                          "bad data references.\n");
1876       return false;
1877     }
1878
1879   /* Classify all cross-iteration scalar data-flow cycles.
1880      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1881   vect_analyze_scalar_cycles (loop_vinfo);
1882
1883   vect_pattern_recog (loop_vinfo);
1884
1885   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1886
1887   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1888      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1889
1890   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1891   if (!ok)
1892     {
1893       if (dump_enabled_p ())
1894         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1895                          "bad data access.\n");
1896       return false;
1897     }
1898
1899   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1900
1901   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1902   if (!ok)
1903     {
1904       if (dump_enabled_p ())
1905         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1906                          "unexpected pattern.\n");
1907       return false;
1908     }
1909
1910   /* While the rest of the analysis below depends on it in some way.  */
1911   fatal = false;
1912
1913   /* Analyze data dependences between the data-refs in the loop
1914      and adjust the maximum vectorization factor according to
1915      the dependences.
1916      FORNOW: fail at the first data dependence that we encounter.  */
1917
1918   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1919   if (!ok
1920       || max_vf < min_vf)
1921     {
1922       if (dump_enabled_p ())
1923             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1924                              "bad data dependence.\n");
1925       return false;
1926     }
1927   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1928
1929   ok = vect_determine_vectorization_factor (loop_vinfo);
1930   if (!ok)
1931     {
1932       if (dump_enabled_p ())
1933         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1934                          "can't determine vectorization factor.\n");
1935       return false;
1936     }
1937   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1938     {
1939       if (dump_enabled_p ())
1940         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1941                          "bad data dependence.\n");
1942       return false;
1943     }
1944
1945   /* Compute the scalar iteration cost.  */
1946   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1947
1948   int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1949   HOST_WIDE_INT estimated_niter;
1950   unsigned th;
1951   int min_scalar_loop_bound;
1952
1953   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1954   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1955   if (!ok)
1956     return false;
1957
1958   /* If there are any SLP instances mark them as pure_slp.  */
1959   bool slp = vect_make_slp_decision (loop_vinfo);
1960   if (slp)
1961     {
1962       /* Find stmts that need to be both vectorized and SLPed.  */
1963       vect_detect_hybrid_slp (loop_vinfo);
1964
1965       /* Update the vectorization factor based on the SLP decision.  */
1966       vect_update_vf_for_slp (loop_vinfo);
1967     }
1968
1969   /* This is the point where we can re-start analysis with SLP forced off.  */
1970 start_over:
1971
1972   /* Now the vectorization factor is final.  */
1973   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1974   gcc_assert (vectorization_factor != 0);
1975
1976   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1977     dump_printf_loc (MSG_NOTE, vect_location,
1978                      "vectorization_factor = %d, niters = "
1979                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1980                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1981
1982   HOST_WIDE_INT max_niter
1983     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1984   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1985        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1986       || (max_niter != -1
1987           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1988     {
1989       if (dump_enabled_p ())
1990         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1991                          "not vectorized: iteration count smaller than "
1992                          "vectorization factor.\n");
1993       return false;
1994     }
1995
1996   /* Analyze the alignment of the data-refs in the loop.
1997      Fail if a data reference is found that cannot be vectorized.  */
1998
1999   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2000   if (!ok)
2001     {
2002       if (dump_enabled_p ())
2003         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2004                          "bad data alignment.\n");
2005       return false;
2006     }
2007
2008   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2009      It is important to call pruning after vect_analyze_data_ref_accesses,
2010      since we use grouping information gathered by interleaving analysis.  */
2011   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2012   if (!ok)
2013     return false;
2014
2015   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2016      vectorization.  */
2017   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2018     {
2019     /* This pass will decide on using loop versioning and/or loop peeling in
2020        order to enhance the alignment of data references in the loop.  */
2021     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2022     if (!ok)
2023       {
2024         if (dump_enabled_p ())
2025           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2026                            "bad data alignment.\n");
2027         return false;
2028       }
2029     }
2030
2031   if (slp)
2032     {
2033       /* Analyze operations in the SLP instances.  Note this may
2034          remove unsupported SLP instances which makes the above
2035          SLP kind detection invalid.  */
2036       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2037       vect_slp_analyze_operations (loop_vinfo);
2038       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2039         goto again;
2040     }
2041
2042   /* Scan all the remaining operations in the loop that are not subject
2043      to SLP and make sure they are vectorizable.  */
2044   ok = vect_analyze_loop_operations (loop_vinfo);
2045   if (!ok)
2046     {
2047       if (dump_enabled_p ())
2048         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2049                          "bad operation or unsupported loop bound.\n");
2050       return false;
2051     }
2052
2053   /* If epilog loop is required because of data accesses with gaps,
2054      one additional iteration needs to be peeled.  Check if there is
2055      enough iterations for vectorization.  */
2056   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2057       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2058     {
2059       int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2060       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2061
2062       if (wi::to_widest (scalar_niters) < vf)
2063         {
2064           if (dump_enabled_p ())
2065             dump_printf_loc (MSG_NOTE, vect_location,
2066                              "loop has no enough iterations to support"
2067                              " peeling for gaps.\n");
2068           return false;
2069         }
2070     }
2071
2072   /* Analyze cost.  Decide if worth while to vectorize.  */
2073   int min_profitable_estimate, min_profitable_iters;
2074   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2075                                       &min_profitable_estimate);
2076
2077   if (min_profitable_iters < 0)
2078     {
2079       if (dump_enabled_p ())
2080         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2081                          "not vectorized: vectorization not profitable.\n");
2082       if (dump_enabled_p ())
2083         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2084                          "not vectorized: vector version will never be "
2085                          "profitable.\n");
2086       goto again;
2087     }
2088
2089   min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2090                            * vectorization_factor);
2091
2092   /* Use the cost model only if it is more conservative than user specified
2093      threshold.  */
2094   th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2095
2096   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2097
2098   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2099       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2100     {
2101       if (dump_enabled_p ())
2102         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2103                          "not vectorized: vectorization not profitable.\n");
2104       if (dump_enabled_p ())
2105         dump_printf_loc (MSG_NOTE, vect_location,
2106                          "not vectorized: iteration count smaller than user "
2107                          "specified loop bound parameter or minimum profitable "
2108                          "iterations (whichever is more conservative).\n");
2109       goto again;
2110     }
2111
2112   estimated_niter
2113     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2114   if (estimated_niter == -1)
2115     estimated_niter = max_niter;
2116   if (estimated_niter != -1
2117       && ((unsigned HOST_WIDE_INT) estimated_niter
2118           < MAX (th, (unsigned) min_profitable_estimate)))
2119     {
2120       if (dump_enabled_p ())
2121         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2122                          "not vectorized: estimated iteration count too "
2123                          "small.\n");
2124       if (dump_enabled_p ())
2125         dump_printf_loc (MSG_NOTE, vect_location,
2126                          "not vectorized: estimated iteration count smaller "
2127                          "than specified loop bound parameter or minimum "
2128                          "profitable iterations (whichever is more "
2129                          "conservative).\n");
2130       goto again;
2131     }
2132
2133   /* Decide whether we need to create an epilogue loop to handle
2134      remaining scalar iterations.  */
2135   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo)
2136          / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2137         * LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2138
2139   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2140       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2141     {
2142       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2143                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2144           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2145         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2146     }
2147   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2148            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2149                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2150                /* In case of versioning, check if the maximum number of
2151                   iterations is greater than th.  If they are identical,
2152                   the epilogue is unnecessary.  */
2153                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2154                    || (unsigned HOST_WIDE_INT) max_niter > th)))
2155     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2156
2157   /* If an epilogue loop is required make sure we can create one.  */
2158   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2159       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2160     {
2161       if (dump_enabled_p ())
2162         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2163       if (!vect_can_advance_ivs_p (loop_vinfo)
2164           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2165                                            single_exit (LOOP_VINFO_LOOP
2166                                                          (loop_vinfo))))
2167         {
2168           if (dump_enabled_p ())
2169             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2170                              "not vectorized: can't create required "
2171                              "epilog loop\n");
2172           goto again;
2173         }
2174     }
2175
2176   /* During peeling, we need to check if number of loop iterations is
2177      enough for both peeled prolog loop and vector loop.  This check
2178      can be merged along with threshold check of loop versioning, so
2179      increase threshold for this case if necessary.  */
2180   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2181     {
2182       poly_uint64 niters_th;
2183
2184       /* Niters for peeled prolog loop.  */
2185       if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2186         {
2187           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2188           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2189
2190           niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2191         }
2192       else
2193         niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2194
2195       /* Niters for at least one iteration of vectorized loop.  */
2196       niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2197       /* One additional iteration because of peeling for gap.  */
2198       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2199         niters_th += 1;
2200       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2201     }
2202
2203   gcc_assert (vectorization_factor
2204               == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2205
2206   /* Ok to vectorize!  */
2207   return true;
2208
2209 again:
2210   /* Try again with SLP forced off but if we didn't do any SLP there is
2211      no point in re-trying.  */
2212   if (!slp)
2213     return false;
2214
2215   /* If there are reduction chains re-trying will fail anyway.  */
2216   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2217     return false;
2218
2219   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2220      via interleaving or lane instructions.  */
2221   slp_instance instance;
2222   slp_tree node;
2223   unsigned i, j;
2224   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2225     {
2226       stmt_vec_info vinfo;
2227       vinfo = vinfo_for_stmt
2228           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2229       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2230         continue;
2231       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2232       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2233       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2234       if (! vect_store_lanes_supported (vectype, size)
2235           && ! vect_grouped_store_supported (vectype, size))
2236         return false;
2237       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2238         {
2239           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2240           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2241           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2242           size = STMT_VINFO_GROUP_SIZE (vinfo);
2243           vectype = STMT_VINFO_VECTYPE (vinfo);
2244           if (! vect_load_lanes_supported (vectype, size)
2245               && ! vect_grouped_load_supported (vectype, single_element_p,
2246                                                 size))
2247             return false;
2248         }
2249     }
2250
2251   if (dump_enabled_p ())
2252     dump_printf_loc (MSG_NOTE, vect_location,
2253                      "re-trying with SLP disabled\n");
2254
2255   /* Roll back state appropriately.  No SLP this time.  */
2256   slp = false;
2257   /* Restore vectorization factor as it were without SLP.  */
2258   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2259   /* Free the SLP instances.  */
2260   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2261     vect_free_slp_instance (instance);
2262   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2263   /* Reset SLP type to loop_vect on all stmts.  */
2264   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2265     {
2266       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2267       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2268            !gsi_end_p (si); gsi_next (&si))
2269         {
2270           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2271           STMT_SLP_TYPE (stmt_info) = loop_vect;
2272         }
2273       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2274            !gsi_end_p (si); gsi_next (&si))
2275         {
2276           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2277           STMT_SLP_TYPE (stmt_info) = loop_vect;
2278           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2279             {
2280               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2281               STMT_SLP_TYPE (stmt_info) = loop_vect;
2282               for (gimple_stmt_iterator pi
2283                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2284                    !gsi_end_p (pi); gsi_next (&pi))
2285                 {
2286                   gimple *pstmt = gsi_stmt (pi);
2287                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2288                 }
2289             }
2290         }
2291     }
2292   /* Free optimized alias test DDRS.  */
2293   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2294   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2295   /* Reset target cost data.  */
2296   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2297   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2298     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2299   /* Reset assorted flags.  */
2300   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2301   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2302   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2303   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2304
2305   goto start_over;
2306 }
2307
2308 /* Function vect_analyze_loop.
2309
2310    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2311    for it.  The different analyses will record information in the
2312    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2313    be vectorized.  */
2314 loop_vec_info
2315 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2316 {
2317   loop_vec_info loop_vinfo;
2318   unsigned int vector_sizes;
2319
2320   /* Autodetect first vector size we try.  */
2321   current_vector_size = 0;
2322   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2323
2324   if (dump_enabled_p ())
2325     dump_printf_loc (MSG_NOTE, vect_location,
2326                      "===== analyze_loop_nest =====\n");
2327
2328   if (loop_outer (loop)
2329       && loop_vec_info_for_loop (loop_outer (loop))
2330       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2331     {
2332       if (dump_enabled_p ())
2333         dump_printf_loc (MSG_NOTE, vect_location,
2334                          "outer-loop already vectorized.\n");
2335       return NULL;
2336     }
2337
2338   while (1)
2339     {
2340       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2341       loop_vinfo = vect_analyze_loop_form (loop);
2342       if (!loop_vinfo)
2343         {
2344           if (dump_enabled_p ())
2345             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346                              "bad loop form.\n");
2347           return NULL;
2348         }
2349
2350       bool fatal = false;
2351
2352       if (orig_loop_vinfo)
2353         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2354
2355       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2356         {
2357           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2358
2359           return loop_vinfo;
2360         }
2361
2362       delete loop_vinfo;
2363
2364       vector_sizes &= ~current_vector_size;
2365       if (fatal
2366           || vector_sizes == 0
2367           || current_vector_size == 0)
2368         return NULL;
2369
2370       /* Try the next biggest vector size.  */
2371       current_vector_size = 1 << floor_log2 (vector_sizes);
2372       if (dump_enabled_p ())
2373         dump_printf_loc (MSG_NOTE, vect_location,
2374                          "***** Re-trying analysis with "
2375                          "vector size %d\n", current_vector_size);
2376     }
2377 }
2378
2379
2380 /* Function reduction_fn_for_scalar_code
2381
2382    Input:
2383    CODE - tree_code of a reduction operations.
2384
2385    Output:
2386    REDUC_FN - the corresponding internal function to be used to reduce the
2387       vector of partial results into a single scalar result, or IFN_LAST
2388       if the operation is a supported reduction operation, but does not have
2389       such an internal function.
2390
2391    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2392
2393 static bool
2394 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2395 {
2396   switch (code)
2397     {
2398       case MAX_EXPR:
2399         *reduc_fn = IFN_REDUC_MAX;
2400         return true;
2401
2402       case MIN_EXPR:
2403         *reduc_fn = IFN_REDUC_MIN;
2404         return true;
2405
2406       case PLUS_EXPR:
2407         *reduc_fn = IFN_REDUC_PLUS;
2408         return true;
2409
2410       case MULT_EXPR:
2411       case MINUS_EXPR:
2412       case BIT_IOR_EXPR:
2413       case BIT_XOR_EXPR:
2414       case BIT_AND_EXPR:
2415         *reduc_fn = IFN_LAST;
2416         return true;
2417
2418       default:
2419        return false;
2420     }
2421 }
2422
2423
2424 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2425    STMT is printed with a message MSG. */
2426
2427 static void
2428 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2429 {
2430   dump_printf_loc (msg_type, vect_location, "%s", msg);
2431   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2432 }
2433
2434
2435 /* Detect SLP reduction of the form:
2436
2437    #a1 = phi <a5, a0>
2438    a2 = operation (a1)
2439    a3 = operation (a2)
2440    a4 = operation (a3)
2441    a5 = operation (a4)
2442
2443    #a = phi <a5>
2444
2445    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2446    FIRST_STMT is the first reduction stmt in the chain
2447    (a2 = operation (a1)).
2448
2449    Return TRUE if a reduction chain was detected.  */
2450
2451 static bool
2452 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2453                        gimple *first_stmt)
2454 {
2455   struct loop *loop = (gimple_bb (phi))->loop_father;
2456   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2457   enum tree_code code;
2458   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2459   stmt_vec_info use_stmt_info, current_stmt_info;
2460   tree lhs;
2461   imm_use_iterator imm_iter;
2462   use_operand_p use_p;
2463   int nloop_uses, size = 0, n_out_of_loop_uses;
2464   bool found = false;
2465
2466   if (loop != vect_loop)
2467     return false;
2468
2469   lhs = PHI_RESULT (phi);
2470   code = gimple_assign_rhs_code (first_stmt);
2471   while (1)
2472     {
2473       nloop_uses = 0;
2474       n_out_of_loop_uses = 0;
2475       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2476         {
2477           gimple *use_stmt = USE_STMT (use_p);
2478           if (is_gimple_debug (use_stmt))
2479             continue;
2480
2481           /* Check if we got back to the reduction phi.  */
2482           if (use_stmt == phi)
2483             {
2484               loop_use_stmt = use_stmt;
2485               found = true;
2486               break;
2487             }
2488
2489           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2490             {
2491               loop_use_stmt = use_stmt;
2492               nloop_uses++;
2493             }
2494            else
2495              n_out_of_loop_uses++;
2496
2497            /* There are can be either a single use in the loop or two uses in
2498               phi nodes.  */
2499            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2500              return false;
2501         }
2502
2503       if (found)
2504         break;
2505
2506       /* We reached a statement with no loop uses.  */
2507       if (nloop_uses == 0)
2508         return false;
2509
2510       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2511       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2512         return false;
2513
2514       if (!is_gimple_assign (loop_use_stmt)
2515           || code != gimple_assign_rhs_code (loop_use_stmt)
2516           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2517         return false;
2518
2519       /* Insert USE_STMT into reduction chain.  */
2520       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2521       if (current_stmt)
2522         {
2523           current_stmt_info = vinfo_for_stmt (current_stmt);
2524           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2525           GROUP_FIRST_ELEMENT (use_stmt_info)
2526             = GROUP_FIRST_ELEMENT (current_stmt_info);
2527         }
2528       else
2529         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2530
2531       lhs = gimple_assign_lhs (loop_use_stmt);
2532       current_stmt = loop_use_stmt;
2533       size++;
2534    }
2535
2536   if (!found || loop_use_stmt != phi || size < 2)
2537     return false;
2538
2539   /* Swap the operands, if needed, to make the reduction operand be the second
2540      operand.  */
2541   lhs = PHI_RESULT (phi);
2542   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2543   while (next_stmt)
2544     {
2545       if (gimple_assign_rhs2 (next_stmt) == lhs)
2546         {
2547           tree op = gimple_assign_rhs1 (next_stmt);
2548           gimple *def_stmt = NULL;
2549
2550           if (TREE_CODE (op) == SSA_NAME)
2551             def_stmt = SSA_NAME_DEF_STMT (op);
2552
2553           /* Check that the other def is either defined in the loop
2554              ("vect_internal_def"), or it's an induction (defined by a
2555              loop-header phi-node).  */
2556           if (def_stmt
2557               && gimple_bb (def_stmt)
2558               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2559               && (is_gimple_assign (def_stmt)
2560                   || is_gimple_call (def_stmt)
2561                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2562                            == vect_induction_def
2563                   || (gimple_code (def_stmt) == GIMPLE_PHI
2564                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2565                                   == vect_internal_def
2566                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2567             {
2568               lhs = gimple_assign_lhs (next_stmt);
2569               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2570               continue;
2571             }
2572
2573           return false;
2574         }
2575       else
2576         {
2577           tree op = gimple_assign_rhs2 (next_stmt);
2578           gimple *def_stmt = NULL;
2579
2580           if (TREE_CODE (op) == SSA_NAME)
2581             def_stmt = SSA_NAME_DEF_STMT (op);
2582
2583           /* Check that the other def is either defined in the loop
2584             ("vect_internal_def"), or it's an induction (defined by a
2585             loop-header phi-node).  */
2586           if (def_stmt
2587               && gimple_bb (def_stmt)
2588               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2589               && (is_gimple_assign (def_stmt)
2590                   || is_gimple_call (def_stmt)
2591                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2592                               == vect_induction_def
2593                   || (gimple_code (def_stmt) == GIMPLE_PHI
2594                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2595                                   == vect_internal_def
2596                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2597             {
2598               if (dump_enabled_p ())
2599                 {
2600                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2601                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2602                 }
2603
2604               swap_ssa_operands (next_stmt,
2605                                  gimple_assign_rhs1_ptr (next_stmt),
2606                                  gimple_assign_rhs2_ptr (next_stmt));
2607               update_stmt (next_stmt);
2608
2609               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2610                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2611             }
2612           else
2613             return false;
2614         }
2615
2616       lhs = gimple_assign_lhs (next_stmt);
2617       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2618     }
2619
2620   /* Save the chain for further analysis in SLP detection.  */
2621   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2622   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2623   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2624
2625   return true;
2626 }
2627
2628
2629 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2630    reduction operation CODE has a handled computation expression.  */
2631
2632 bool
2633 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2634                       enum tree_code code)
2635 {
2636   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2637   auto_bitmap visited;
2638   tree lookfor = PHI_RESULT (phi);
2639   ssa_op_iter curri;
2640   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2641   while (USE_FROM_PTR (curr) != loop_arg)
2642     curr = op_iter_next_use (&curri);
2643   curri.i = curri.numops;
2644   do
2645     {
2646       path.safe_push (std::make_pair (curri, curr));
2647       tree use = USE_FROM_PTR (curr);
2648       if (use == lookfor)
2649         break;
2650       gimple *def = SSA_NAME_DEF_STMT (use);
2651       if (gimple_nop_p (def)
2652           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2653         {
2654 pop:
2655           do
2656             {
2657               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2658               curri = x.first;
2659               curr = x.second;
2660               do
2661                 curr = op_iter_next_use (&curri);
2662               /* Skip already visited or non-SSA operands (from iterating
2663                  over PHI args).  */
2664               while (curr != NULL_USE_OPERAND_P
2665                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2666                          || ! bitmap_set_bit (visited,
2667                                               SSA_NAME_VERSION
2668                                                 (USE_FROM_PTR (curr)))));
2669             }
2670           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2671           if (curr == NULL_USE_OPERAND_P)
2672             break;
2673         }
2674       else
2675         {
2676           if (gimple_code (def) == GIMPLE_PHI)
2677             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2678           else
2679             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2680           while (curr != NULL_USE_OPERAND_P
2681                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2682                      || ! bitmap_set_bit (visited,
2683                                           SSA_NAME_VERSION
2684                                             (USE_FROM_PTR (curr)))))
2685             curr = op_iter_next_use (&curri);
2686           if (curr == NULL_USE_OPERAND_P)
2687             goto pop;
2688         }
2689     }
2690   while (1);
2691   if (dump_file && (dump_flags & TDF_DETAILS))
2692     {
2693       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2694       unsigned i;
2695       std::pair<ssa_op_iter, use_operand_p> *x;
2696       FOR_EACH_VEC_ELT (path, i, x)
2697         {
2698           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2699           dump_printf (MSG_NOTE, " ");
2700         }
2701       dump_printf (MSG_NOTE, "\n");
2702     }
2703
2704   /* Check whether the reduction path detected is valid.  */
2705   bool fail = path.length () == 0;
2706   bool neg = false;
2707   for (unsigned i = 1; i < path.length (); ++i)
2708     {
2709       gimple *use_stmt = USE_STMT (path[i].second);
2710       tree op = USE_FROM_PTR (path[i].second);
2711       if (! has_single_use (op)
2712           || ! is_gimple_assign (use_stmt))
2713         {
2714           fail = true;
2715           break;
2716         }
2717       if (gimple_assign_rhs_code (use_stmt) != code)
2718         {
2719           if (code == PLUS_EXPR
2720               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2721             {
2722               /* Track whether we negate the reduction value each iteration.  */
2723               if (gimple_assign_rhs2 (use_stmt) == op)
2724                 neg = ! neg;
2725             }
2726           else
2727             {
2728               fail = true;
2729               break;
2730             }
2731         }
2732     }
2733   return ! fail && ! neg;
2734 }
2735
2736
2737 /* Function vect_is_simple_reduction
2738
2739    (1) Detect a cross-iteration def-use cycle that represents a simple
2740    reduction computation.  We look for the following pattern:
2741
2742    loop_header:
2743      a1 = phi < a0, a2 >
2744      a3 = ...
2745      a2 = operation (a3, a1)
2746
2747    or
2748
2749    a3 = ...
2750    loop_header:
2751      a1 = phi < a0, a2 >
2752      a2 = operation (a3, a1)
2753
2754    such that:
2755    1. operation is commutative and associative and it is safe to
2756       change the order of the computation
2757    2. no uses for a2 in the loop (a2 is used out of the loop)
2758    3. no uses of a1 in the loop besides the reduction operation
2759    4. no uses of a1 outside the loop.
2760
2761    Conditions 1,4 are tested here.
2762    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2763
2764    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2765    nested cycles.
2766
2767    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2768    reductions:
2769
2770      a1 = phi < a0, a2 >
2771      inner loop (def of a3)
2772      a2 = phi < a3 >
2773
2774    (4) Detect condition expressions, ie:
2775      for (int i = 0; i < N; i++)
2776        if (a[i] < val)
2777         ret_val = a[i];
2778
2779 */
2780
2781 static gimple *
2782 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2783                           bool *double_reduc,
2784                           bool need_wrapping_integral_overflow,
2785                           enum vect_reduction_type *v_reduc_type)
2786 {
2787   struct loop *loop = (gimple_bb (phi))->loop_father;
2788   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2789   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2790   enum tree_code orig_code, code;
2791   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2792   tree type;
2793   int nloop_uses;
2794   tree name;
2795   imm_use_iterator imm_iter;
2796   use_operand_p use_p;
2797   bool phi_def;
2798
2799   *double_reduc = false;
2800   *v_reduc_type = TREE_CODE_REDUCTION;
2801
2802   tree phi_name = PHI_RESULT (phi);
2803   /* ???  If there are no uses of the PHI result the inner loop reduction
2804      won't be detected as possibly double-reduction by vectorizable_reduction
2805      because that tries to walk the PHI arg from the preheader edge which
2806      can be constant.  See PR60382.  */
2807   if (has_zero_uses (phi_name))
2808     return NULL;
2809   nloop_uses = 0;
2810   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2811     {
2812       gimple *use_stmt = USE_STMT (use_p);
2813       if (is_gimple_debug (use_stmt))
2814         continue;
2815
2816       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2817         {
2818           if (dump_enabled_p ())
2819             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2820                              "intermediate value used outside loop.\n");
2821
2822           return NULL;
2823         }
2824
2825       nloop_uses++;
2826       if (nloop_uses > 1)
2827         {
2828           if (dump_enabled_p ())
2829             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2830                              "reduction value used in loop.\n");
2831           return NULL;
2832         }
2833
2834       phi_use_stmt = use_stmt;
2835     }
2836
2837   edge latch_e = loop_latch_edge (loop);
2838   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2839   if (TREE_CODE (loop_arg) != SSA_NAME)
2840     {
2841       if (dump_enabled_p ())
2842         {
2843           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2844                            "reduction: not ssa_name: ");
2845           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2846           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2847         }
2848       return NULL;
2849     }
2850
2851   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2852   if (is_gimple_assign (def_stmt))
2853     {
2854       name = gimple_assign_lhs (def_stmt);
2855       phi_def = false;
2856     }
2857   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2858     {
2859       name = PHI_RESULT (def_stmt);
2860       phi_def = true;
2861     }
2862   else
2863     {
2864       if (dump_enabled_p ())
2865         {
2866           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2867                            "reduction: unhandled reduction operation: ");
2868           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2869         }
2870       return NULL;
2871     }
2872
2873   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2874     return NULL;
2875
2876   nloop_uses = 0;
2877   auto_vec<gphi *, 3> lcphis;
2878   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2879     {
2880       gimple *use_stmt = USE_STMT (use_p);
2881       if (is_gimple_debug (use_stmt))
2882         continue;
2883       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2884         nloop_uses++;
2885       else
2886         /* We can have more than one loop-closed PHI.  */
2887         lcphis.safe_push (as_a <gphi *> (use_stmt));
2888       if (nloop_uses > 1)
2889         {
2890           if (dump_enabled_p ())
2891             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2892                              "reduction used in loop.\n");
2893           return NULL;
2894         }
2895     }
2896
2897   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2898      defined in the inner loop.  */
2899   if (phi_def)
2900     {
2901       op1 = PHI_ARG_DEF (def_stmt, 0);
2902
2903       if (gimple_phi_num_args (def_stmt) != 1
2904           || TREE_CODE (op1) != SSA_NAME)
2905         {
2906           if (dump_enabled_p ())
2907             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2908                              "unsupported phi node definition.\n");
2909
2910           return NULL;
2911         }
2912
2913       def1 = SSA_NAME_DEF_STMT (op1);
2914       if (gimple_bb (def1)
2915           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2916           && loop->inner
2917           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2918           && is_gimple_assign (def1)
2919           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2920         {
2921           if (dump_enabled_p ())
2922             report_vect_op (MSG_NOTE, def_stmt,
2923                             "detected double reduction: ");
2924
2925           *double_reduc = true;
2926           return def_stmt;
2927         }
2928
2929       return NULL;
2930     }
2931
2932   /* If we are vectorizing an inner reduction we are executing that
2933      in the original order only in case we are not dealing with a
2934      double reduction.  */
2935   bool check_reduction = true;
2936   if (flow_loop_nested_p (vect_loop, loop))
2937     {
2938       gphi *lcphi;
2939       unsigned i;
2940       check_reduction = false;
2941       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2942         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2943           {
2944             gimple *use_stmt = USE_STMT (use_p);
2945             if (is_gimple_debug (use_stmt))
2946               continue;
2947             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2948               check_reduction = true;
2949           }
2950     }
2951
2952   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2953   code = orig_code = gimple_assign_rhs_code (def_stmt);
2954
2955   /* We can handle "res -= x[i]", which is non-associative by
2956      simply rewriting this into "res += -x[i]".  Avoid changing
2957      gimple instruction for the first simple tests and only do this
2958      if we're allowed to change code at all.  */
2959   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2960     code = PLUS_EXPR;
2961
2962   if (code == COND_EXPR)
2963     {
2964       if (! nested_in_vect_loop)
2965         *v_reduc_type = COND_REDUCTION;
2966
2967       op3 = gimple_assign_rhs1 (def_stmt);
2968       if (COMPARISON_CLASS_P (op3))
2969         {
2970           op4 = TREE_OPERAND (op3, 1);
2971           op3 = TREE_OPERAND (op3, 0);
2972         }
2973       if (op3 == phi_name || op4 == phi_name)
2974         {
2975           if (dump_enabled_p ())
2976             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2977                             "reduction: condition depends on previous"
2978                             " iteration: ");
2979           return NULL;
2980         }
2981
2982       op1 = gimple_assign_rhs2 (def_stmt);
2983       op2 = gimple_assign_rhs3 (def_stmt);
2984     }
2985   else if (!commutative_tree_code (code) || !associative_tree_code (code))
2986     {
2987       if (dump_enabled_p ())
2988         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2989                         "reduction: not commutative/associative: ");
2990       return NULL;
2991     }
2992   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2993     {
2994       op1 = gimple_assign_rhs1 (def_stmt);
2995       op2 = gimple_assign_rhs2 (def_stmt);
2996     }
2997   else
2998     {
2999       if (dump_enabled_p ())
3000         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3001                         "reduction: not handled operation: ");
3002       return NULL;
3003     }
3004
3005   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3006     {
3007       if (dump_enabled_p ())
3008         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3009                         "reduction: both uses not ssa_names: ");
3010
3011       return NULL;
3012     }
3013
3014   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3015   if ((TREE_CODE (op1) == SSA_NAME
3016        && !types_compatible_p (type,TREE_TYPE (op1)))
3017       || (TREE_CODE (op2) == SSA_NAME
3018           && !types_compatible_p (type, TREE_TYPE (op2)))
3019       || (op3 && TREE_CODE (op3) == SSA_NAME
3020           && !types_compatible_p (type, TREE_TYPE (op3)))
3021       || (op4 && TREE_CODE (op4) == SSA_NAME
3022           && !types_compatible_p (type, TREE_TYPE (op4))))
3023     {
3024       if (dump_enabled_p ())
3025         {
3026           dump_printf_loc (MSG_NOTE, vect_location,
3027                            "reduction: multiple types: operation type: ");
3028           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3029           dump_printf (MSG_NOTE, ", operands types: ");
3030           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3031                              TREE_TYPE (op1));
3032           dump_printf (MSG_NOTE, ",");
3033           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3034                              TREE_TYPE (op2));
3035           if (op3)
3036             {
3037               dump_printf (MSG_NOTE, ",");
3038               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3039                                  TREE_TYPE (op3));
3040             }
3041
3042           if (op4)
3043             {
3044               dump_printf (MSG_NOTE, ",");
3045               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3046                                  TREE_TYPE (op4));
3047             }
3048           dump_printf (MSG_NOTE, "\n");
3049         }
3050
3051       return NULL;
3052     }
3053
3054   /* Check that it's ok to change the order of the computation.
3055      Generally, when vectorizing a reduction we change the order of the
3056      computation.  This may change the behavior of the program in some
3057      cases, so we need to check that this is ok.  One exception is when
3058      vectorizing an outer-loop: the inner-loop is executed sequentially,
3059      and therefore vectorizing reductions in the inner-loop during
3060      outer-loop vectorization is safe.  */
3061
3062   if (*v_reduc_type != COND_REDUCTION
3063       && check_reduction)
3064     {
3065       /* CHECKME: check for !flag_finite_math_only too?  */
3066       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
3067         {
3068           /* Changing the order of operations changes the semantics.  */
3069           if (dump_enabled_p ())
3070             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3071                         "reduction: unsafe fp math optimization: ");
3072           return NULL;
3073         }
3074       else if (INTEGRAL_TYPE_P (type))
3075         {
3076           if (!operation_no_trapping_overflow (type, code))
3077             {
3078               /* Changing the order of operations changes the semantics.  */
3079               if (dump_enabled_p ())
3080                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3081                                 "reduction: unsafe int math optimization"
3082                                 " (overflow traps): ");
3083               return NULL;
3084             }
3085           if (need_wrapping_integral_overflow
3086               && !TYPE_OVERFLOW_WRAPS (type)
3087               && operation_can_overflow (code))
3088             {
3089               /* Changing the order of operations changes the semantics.  */
3090               if (dump_enabled_p ())
3091                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3092                                 "reduction: unsafe int math optimization"
3093                                 " (overflow doesn't wrap): ");
3094               return NULL;
3095             }
3096         }
3097       else if (SAT_FIXED_POINT_TYPE_P (type))
3098         {
3099           /* Changing the order of operations changes the semantics.  */
3100           if (dump_enabled_p ())
3101           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3102                           "reduction: unsafe fixed-point math optimization: ");
3103           return NULL;
3104         }
3105     }
3106
3107   /* Reduction is safe. We're dealing with one of the following:
3108      1) integer arithmetic and no trapv
3109      2) floating point arithmetic, and special flags permit this optimization
3110      3) nested cycle (i.e., outer loop vectorization).  */
3111   if (TREE_CODE (op1) == SSA_NAME)
3112     def1 = SSA_NAME_DEF_STMT (op1);
3113
3114   if (TREE_CODE (op2) == SSA_NAME)
3115     def2 = SSA_NAME_DEF_STMT (op2);
3116
3117   if (code != COND_EXPR
3118       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3119     {
3120       if (dump_enabled_p ())
3121         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3122       return NULL;
3123     }
3124
3125   /* Check that one def is the reduction def, defined by PHI,
3126      the other def is either defined in the loop ("vect_internal_def"),
3127      or it's an induction (defined by a loop-header phi-node).  */
3128
3129   if (def2 && def2 == phi
3130       && (code == COND_EXPR
3131           || !def1 || gimple_nop_p (def1)
3132           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3133           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3134               && (is_gimple_assign (def1)
3135                   || is_gimple_call (def1)
3136                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3137                       == vect_induction_def
3138                   || (gimple_code (def1) == GIMPLE_PHI
3139                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3140                           == vect_internal_def
3141                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3142     {
3143       if (dump_enabled_p ())
3144         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3145       return def_stmt;
3146     }
3147
3148   if (def1 && def1 == phi
3149       && (code == COND_EXPR
3150           || !def2 || gimple_nop_p (def2)
3151           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3152           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3153               && (is_gimple_assign (def2)
3154                   || is_gimple_call (def2)
3155                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3156                        == vect_induction_def
3157                   || (gimple_code (def2) == GIMPLE_PHI
3158                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3159                            == vect_internal_def
3160                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3161     {
3162       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3163         {
3164           /* Check if we can swap operands (just for simplicity - so that
3165              the rest of the code can assume that the reduction variable
3166              is always the last (second) argument).  */
3167           if (code == COND_EXPR)
3168             {
3169               /* Swap cond_expr by inverting the condition.  */
3170               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3171               enum tree_code invert_code = ERROR_MARK;
3172               enum tree_code cond_code = TREE_CODE (cond_expr);
3173
3174               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3175                 {
3176                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3177                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3178                 }
3179               if (invert_code != ERROR_MARK)
3180                 {
3181                   TREE_SET_CODE (cond_expr, invert_code);
3182                   swap_ssa_operands (def_stmt,
3183                                      gimple_assign_rhs2_ptr (def_stmt),
3184                                      gimple_assign_rhs3_ptr (def_stmt));
3185                 }
3186               else
3187                 {
3188                   if (dump_enabled_p ())
3189                     report_vect_op (MSG_NOTE, def_stmt,
3190                                     "detected reduction: cannot swap operands "
3191                                     "for cond_expr");
3192                   return NULL;
3193                 }
3194             }
3195           else
3196             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3197                                gimple_assign_rhs2_ptr (def_stmt));
3198
3199           if (dump_enabled_p ())
3200             report_vect_op (MSG_NOTE, def_stmt,
3201                             "detected reduction: need to swap operands: ");
3202
3203           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3204             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3205         }
3206       else
3207         {
3208           if (dump_enabled_p ())
3209             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3210         }
3211
3212       return def_stmt;
3213     }
3214
3215   /* Try to find SLP reduction chain.  */
3216   if (! nested_in_vect_loop
3217       && code != COND_EXPR
3218       && orig_code != MINUS_EXPR
3219       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3220     {
3221       if (dump_enabled_p ())
3222         report_vect_op (MSG_NOTE, def_stmt,
3223                         "reduction: detected reduction chain: ");
3224
3225       return def_stmt;
3226     }
3227
3228   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3229   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3230   while (first)
3231     {
3232       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3233       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3234       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3235       first = next;
3236     }
3237
3238   /* Look for the expression computing loop_arg from loop PHI result.  */
3239   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3240                             code))
3241     return def_stmt;
3242
3243   if (dump_enabled_p ())
3244     {
3245       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3246                       "reduction: unknown pattern: ");
3247     }
3248
3249   return NULL;
3250 }
3251
3252 /* Wrapper around vect_is_simple_reduction, which will modify code
3253    in-place if it enables detection of more reductions.  Arguments
3254    as there.  */
3255
3256 gimple *
3257 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3258                              bool *double_reduc,
3259                              bool need_wrapping_integral_overflow)
3260 {
3261   enum vect_reduction_type v_reduc_type;
3262   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3263                                           need_wrapping_integral_overflow,
3264                                           &v_reduc_type);
3265   if (def)
3266     {
3267       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3268       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3269       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3270       reduc_def_info = vinfo_for_stmt (def);
3271       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3272     }
3273   return def;
3274 }
3275
3276 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3277 int
3278 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3279                              int *peel_iters_epilogue,
3280                              stmt_vector_for_cost *scalar_cost_vec,
3281                              stmt_vector_for_cost *prologue_cost_vec,
3282                              stmt_vector_for_cost *epilogue_cost_vec)
3283 {
3284   int retval = 0;
3285   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3286
3287   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3288     {
3289       *peel_iters_epilogue = vf/2;
3290       if (dump_enabled_p ())
3291         dump_printf_loc (MSG_NOTE, vect_location,
3292                          "cost model: epilogue peel iters set to vf/2 "
3293                          "because loop iterations are unknown .\n");
3294
3295       /* If peeled iterations are known but number of scalar loop
3296          iterations are unknown, count a taken branch per peeled loop.  */
3297       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3298                                  NULL, 0, vect_prologue);
3299       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3300                                  NULL, 0, vect_epilogue);
3301     }
3302   else
3303     {
3304       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3305       peel_iters_prologue = niters < peel_iters_prologue ?
3306                             niters : peel_iters_prologue;
3307       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3308       /* If we need to peel for gaps, but no peeling is required, we have to
3309          peel VF iterations.  */
3310       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3311         *peel_iters_epilogue = vf;
3312     }
3313
3314   stmt_info_for_cost *si;
3315   int j;
3316   if (peel_iters_prologue)
3317     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3318         {
3319           stmt_vec_info stmt_info
3320             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3321           retval += record_stmt_cost (prologue_cost_vec,
3322                                       si->count * peel_iters_prologue,
3323                                       si->kind, stmt_info, si->misalign,
3324                                       vect_prologue);
3325         }
3326   if (*peel_iters_epilogue)
3327     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3328         {
3329           stmt_vec_info stmt_info
3330             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3331           retval += record_stmt_cost (epilogue_cost_vec,
3332                                       si->count * *peel_iters_epilogue,
3333                                       si->kind, stmt_info, si->misalign,
3334                                       vect_epilogue);
3335         }
3336
3337   return retval;
3338 }
3339
3340 /* Function vect_estimate_min_profitable_iters
3341
3342    Return the number of iterations required for the vector version of the
3343    loop to be profitable relative to the cost of the scalar version of the
3344    loop.
3345
3346    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3347    of iterations for vectorization.  -1 value means loop vectorization
3348    is not profitable.  This returned value may be used for dynamic
3349    profitability check.
3350
3351    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3352    for static check against estimated number of iterations.  */
3353
3354 static void
3355 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3356                                     int *ret_min_profitable_niters,
3357                                     int *ret_min_profitable_estimate)
3358 {
3359   int min_profitable_iters;
3360   int min_profitable_estimate;
3361   int peel_iters_prologue;
3362   int peel_iters_epilogue;
3363   unsigned vec_inside_cost = 0;
3364   int vec_outside_cost = 0;
3365   unsigned vec_prologue_cost = 0;
3366   unsigned vec_epilogue_cost = 0;
3367   int scalar_single_iter_cost = 0;
3368   int scalar_outside_cost = 0;
3369   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3370   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3371   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3372
3373   /* Cost model disabled.  */
3374   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3375     {
3376       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3377       *ret_min_profitable_niters = 0;
3378       *ret_min_profitable_estimate = 0;
3379       return;
3380     }
3381
3382   /* Requires loop versioning tests to handle misalignment.  */
3383   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3384     {
3385       /*  FIXME: Make cost depend on complexity of individual check.  */
3386       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3387       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3388                             vect_prologue);
3389       dump_printf (MSG_NOTE,
3390                    "cost model: Adding cost of checks for loop "
3391                    "versioning to treat misalignment.\n");
3392     }
3393
3394   /* Requires loop versioning with alias checks.  */
3395   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3396     {
3397       /*  FIXME: Make cost depend on complexity of individual check.  */
3398       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3399       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3400                             vect_prologue);
3401       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3402       if (len)
3403         /* Count LEN - 1 ANDs and LEN comparisons.  */
3404         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3405                               NULL, 0, vect_prologue);
3406       dump_printf (MSG_NOTE,
3407                    "cost model: Adding cost of checks for loop "
3408                    "versioning aliasing.\n");
3409     }
3410
3411   /* Requires loop versioning with niter checks.  */
3412   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3413     {
3414       /*  FIXME: Make cost depend on complexity of individual check.  */
3415       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3416                             vect_prologue);
3417       dump_printf (MSG_NOTE,
3418                    "cost model: Adding cost of checks for loop "
3419                    "versioning niters.\n");
3420     }
3421
3422   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3423     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3424                           vect_prologue);
3425
3426   /* Count statements in scalar loop.  Using this as scalar cost for a single
3427      iteration for now.
3428
3429      TODO: Add outer loop support.
3430
3431      TODO: Consider assigning different costs to different scalar
3432      statements.  */
3433
3434   scalar_single_iter_cost
3435     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3436
3437   /* Add additional cost for the peeled instructions in prologue and epilogue
3438      loop.
3439
3440      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3441      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3442
3443      TODO: Build an expression that represents peel_iters for prologue and
3444      epilogue to be used in a run-time test.  */
3445
3446   if (npeel  < 0)
3447     {
3448       peel_iters_prologue = vf/2;
3449       dump_printf (MSG_NOTE, "cost model: "
3450                    "prologue peel iters set to vf/2.\n");
3451
3452       /* If peeling for alignment is unknown, loop bound of main loop becomes
3453          unknown.  */
3454       peel_iters_epilogue = vf/2;
3455       dump_printf (MSG_NOTE, "cost model: "
3456                    "epilogue peel iters set to vf/2 because "
3457                    "peeling for alignment is unknown.\n");
3458
3459       /* If peeled iterations are unknown, count a taken branch and a not taken
3460          branch per peeled loop. Even if scalar loop iterations are known,
3461          vector iterations are not known since peeled prologue iterations are
3462          not known. Hence guards remain the same.  */
3463       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3464                             NULL, 0, vect_prologue);
3465       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3466                             NULL, 0, vect_prologue);
3467       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3468                             NULL, 0, vect_epilogue);
3469       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3470                             NULL, 0, vect_epilogue);
3471       stmt_info_for_cost *si;
3472       int j;
3473       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3474         {
3475           struct _stmt_vec_info *stmt_info
3476             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3477           (void) add_stmt_cost (target_cost_data,
3478                                 si->count * peel_iters_prologue,
3479                                 si->kind, stmt_info, si->misalign,
3480                                 vect_prologue);
3481           (void) add_stmt_cost (target_cost_data,
3482                                 si->count * peel_iters_epilogue,
3483                                 si->kind, stmt_info, si->misalign,
3484                                 vect_epilogue);
3485         }
3486     }
3487   else
3488     {
3489       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3490       stmt_info_for_cost *si;
3491       int j;
3492       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3493
3494       prologue_cost_vec.create (2);
3495       epilogue_cost_vec.create (2);
3496       peel_iters_prologue = npeel;
3497
3498       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3499                                           &peel_iters_epilogue,
3500                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3501                                             (loop_vinfo),
3502                                           &prologue_cost_vec,
3503                                           &epilogue_cost_vec);
3504
3505       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3506         {
3507           struct _stmt_vec_info *stmt_info
3508             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3509           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3510                                 si->misalign, vect_prologue);
3511         }
3512
3513       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3514         {
3515           struct _stmt_vec_info *stmt_info
3516             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3517           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3518                                 si->misalign, vect_epilogue);
3519         }
3520
3521       prologue_cost_vec.release ();
3522       epilogue_cost_vec.release ();
3523     }
3524
3525   /* FORNOW: The scalar outside cost is incremented in one of the
3526      following ways:
3527
3528      1. The vectorizer checks for alignment and aliasing and generates
3529      a condition that allows dynamic vectorization.  A cost model
3530      check is ANDED with the versioning condition.  Hence scalar code
3531      path now has the added cost of the versioning check.
3532
3533        if (cost > th & versioning_check)
3534          jmp to vector code
3535
3536      Hence run-time scalar is incremented by not-taken branch cost.
3537
3538      2. The vectorizer then checks if a prologue is required.  If the
3539      cost model check was not done before during versioning, it has to
3540      be done before the prologue check.
3541
3542        if (cost <= th)
3543          prologue = scalar_iters
3544        if (prologue == 0)
3545          jmp to vector code
3546        else
3547          execute prologue
3548        if (prologue == num_iters)
3549          go to exit
3550
3551      Hence the run-time scalar cost is incremented by a taken branch,
3552      plus a not-taken branch, plus a taken branch cost.
3553
3554      3. The vectorizer then checks if an epilogue is required.  If the
3555      cost model check was not done before during prologue check, it
3556      has to be done with the epilogue check.
3557
3558        if (prologue == 0)
3559          jmp to vector code
3560        else
3561          execute prologue
3562        if (prologue == num_iters)
3563          go to exit
3564        vector code:
3565          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3566            jmp to epilogue
3567
3568      Hence the run-time scalar cost should be incremented by 2 taken
3569      branches.
3570
3571      TODO: The back end may reorder the BBS's differently and reverse
3572      conditions/branch directions.  Change the estimates below to
3573      something more reasonable.  */
3574
3575   /* If the number of iterations is known and we do not do versioning, we can
3576      decide whether to vectorize at compile time.  Hence the scalar version
3577      do not carry cost model guard costs.  */
3578   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3579       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3580     {
3581       /* Cost model check occurs at versioning.  */
3582       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3583         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3584       else
3585         {
3586           /* Cost model check occurs at prologue generation.  */
3587           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3588             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3589               + vect_get_stmt_cost (cond_branch_not_taken);
3590           /* Cost model check occurs at epilogue generation.  */
3591           else
3592             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3593         }
3594     }
3595
3596   /* Complete the target-specific cost calculations.  */
3597   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3598                &vec_inside_cost, &vec_epilogue_cost);
3599
3600   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3601
3602   if (dump_enabled_p ())
3603     {
3604       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3605       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3606                    vec_inside_cost);
3607       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3608                    vec_prologue_cost);
3609       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3610                    vec_epilogue_cost);
3611       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3612                    scalar_single_iter_cost);
3613       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3614                    scalar_outside_cost);
3615       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3616                    vec_outside_cost);
3617       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3618                    peel_iters_prologue);
3619       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3620                    peel_iters_epilogue);
3621     }
3622
3623   /* Calculate number of iterations required to make the vector version
3624      profitable, relative to the loop bodies only.  The following condition
3625      must hold true:
3626      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3627      where
3628      SIC = scalar iteration cost, VIC = vector iteration cost,
3629      VOC = vector outside cost, VF = vectorization factor,
3630      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3631      SOC = scalar outside cost for run time cost model check.  */
3632
3633   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3634     {
3635       if (vec_outside_cost <= 0)
3636         min_profitable_iters = 0;
3637       else
3638         {
3639           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3640                                   - vec_inside_cost * peel_iters_prologue
3641                                   - vec_inside_cost * peel_iters_epilogue)
3642                                  / ((scalar_single_iter_cost * vf)
3643                                     - vec_inside_cost);
3644
3645           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3646               <= (((int) vec_inside_cost * min_profitable_iters)
3647                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3648             min_profitable_iters++;
3649         }
3650     }
3651   /* vector version will never be profitable.  */
3652   else
3653     {
3654       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3655         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3656                     "did not happen for a simd loop");
3657
3658       if (dump_enabled_p ())
3659         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3660                          "cost model: the vector iteration cost = %d "
3661                          "divided by the scalar iteration cost = %d "
3662                          "is greater or equal to the vectorization factor = %d"
3663                          ".\n",
3664                          vec_inside_cost, scalar_single_iter_cost, vf);
3665       *ret_min_profitable_niters = -1;
3666       *ret_min_profitable_estimate = -1;
3667       return;
3668     }
3669
3670   dump_printf (MSG_NOTE,
3671                "  Calculated minimum iters for profitability: %d\n",
3672                min_profitable_iters);
3673
3674   /* We want the vectorized loop to execute at least once.  */
3675   if (min_profitable_iters < (vf + peel_iters_prologue))
3676     min_profitable_iters = vf + peel_iters_prologue;
3677
3678   if (dump_enabled_p ())
3679     dump_printf_loc (MSG_NOTE, vect_location,
3680                      "  Runtime profitability threshold = %d\n",
3681                      min_profitable_iters);
3682
3683   *ret_min_profitable_niters = min_profitable_iters;
3684
3685   /* Calculate number of iterations required to make the vector version
3686      profitable, relative to the loop bodies only.
3687
3688      Non-vectorized variant is SIC * niters and it must win over vector
3689      variant on the expected loop trip count.  The following condition must hold true:
3690      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3691
3692   if (vec_outside_cost <= 0)
3693     min_profitable_estimate = 0;
3694   else
3695     {
3696       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3697                                  - vec_inside_cost * peel_iters_prologue
3698                                  - vec_inside_cost * peel_iters_epilogue)
3699                                  / ((scalar_single_iter_cost * vf)
3700                                    - vec_inside_cost);
3701     }
3702   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3703   if (dump_enabled_p ())
3704     dump_printf_loc (MSG_NOTE, vect_location,
3705                      "  Static estimate profitability threshold = %d\n",
3706                      min_profitable_estimate);
3707
3708   *ret_min_profitable_estimate = min_profitable_estimate;
3709 }
3710
3711 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3712    vector elements (not bits) for a vector with NELT elements.  */
3713 static void
3714 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3715                               vec_perm_indices *sel)
3716 {
3717   unsigned int i;
3718
3719   for (i = 0; i < nelt; i++)
3720     sel->quick_push ((i + offset) & (2 * nelt - 1));
3721 }
3722
3723 /* Checks whether the target supports whole-vector shifts for vectors of mode
3724    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3725    it supports vec_perm_const with masks for all necessary shift amounts.  */
3726 static bool
3727 have_whole_vector_shift (machine_mode mode)
3728 {
3729   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3730     return true;
3731
3732   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3733     return false;
3734
3735   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3736   auto_vec_perm_indices sel (nelt);
3737
3738   for (i = nelt/2; i >= 1; i/=2)
3739     {
3740       sel.truncate (0);
3741       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3742       if (!can_vec_perm_p (mode, false, &sel))
3743         return false;
3744     }
3745   return true;
3746 }
3747
3748 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3749    functions. Design better to avoid maintenance issues.  */
3750
3751 /* Function vect_model_reduction_cost.
3752
3753    Models cost for a reduction operation, including the vector ops
3754    generated within the strip-mine loop, the initial definition before
3755    the loop, and the epilogue code that must be generated.  */
3756
3757 static void
3758 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3759                            int ncopies)
3760 {
3761   int prologue_cost = 0, epilogue_cost = 0;
3762   enum tree_code code;
3763   optab optab;
3764   tree vectype;
3765   gimple *orig_stmt;
3766   machine_mode mode;
3767   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3768   struct loop *loop = NULL;
3769   void *target_cost_data;
3770
3771   if (loop_vinfo)
3772     {
3773       loop = LOOP_VINFO_LOOP (loop_vinfo);
3774       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3775     }
3776   else
3777     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3778
3779   /* Condition reductions generate two reductions in the loop.  */
3780   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3781     ncopies *= 2;
3782
3783   /* Cost of reduction op inside loop.  */
3784   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3785                                         stmt_info, 0, vect_body);
3786
3787   vectype = STMT_VINFO_VECTYPE (stmt_info);
3788   mode = TYPE_MODE (vectype);
3789   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3790
3791   if (!orig_stmt)
3792     orig_stmt = STMT_VINFO_STMT (stmt_info);
3793
3794   code = gimple_assign_rhs_code (orig_stmt);
3795
3796   /* Add in cost for initial definition.
3797      For cond reduction we have four vectors: initial index, step, initial
3798      result of the data reduction, initial value of the index reduction.  */
3799   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3800                        == COND_REDUCTION ? 4 : 1;
3801   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3802                                   scalar_to_vec, stmt_info, 0,
3803                                   vect_prologue);
3804
3805   /* Determine cost of epilogue code.
3806
3807      We have a reduction operator that will reduce the vector in one statement.
3808      Also requires scalar extract.  */
3809
3810   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3811     {
3812       if (reduc_fn != IFN_LAST)
3813         {
3814           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3815             {
3816               /* An EQ stmt and an COND_EXPR stmt.  */
3817               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3818                                               vector_stmt, stmt_info, 0,
3819                                               vect_epilogue);
3820               /* Reduction of the max index and a reduction of the found
3821                  values.  */
3822               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3823                                               vec_to_scalar, stmt_info, 0,
3824                                               vect_epilogue);
3825               /* A broadcast of the max value.  */
3826               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3827                                               scalar_to_vec, stmt_info, 0,
3828                                               vect_epilogue);
3829             }
3830           else
3831             {
3832               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3833                                               stmt_info, 0, vect_epilogue);
3834               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3835                                               vec_to_scalar, stmt_info, 0,
3836                                               vect_epilogue);
3837             }
3838         }
3839       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3840         {
3841           unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
3842           /* Extraction of scalar elements.  */
3843           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits,
3844                                           vec_to_scalar, stmt_info, 0,
3845                                           vect_epilogue);
3846           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3847           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits - 3,
3848                                           scalar_stmt, stmt_info, 0,
3849                                           vect_epilogue);
3850         }
3851       else
3852         {
3853           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3854           tree bitsize =
3855             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3856           int element_bitsize = tree_to_uhwi (bitsize);
3857           int nelements = vec_size_in_bits / element_bitsize;
3858
3859           if (code == COND_EXPR)
3860             code = MAX_EXPR;
3861
3862           optab = optab_for_tree_code (code, vectype, optab_default);
3863
3864           /* We have a whole vector shift available.  */
3865           if (optab != unknown_optab
3866               && VECTOR_MODE_P (mode)
3867               && optab_handler (optab, mode) != CODE_FOR_nothing
3868               && have_whole_vector_shift (mode))
3869             {
3870               /* Final reduction via vector shifts and the reduction operator.
3871                  Also requires scalar extract.  */
3872               epilogue_cost += add_stmt_cost (target_cost_data,
3873                                               exact_log2 (nelements) * 2,
3874                                               vector_stmt, stmt_info, 0,
3875                                               vect_epilogue);
3876               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3877                                               vec_to_scalar, stmt_info, 0,
3878                                               vect_epilogue);
3879             }
3880           else
3881             /* Use extracts and reduction op for final reduction.  For N
3882                elements, we have N extracts and N-1 reduction ops.  */
3883             epilogue_cost += add_stmt_cost (target_cost_data,
3884                                             nelements + nelements - 1,
3885                                             vector_stmt, stmt_info, 0,
3886                                             vect_epilogue);
3887         }
3888     }
3889
3890   if (dump_enabled_p ())
3891     dump_printf (MSG_NOTE,
3892                  "vect_model_reduction_cost: inside_cost = %d, "
3893                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3894                  prologue_cost, epilogue_cost);
3895 }
3896
3897
3898 /* Function vect_model_induction_cost.
3899
3900    Models cost for induction operations.  */
3901
3902 static void
3903 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3904 {
3905   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3906   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3907   unsigned inside_cost, prologue_cost;
3908
3909   if (PURE_SLP_STMT (stmt_info))
3910     return;
3911
3912   /* loop cost for vec_loop.  */
3913   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3914                                stmt_info, 0, vect_body);
3915
3916   /* prologue cost for vec_init and vec_step.  */
3917   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3918                                  stmt_info, 0, vect_prologue);
3919
3920   if (dump_enabled_p ())
3921     dump_printf_loc (MSG_NOTE, vect_location,
3922                      "vect_model_induction_cost: inside_cost = %d, "
3923                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3924 }
3925
3926
3927
3928 /* Function get_initial_def_for_reduction
3929
3930    Input:
3931    STMT - a stmt that performs a reduction operation in the loop.
3932    INIT_VAL - the initial value of the reduction variable
3933
3934    Output:
3935    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3936         of the reduction (used for adjusting the epilog - see below).
3937    Return a vector variable, initialized according to the operation that STMT
3938         performs. This vector will be used as the initial value of the
3939         vector of partial results.
3940
3941    Option1 (adjust in epilog): Initialize the vector as follows:
3942      add/bit or/xor:    [0,0,...,0,0]
3943      mult/bit and:      [1,1,...,1,1]
3944      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3945    and when necessary (e.g. add/mult case) let the caller know
3946    that it needs to adjust the result by init_val.
3947
3948    Option2: Initialize the vector as follows:
3949      add/bit or/xor:    [init_val,0,0,...,0]
3950      mult/bit and:      [init_val,1,1,...,1]
3951      min/max/cond_expr: [init_val,init_val,...,init_val]
3952    and no adjustments are needed.
3953
3954    For example, for the following code:
3955
3956    s = init_val;
3957    for (i=0;i<n;i++)
3958      s = s + a[i];
3959
3960    STMT is 's = s + a[i]', and the reduction variable is 's'.
3961    For a vector of 4 units, we want to return either [0,0,0,init_val],
3962    or [0,0,0,0] and let the caller know that it needs to adjust
3963    the result at the end by 'init_val'.
3964
3965    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3966    initialization vector is simpler (same element in all entries), if
3967    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3968
3969    A cost model should help decide between these two schemes.  */
3970
3971 tree
3972 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3973                                tree *adjustment_def)
3974 {
3975   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3976   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3977   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3978   tree scalar_type = TREE_TYPE (init_val);
3979   tree vectype = get_vectype_for_scalar_type (scalar_type);
3980   enum tree_code code = gimple_assign_rhs_code (stmt);
3981   tree def_for_init;
3982   tree init_def;
3983   bool nested_in_vect_loop = false;
3984   REAL_VALUE_TYPE real_init_val = dconst0;
3985   int int_init_val = 0;
3986   gimple *def_stmt = NULL;
3987   gimple_seq stmts = NULL;
3988
3989   gcc_assert (vectype);
3990
3991   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3992               || SCALAR_FLOAT_TYPE_P (scalar_type));
3993
3994   if (nested_in_vect_loop_p (loop, stmt))
3995     nested_in_vect_loop = true;
3996   else
3997     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3998
3999   /* In case of double reduction we only create a vector variable to be put
4000      in the reduction phi node.  The actual statement creation is done in
4001      vect_create_epilog_for_reduction.  */
4002   if (adjustment_def && nested_in_vect_loop
4003       && TREE_CODE (init_val) == SSA_NAME
4004       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4005       && gimple_code (def_stmt) == GIMPLE_PHI
4006       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4007       && vinfo_for_stmt (def_stmt)
4008       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4009           == vect_double_reduction_def)
4010     {
4011       *adjustment_def = NULL;
4012       return vect_create_destination_var (init_val, vectype);
4013     }
4014
4015   /* In case of a nested reduction do not use an adjustment def as
4016      that case is not supported by the epilogue generation correctly
4017      if ncopies is not one.  */
4018   if (adjustment_def && nested_in_vect_loop)
4019     {
4020       *adjustment_def = NULL;
4021       return vect_get_vec_def_for_operand (init_val, stmt);
4022     }
4023
4024   switch (code)
4025     {
4026     case WIDEN_SUM_EXPR:
4027     case DOT_PROD_EXPR:
4028     case SAD_EXPR:
4029     case PLUS_EXPR:
4030     case MINUS_EXPR:
4031     case BIT_IOR_EXPR:
4032     case BIT_XOR_EXPR:
4033     case MULT_EXPR:
4034     case BIT_AND_EXPR:
4035       {
4036         /* ADJUSTMENT_DEF is NULL when called from
4037            vect_create_epilog_for_reduction to vectorize double reduction.  */
4038         if (adjustment_def)
4039           *adjustment_def = init_val;
4040
4041         if (code == MULT_EXPR)
4042           {
4043             real_init_val = dconst1;
4044             int_init_val = 1;
4045           }
4046
4047         if (code == BIT_AND_EXPR)
4048           int_init_val = -1;
4049
4050         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4051           def_for_init = build_real (scalar_type, real_init_val);
4052         else
4053           def_for_init = build_int_cst (scalar_type, int_init_val);
4054
4055         if (adjustment_def)
4056           /* Option1: the first element is '0' or '1' as well.  */
4057           init_def = gimple_build_vector_from_val (&stmts, vectype,
4058                                                    def_for_init);
4059         else
4060           {
4061             /* Option2: the first element is INIT_VAL.  */
4062             tree_vector_builder elts (vectype, 1, 2);
4063             elts.quick_push (init_val);
4064             elts.quick_push (def_for_init);
4065             init_def = gimple_build_vector (&stmts, &elts);
4066           }
4067       }
4068       break;
4069
4070     case MIN_EXPR:
4071     case MAX_EXPR:
4072     case COND_EXPR:
4073       {
4074         if (adjustment_def)
4075           {
4076             *adjustment_def = NULL_TREE;
4077             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4078               {
4079                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4080                 break;
4081               }
4082           }
4083         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4084         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4085       }
4086       break;
4087
4088     default:
4089       gcc_unreachable ();
4090     }
4091
4092   if (stmts)
4093     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4094   return init_def;
4095 }
4096
4097 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4098    NUMBER_OF_VECTORS is the number of vector defs to create.  */
4099
4100 static void
4101 get_initial_defs_for_reduction (slp_tree slp_node,
4102                                 vec<tree> *vec_oprnds,
4103                                 unsigned int number_of_vectors,
4104                                 enum tree_code code, bool reduc_chain)
4105 {
4106   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4107   gimple *stmt = stmts[0];
4108   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4109   unsigned nunits;
4110   unsigned j, number_of_places_left_in_vector;
4111   tree vector_type, scalar_type;
4112   tree vop;
4113   int group_size = stmts.length ();
4114   unsigned int vec_num, i;
4115   unsigned number_of_copies = 1;
4116   vec<tree> voprnds;
4117   voprnds.create (number_of_vectors);
4118   tree neutral_op = NULL;
4119   struct loop *loop;
4120
4121   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4122   scalar_type = TREE_TYPE (vector_type);
4123   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4124
4125   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4126
4127   loop = (gimple_bb (stmt))->loop_father;
4128   gcc_assert (loop);
4129   edge pe = loop_preheader_edge (loop);
4130
4131   /* op is the reduction operand of the first stmt already.  */
4132   /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4133      we need either neutral operands or the original operands.  See
4134      get_initial_def_for_reduction() for details.  */
4135   switch (code)
4136     {
4137     case WIDEN_SUM_EXPR:
4138     case DOT_PROD_EXPR:
4139     case SAD_EXPR:
4140     case PLUS_EXPR:
4141     case MINUS_EXPR:
4142     case BIT_IOR_EXPR:
4143     case BIT_XOR_EXPR:
4144       neutral_op = build_zero_cst (scalar_type);
4145       break;
4146
4147     case MULT_EXPR:
4148       neutral_op = build_one_cst (scalar_type);
4149       break;
4150
4151     case BIT_AND_EXPR:
4152       neutral_op = build_all_ones_cst (scalar_type);
4153       break;
4154
4155     /* For MIN/MAX we don't have an easy neutral operand but
4156        the initial values can be used fine here.  Only for
4157        a reduction chain we have to force a neutral element.  */
4158     case MAX_EXPR:
4159     case MIN_EXPR:
4160       if (! reduc_chain)
4161         neutral_op = NULL;
4162       else
4163         neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4164       break;
4165
4166     default:
4167       gcc_assert (! reduc_chain);
4168       neutral_op = NULL;
4169     }
4170
4171   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4172      created vectors. It is greater than 1 if unrolling is performed.
4173
4174      For example, we have two scalar operands, s1 and s2 (e.g., group of
4175      strided accesses of size two), while NUNITS is four (i.e., four scalars
4176      of this type can be packed in a vector).  The output vector will contain
4177      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4178      will be 2).
4179
4180      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4181      containing the operands.
4182
4183      For example, NUNITS is four as before, and the group size is 8
4184      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4185      {s5, s6, s7, s8}.  */
4186
4187   number_of_copies = nunits * number_of_vectors / group_size;
4188
4189   number_of_places_left_in_vector = nunits;
4190   tree_vector_builder elts (vector_type, nunits, 1);
4191   elts.quick_grow (nunits);
4192   for (j = 0; j < number_of_copies; j++)
4193     {
4194       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4195         {
4196           tree op;
4197           /* Get the def before the loop.  In reduction chain we have only
4198              one initial value.  */
4199           if ((j != (number_of_copies - 1)
4200                || (reduc_chain && i != 0))
4201               && neutral_op)
4202             op = neutral_op;
4203           else
4204             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4205
4206           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4207           number_of_places_left_in_vector--;
4208           elts[number_of_places_left_in_vector] = op;
4209
4210           if (number_of_places_left_in_vector == 0)
4211             {
4212               gimple_seq ctor_seq = NULL;
4213               tree init = gimple_build_vector (&ctor_seq, &elts);
4214               if (ctor_seq != NULL)
4215                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4216               voprnds.quick_push (init);
4217
4218               number_of_places_left_in_vector = nunits;
4219               elts.new_vector (vector_type, nunits, 1);
4220               elts.quick_grow (nunits);
4221             }
4222         }
4223     }
4224
4225   /* Since the vectors are created in the reverse order, we should invert
4226      them.  */
4227   vec_num = voprnds.length ();
4228   for (j = vec_num; j != 0; j--)
4229     {
4230       vop = voprnds[j - 1];
4231       vec_oprnds->quick_push (vop);
4232     }
4233
4234   voprnds.release ();
4235
4236   /* In case that VF is greater than the unrolling factor needed for the SLP
4237      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4238      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4239      to replicate the vectors.  */
4240   tree neutral_vec = NULL;
4241   while (number_of_vectors > vec_oprnds->length ())
4242     {
4243       if (neutral_op)
4244         {
4245           if (!neutral_vec)
4246             {
4247               gimple_seq ctor_seq = NULL;
4248               neutral_vec = gimple_build_vector_from_val
4249                 (&ctor_seq, vector_type, neutral_op);
4250               if (ctor_seq != NULL)
4251                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4252             }
4253           vec_oprnds->quick_push (neutral_vec);
4254         }
4255       else
4256         {
4257           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4258             vec_oprnds->quick_push (vop);
4259         }
4260     }
4261 }
4262
4263
4264 /* Function vect_create_epilog_for_reduction
4265
4266    Create code at the loop-epilog to finalize the result of a reduction
4267    computation.
4268
4269    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4270      reduction statements.
4271    STMT is the scalar reduction stmt that is being vectorized.
4272    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4273      number of elements that we can fit in a vectype (nunits).  In this case
4274      we have to generate more than one vector stmt - i.e - we need to "unroll"
4275      the vector stmt by a factor VF/nunits.  For more details see documentation
4276      in vectorizable_operation.
4277    REDUC_FN is the internal function for the epilog reduction.
4278    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4279      computation.
4280    REDUC_INDEX is the index of the operand in the right hand side of the
4281      statement that is defined by REDUCTION_PHI.
4282    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4283    SLP_NODE is an SLP node containing a group of reduction statements. The
4284      first one in this group is STMT.
4285    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4286      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4287      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4288      any value of the IV in the loop.
4289    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4290
4291    This function:
4292    1. Creates the reduction def-use cycles: sets the arguments for
4293       REDUCTION_PHIS:
4294       The loop-entry argument is the vectorized initial-value of the reduction.
4295       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4296       sums.
4297    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4298       by calling the function specified by REDUC_FN if available, or by
4299       other means (whole-vector shifts or a scalar loop).
4300       The function also creates a new phi node at the loop exit to preserve
4301       loop-closed form, as illustrated below.
4302
4303      The flow at the entry to this function:
4304
4305         loop:
4306           vec_def = phi <null, null>            # REDUCTION_PHI
4307           VECT_DEF = vector_stmt                # vectorized form of STMT
4308           s_loop = scalar_stmt                  # (scalar) STMT
4309         loop_exit:
4310           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4311           use <s_out0>
4312           use <s_out0>
4313
4314      The above is transformed by this function into:
4315
4316         loop:
4317           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4318           VECT_DEF = vector_stmt                # vectorized form of STMT
4319           s_loop = scalar_stmt                  # (scalar) STMT
4320         loop_exit:
4321           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4322           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4323           v_out2 = reduce <v_out1>
4324           s_out3 = extract_field <v_out2, 0>
4325           s_out4 = adjust_result <s_out3>
4326           use <s_out4>
4327           use <s_out4>
4328 */
4329
4330 static void
4331 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4332                                   gimple *reduc_def_stmt,
4333                                   int ncopies, internal_fn reduc_fn,
4334                                   vec<gimple *> reduction_phis,
4335                                   bool double_reduc,
4336                                   slp_tree slp_node,
4337                                   slp_instance slp_node_instance,
4338                                   tree induc_val, enum tree_code induc_code)
4339 {
4340   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4341   stmt_vec_info prev_phi_info;
4342   tree vectype;
4343   machine_mode mode;
4344   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4345   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4346   basic_block exit_bb;
4347   tree scalar_dest;
4348   tree scalar_type;
4349   gimple *new_phi = NULL, *phi;
4350   gimple_stmt_iterator exit_gsi;
4351   tree vec_dest;
4352   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4353   gimple *epilog_stmt = NULL;
4354   enum tree_code code = gimple_assign_rhs_code (stmt);
4355   gimple *exit_phi;
4356   tree bitsize;
4357   tree adjustment_def = NULL;
4358   tree vec_initial_def = NULL;
4359   tree expr, def, initial_def = NULL;
4360   tree orig_name, scalar_result;
4361   imm_use_iterator imm_iter, phi_imm_iter;
4362   use_operand_p use_p, phi_use_p;
4363   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4364   bool nested_in_vect_loop = false;
4365   auto_vec<gimple *> new_phis;
4366   auto_vec<gimple *> inner_phis;
4367   enum vect_def_type dt = vect_unknown_def_type;
4368   int j, i;
4369   auto_vec<tree> scalar_results;
4370   unsigned int group_size = 1, k, ratio;
4371   auto_vec<tree> vec_initial_defs;
4372   auto_vec<gimple *> phis;
4373   bool slp_reduc = false;
4374   tree new_phi_result;
4375   gimple *inner_phi = NULL;
4376   tree induction_index = NULL_TREE;
4377
4378   if (slp_node)
4379     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4380
4381   if (nested_in_vect_loop_p (loop, stmt))
4382     {
4383       outer_loop = loop;
4384       loop = loop->inner;
4385       nested_in_vect_loop = true;
4386       gcc_assert (!slp_node);
4387     }
4388
4389   vectype = STMT_VINFO_VECTYPE (stmt_info);
4390   gcc_assert (vectype);
4391   mode = TYPE_MODE (vectype);
4392
4393   /* 1. Create the reduction def-use cycle:
4394      Set the arguments of REDUCTION_PHIS, i.e., transform
4395
4396         loop:
4397           vec_def = phi <null, null>            # REDUCTION_PHI
4398           VECT_DEF = vector_stmt                # vectorized form of STMT
4399           ...
4400
4401      into:
4402
4403         loop:
4404           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4405           VECT_DEF = vector_stmt                # vectorized form of STMT
4406           ...
4407
4408      (in case of SLP, do it for all the phis). */
4409
4410   /* Get the loop-entry arguments.  */
4411   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4412   if (slp_node)
4413     {
4414       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4415       vec_initial_defs.reserve (vec_num);
4416       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4417                                       &vec_initial_defs, vec_num, code,
4418                                       GROUP_FIRST_ELEMENT (stmt_info));
4419     }
4420   else
4421     {
4422       /* Get at the scalar def before the loop, that defines the initial value
4423          of the reduction variable.  */
4424       gimple *def_stmt;
4425       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4426                                            loop_preheader_edge (loop));
4427       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4428          and we can't use zero for induc_val, use initial_def.  Similarly
4429          for REDUC_MIN and initial_def larger than the base.  */
4430       if (TREE_CODE (initial_def) == INTEGER_CST
4431           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4432               == INTEGER_INDUC_COND_REDUCTION)
4433           && !integer_zerop (induc_val)
4434           && ((induc_code == MAX_EXPR
4435                && tree_int_cst_lt (initial_def, induc_val))
4436               || (induc_code == MIN_EXPR
4437                   && tree_int_cst_lt (induc_val, initial_def))))
4438         induc_val = initial_def;
4439       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4440       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4441                                                        &adjustment_def);
4442       vec_initial_defs.create (1);
4443       vec_initial_defs.quick_push (vec_initial_def);
4444     }
4445
4446   /* Set phi nodes arguments.  */
4447   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4448     {
4449       tree vec_init_def = vec_initial_defs[i];
4450       tree def = vect_defs[i];
4451       for (j = 0; j < ncopies; j++)
4452         {
4453           if (j != 0)
4454             {
4455               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4456               if (nested_in_vect_loop)
4457                 vec_init_def
4458                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4459                                                     vec_init_def);
4460             }
4461
4462           /* Set the loop-entry arg of the reduction-phi.  */
4463
4464           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4465               == INTEGER_INDUC_COND_REDUCTION)
4466             {
4467               /* Initialise the reduction phi to zero.  This prevents initial
4468                  values of non-zero interferring with the reduction op.  */
4469               gcc_assert (ncopies == 1);
4470               gcc_assert (i == 0);
4471
4472               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4473               tree induc_val_vec
4474                 = build_vector_from_val (vec_init_def_type, induc_val);
4475
4476               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4477                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4478             }
4479           else
4480             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4481                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4482
4483           /* Set the loop-latch arg for the reduction-phi.  */
4484           if (j > 0)
4485             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4486
4487           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4488                        UNKNOWN_LOCATION);
4489
4490           if (dump_enabled_p ())
4491             {
4492               dump_printf_loc (MSG_NOTE, vect_location,
4493                                "transform reduction: created def-use cycle: ");
4494               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4495               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4496             }
4497         }
4498     }
4499
4500   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4501      which is updated with the current index of the loop for every match of
4502      the original loop's cond_expr (VEC_STMT).  This results in a vector
4503      containing the last time the condition passed for that vector lane.
4504      The first match will be a 1 to allow 0 to be used for non-matching
4505      indexes.  If there are no matches at all then the vector will be all
4506      zeroes.  */
4507   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4508     {
4509       tree indx_before_incr, indx_after_incr;
4510       int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4511       int k;
4512
4513       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4514       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4515
4516       int scalar_precision
4517         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4518       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4519       tree cr_index_vector_type = build_vector_type
4520         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4521
4522       /* First we create a simple vector induction variable which starts
4523          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4524          vector size (STEP).  */
4525
4526       /* Create a {1,2,3,...} vector.  */
4527       tree_vector_builder vtemp (cr_index_vector_type, 1, 3);
4528       for (k = 0; k < 3; ++k)
4529         vtemp.quick_push (build_int_cst (cr_index_scalar_type, k + 1));
4530       tree series_vect = vtemp.build ();
4531
4532       /* Create a vector of the step value.  */
4533       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4534       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4535
4536       /* Create an induction variable.  */
4537       gimple_stmt_iterator incr_gsi;
4538       bool insert_after;
4539       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4540       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4541                  insert_after, &indx_before_incr, &indx_after_incr);
4542
4543       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4544          filled with zeros (VEC_ZERO).  */
4545
4546       /* Create a vector of 0s.  */
4547       tree zero = build_zero_cst (cr_index_scalar_type);
4548       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4549
4550       /* Create a vector phi node.  */
4551       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4552       new_phi = create_phi_node (new_phi_tree, loop->header);
4553       set_vinfo_for_stmt (new_phi,
4554                           new_stmt_vec_info (new_phi, loop_vinfo));
4555       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4556                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4557
4558       /* Now take the condition from the loops original cond_expr
4559          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4560          every match uses values from the induction variable
4561          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4562          (NEW_PHI_TREE).
4563          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4564          the new cond_expr (INDEX_COND_EXPR).  */
4565
4566       /* Duplicate the condition from vec_stmt.  */
4567       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4568
4569       /* Create a conditional, where the condition is taken from vec_stmt
4570          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4571          else is the phi (NEW_PHI_TREE).  */
4572       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4573                                      ccompare, indx_before_incr,
4574                                      new_phi_tree);
4575       induction_index = make_ssa_name (cr_index_vector_type);
4576       gimple *index_condition = gimple_build_assign (induction_index,
4577                                                      index_cond_expr);
4578       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4579       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4580                                                         loop_vinfo);
4581       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4582       set_vinfo_for_stmt (index_condition, index_vec_info);
4583
4584       /* Update the phi with the vec cond.  */
4585       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4586                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4587     }
4588
4589   /* 2. Create epilog code.
4590         The reduction epilog code operates across the elements of the vector
4591         of partial results computed by the vectorized loop.
4592         The reduction epilog code consists of:
4593
4594         step 1: compute the scalar result in a vector (v_out2)
4595         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4596         step 3: adjust the scalar result (s_out3) if needed.
4597
4598         Step 1 can be accomplished using one the following three schemes:
4599           (scheme 1) using reduc_fn, if available.
4600           (scheme 2) using whole-vector shifts, if available.
4601           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4602                      combined.
4603
4604           The overall epilog code looks like this:
4605
4606           s_out0 = phi <s_loop>         # original EXIT_PHI
4607           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4608           v_out2 = reduce <v_out1>              # step 1
4609           s_out3 = extract_field <v_out2, 0>    # step 2
4610           s_out4 = adjust_result <s_out3>       # step 3
4611
4612           (step 3 is optional, and steps 1 and 2 may be combined).
4613           Lastly, the uses of s_out0 are replaced by s_out4.  */
4614
4615
4616   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4617          v_out1 = phi <VECT_DEF>
4618          Store them in NEW_PHIS.  */
4619
4620   exit_bb = single_exit (loop)->dest;
4621   prev_phi_info = NULL;
4622   new_phis.create (vect_defs.length ());
4623   FOR_EACH_VEC_ELT (vect_defs, i, def)
4624     {
4625       for (j = 0; j < ncopies; j++)
4626         {
4627           tree new_def = copy_ssa_name (def);
4628           phi = create_phi_node (new_def, exit_bb);
4629           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4630           if (j == 0)
4631             new_phis.quick_push (phi);
4632           else
4633             {
4634               def = vect_get_vec_def_for_stmt_copy (dt, def);
4635               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4636             }
4637
4638           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4639           prev_phi_info = vinfo_for_stmt (phi);
4640         }
4641     }
4642
4643   /* The epilogue is created for the outer-loop, i.e., for the loop being
4644      vectorized.  Create exit phis for the outer loop.  */
4645   if (double_reduc)
4646     {
4647       loop = outer_loop;
4648       exit_bb = single_exit (loop)->dest;
4649       inner_phis.create (vect_defs.length ());
4650       FOR_EACH_VEC_ELT (new_phis, i, phi)
4651         {
4652           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4653           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4654           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4655                            PHI_RESULT (phi));
4656           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4657                                                             loop_vinfo));
4658           inner_phis.quick_push (phi);
4659           new_phis[i] = outer_phi;
4660           prev_phi_info = vinfo_for_stmt (outer_phi);
4661           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4662             {
4663               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4664               new_result = copy_ssa_name (PHI_RESULT (phi));
4665               outer_phi = create_phi_node (new_result, exit_bb);
4666               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4667                                PHI_RESULT (phi));
4668               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4669                                                                 loop_vinfo));
4670               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4671               prev_phi_info = vinfo_for_stmt (outer_phi);
4672             }
4673         }
4674     }
4675
4676   exit_gsi = gsi_after_labels (exit_bb);
4677
4678   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4679          (i.e. when reduc_fn is not available) and in the final adjustment
4680          code (if needed).  Also get the original scalar reduction variable as
4681          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4682          represents a reduction pattern), the tree-code and scalar-def are
4683          taken from the original stmt that the pattern-stmt (STMT) replaces.
4684          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4685          are taken from STMT.  */
4686
4687   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4688   if (!orig_stmt)
4689     {
4690       /* Regular reduction  */
4691       orig_stmt = stmt;
4692     }
4693   else
4694     {
4695       /* Reduction pattern  */
4696       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4697       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4698       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4699     }
4700
4701   code = gimple_assign_rhs_code (orig_stmt);
4702   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4703      partial results are added and not subtracted.  */
4704   if (code == MINUS_EXPR)
4705     code = PLUS_EXPR;
4706
4707   scalar_dest = gimple_assign_lhs (orig_stmt);
4708   scalar_type = TREE_TYPE (scalar_dest);
4709   scalar_results.create (group_size);
4710   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4711   bitsize = TYPE_SIZE (scalar_type);
4712
4713   /* In case this is a reduction in an inner-loop while vectorizing an outer
4714      loop - we don't need to extract a single scalar result at the end of the
4715      inner-loop (unless it is double reduction, i.e., the use of reduction is
4716      outside the outer-loop).  The final vector of partial results will be used
4717      in the vectorized outer-loop, or reduced to a scalar result at the end of
4718      the outer-loop.  */
4719   if (nested_in_vect_loop && !double_reduc)
4720     goto vect_finalize_reduction;
4721
4722   /* SLP reduction without reduction chain, e.g.,
4723      # a1 = phi <a2, a0>
4724      # b1 = phi <b2, b0>
4725      a2 = operation (a1)
4726      b2 = operation (b1)  */
4727   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4728
4729   /* In case of reduction chain, e.g.,
4730      # a1 = phi <a3, a0>
4731      a2 = operation (a1)
4732      a3 = operation (a2),
4733
4734      we may end up with more than one vector result.  Here we reduce them to
4735      one vector.  */
4736   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4737     {
4738       tree first_vect = PHI_RESULT (new_phis[0]);
4739       gassign *new_vec_stmt = NULL;
4740       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4741       for (k = 1; k < new_phis.length (); k++)
4742         {
4743           gimple *next_phi = new_phis[k];
4744           tree second_vect = PHI_RESULT (next_phi);
4745           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4746           new_vec_stmt = gimple_build_assign (tem, code,
4747                                               first_vect, second_vect);
4748           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4749           first_vect = tem;
4750         }
4751
4752       new_phi_result = first_vect;
4753       if (new_vec_stmt)
4754         {
4755           new_phis.truncate (0);
4756           new_phis.safe_push (new_vec_stmt);
4757         }
4758     }
4759   /* Likewise if we couldn't use a single defuse cycle.  */
4760   else if (ncopies > 1)
4761     {
4762       gcc_assert (new_phis.length () == 1);
4763       tree first_vect = PHI_RESULT (new_phis[0]);
4764       gassign *new_vec_stmt = NULL;
4765       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4766       gimple *next_phi = new_phis[0];
4767       for (int k = 1; k < ncopies; ++k)
4768         {
4769           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4770           tree second_vect = PHI_RESULT (next_phi);
4771           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4772           new_vec_stmt = gimple_build_assign (tem, code,
4773                                               first_vect, second_vect);
4774           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4775           first_vect = tem;
4776         }
4777       new_phi_result = first_vect;
4778       new_phis.truncate (0);
4779       new_phis.safe_push (new_vec_stmt);
4780     }
4781   else
4782     new_phi_result = PHI_RESULT (new_phis[0]);
4783
4784   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4785       && reduc_fn != IFN_LAST)
4786     {
4787       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4788          various data values where the condition matched and another vector
4789          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4790          need to extract the last matching index (which will be the index with
4791          highest value) and use this to index into the data vector.
4792          For the case where there were no matches, the data vector will contain
4793          all default values and the index vector will be all zeros.  */
4794
4795       /* Get various versions of the type of the vector of indexes.  */
4796       tree index_vec_type = TREE_TYPE (induction_index);
4797       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4798       tree index_scalar_type = TREE_TYPE (index_vec_type);
4799       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4800         (index_vec_type);
4801
4802       /* Get an unsigned integer version of the type of the data vector.  */
4803       int scalar_precision
4804         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4805       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4806       tree vectype_unsigned = build_vector_type
4807         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4808
4809       /* First we need to create a vector (ZERO_VEC) of zeros and another
4810          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4811          can create using a MAX reduction and then expanding.
4812          In the case where the loop never made any matches, the max index will
4813          be zero.  */
4814
4815       /* Vector of {0, 0, 0,...}.  */
4816       tree zero_vec = make_ssa_name (vectype);
4817       tree zero_vec_rhs = build_zero_cst (vectype);
4818       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4819       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4820
4821       /* Find maximum value from the vector of found indexes.  */
4822       tree max_index = make_ssa_name (index_scalar_type);
4823       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4824                                                           1, induction_index);
4825       gimple_call_set_lhs (max_index_stmt, max_index);
4826       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4827
4828       /* Vector of {max_index, max_index, max_index,...}.  */
4829       tree max_index_vec = make_ssa_name (index_vec_type);
4830       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4831                                                       max_index);
4832       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4833                                                         max_index_vec_rhs);
4834       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4835
4836       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4837          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4838          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4839          otherwise.  Only one value should match, resulting in a vector
4840          (VEC_COND) with one data value and the rest zeros.
4841          In the case where the loop never made any matches, every index will
4842          match, resulting in a vector with all data values (which will all be
4843          the default value).  */
4844
4845       /* Compare the max index vector to the vector of found indexes to find
4846          the position of the max value.  */
4847       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4848       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4849                                                       induction_index,
4850                                                       max_index_vec);
4851       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4852
4853       /* Use the compare to choose either values from the data vector or
4854          zero.  */
4855       tree vec_cond = make_ssa_name (vectype);
4856       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4857                                                    vec_compare, new_phi_result,
4858                                                    zero_vec);
4859       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4860
4861       /* Finally we need to extract the data value from the vector (VEC_COND)
4862          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
4863          reduction, but because this doesn't exist, we can use a MAX reduction
4864          instead.  The data value might be signed or a float so we need to cast
4865          it first.
4866          In the case where the loop never made any matches, the data values are
4867          all identical, and so will reduce down correctly.  */
4868
4869       /* Make the matched data values unsigned.  */
4870       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4871       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4872                                        vec_cond);
4873       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4874                                                         VIEW_CONVERT_EXPR,
4875                                                         vec_cond_cast_rhs);
4876       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4877
4878       /* Reduce down to a scalar value.  */
4879       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4880       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4881                                                            1, vec_cond_cast);
4882       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4883       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4884
4885       /* Convert the reduced value back to the result type and set as the
4886          result.  */
4887       gimple_seq stmts = NULL;
4888       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4889                                data_reduc);
4890       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4891       scalar_results.safe_push (new_temp);
4892     }
4893   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4894            && reduc_fn == IFN_LAST)
4895     {
4896       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
4897          idx = 0;
4898          idx_val = induction_index[0];
4899          val = data_reduc[0];
4900          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4901            if (induction_index[i] > idx_val)
4902              val = data_reduc[i], idx_val = induction_index[i];
4903          return val;  */
4904
4905       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4906       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4907       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4908       unsigned HOST_WIDE_INT v_size
4909         = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4910       tree idx_val = NULL_TREE, val = NULL_TREE;
4911       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4912         {
4913           tree old_idx_val = idx_val;
4914           tree old_val = val;
4915           idx_val = make_ssa_name (idx_eltype);
4916           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4917                                              build3 (BIT_FIELD_REF, idx_eltype,
4918                                                      induction_index,
4919                                                      bitsize_int (el_size),
4920                                                      bitsize_int (off)));
4921           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4922           val = make_ssa_name (data_eltype);
4923           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4924                                              build3 (BIT_FIELD_REF,
4925                                                      data_eltype,
4926                                                      new_phi_result,
4927                                                      bitsize_int (el_size),
4928                                                      bitsize_int (off)));
4929           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4930           if (off != 0)
4931             {
4932               tree new_idx_val = idx_val;
4933               tree new_val = val;
4934               if (off != v_size - el_size)
4935                 {
4936                   new_idx_val = make_ssa_name (idx_eltype);
4937                   epilog_stmt = gimple_build_assign (new_idx_val,
4938                                                      MAX_EXPR, idx_val,
4939                                                      old_idx_val);
4940                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4941                 }
4942               new_val = make_ssa_name (data_eltype);
4943               epilog_stmt = gimple_build_assign (new_val,
4944                                                  COND_EXPR,
4945                                                  build2 (GT_EXPR,
4946                                                          boolean_type_node,
4947                                                          idx_val,
4948                                                          old_idx_val),
4949                                                  val, old_val);
4950               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4951               idx_val = new_idx_val;
4952               val = new_val;
4953             }
4954         }
4955       /* Convert the reduced value back to the result type and set as the
4956          result.  */
4957       gimple_seq stmts = NULL;
4958       val = gimple_convert (&stmts, scalar_type, val);
4959       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4960       scalar_results.safe_push (val);
4961     }
4962
4963   /* 2.3 Create the reduction code, using one of the three schemes described
4964          above. In SLP we simply need to extract all the elements from the
4965          vector (without reducing them), so we use scalar shifts.  */
4966   else if (reduc_fn != IFN_LAST && !slp_reduc)
4967     {
4968       tree tmp;
4969       tree vec_elem_type;
4970
4971       /* Case 1:  Create:
4972          v_out2 = reduc_expr <v_out1>  */
4973
4974       if (dump_enabled_p ())
4975         dump_printf_loc (MSG_NOTE, vect_location,
4976                          "Reduce using direct vector reduction.\n");
4977
4978       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4979       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4980         {
4981           tree tmp_dest
4982             = vect_create_destination_var (scalar_dest, vec_elem_type);
4983           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4984                                                     new_phi_result);
4985           gimple_set_lhs (epilog_stmt, tmp_dest);
4986           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4987           gimple_set_lhs (epilog_stmt, new_temp);
4988           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4989
4990           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4991                                              new_temp);
4992         }
4993       else
4994         {
4995           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4996                                                     new_phi_result);
4997           gimple_set_lhs (epilog_stmt, new_scalar_dest);
4998         }
4999
5000       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5001       gimple_set_lhs (epilog_stmt, new_temp);
5002       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5003
5004       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5005            == INTEGER_INDUC_COND_REDUCTION)
5006           && !operand_equal_p (initial_def, induc_val, 0))
5007         {
5008           /* Earlier we set the initial value to be a vector if induc_val
5009              values.  Check the result and if it is induc_val then replace
5010              with the original initial value, unless induc_val is
5011              the same as initial_def already.  */
5012           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5013                                   induc_val);
5014
5015           tmp = make_ssa_name (new_scalar_dest);
5016           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5017                                              initial_def, new_temp);
5018           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5019           new_temp = tmp;
5020         }
5021
5022       scalar_results.safe_push (new_temp);
5023     }
5024   else
5025     {
5026       bool reduce_with_shift = have_whole_vector_shift (mode);
5027       int element_bitsize = tree_to_uhwi (bitsize);
5028       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5029       tree vec_temp;
5030
5031       /* COND reductions all do the final reduction with MAX_EXPR
5032          or MIN_EXPR.  */
5033       if (code == COND_EXPR)
5034         {
5035           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5036               == INTEGER_INDUC_COND_REDUCTION)
5037             code = induc_code;
5038           else
5039             code = MAX_EXPR;
5040         }
5041
5042       /* Regardless of whether we have a whole vector shift, if we're
5043          emulating the operation via tree-vect-generic, we don't want
5044          to use it.  Only the first round of the reduction is likely
5045          to still be profitable via emulation.  */
5046       /* ??? It might be better to emit a reduction tree code here, so that
5047          tree-vect-generic can expand the first round via bit tricks.  */
5048       if (!VECTOR_MODE_P (mode))
5049         reduce_with_shift = false;
5050       else
5051         {
5052           optab optab = optab_for_tree_code (code, vectype, optab_default);
5053           if (optab_handler (optab, mode) == CODE_FOR_nothing)
5054             reduce_with_shift = false;
5055         }
5056
5057       if (reduce_with_shift && !slp_reduc)
5058         {
5059           int nelements = vec_size_in_bits / element_bitsize;
5060           auto_vec_perm_indices sel (nelements);
5061
5062           int elt_offset;
5063
5064           tree zero_vec = build_zero_cst (vectype);
5065           /* Case 2: Create:
5066              for (offset = nelements/2; offset >= 1; offset/=2)
5067                 {
5068                   Create:  va' = vec_shift <va, offset>
5069                   Create:  va = vop <va, va'>
5070                 }  */
5071
5072           tree rhs;
5073
5074           if (dump_enabled_p ())
5075             dump_printf_loc (MSG_NOTE, vect_location,
5076                              "Reduce using vector shifts\n");
5077
5078           vec_dest = vect_create_destination_var (scalar_dest, vectype);
5079           new_temp = new_phi_result;
5080           for (elt_offset = nelements / 2;
5081                elt_offset >= 1;
5082                elt_offset /= 2)
5083             {
5084               sel.truncate (0);
5085               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5086               tree mask = vect_gen_perm_mask_any (vectype, sel);
5087               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5088                                                  new_temp, zero_vec, mask);
5089               new_name = make_ssa_name (vec_dest, epilog_stmt);
5090               gimple_assign_set_lhs (epilog_stmt, new_name);
5091               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5092
5093               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5094                                                  new_temp);
5095               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5096               gimple_assign_set_lhs (epilog_stmt, new_temp);
5097               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5098             }
5099
5100           /* 2.4  Extract the final scalar result.  Create:
5101              s_out3 = extract_field <v_out2, bitpos>  */
5102
5103           if (dump_enabled_p ())
5104             dump_printf_loc (MSG_NOTE, vect_location,
5105                              "extract scalar result\n");
5106
5107           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5108                         bitsize, bitsize_zero_node);
5109           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5110           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5111           gimple_assign_set_lhs (epilog_stmt, new_temp);
5112           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5113           scalar_results.safe_push (new_temp);
5114         }
5115       else
5116         {
5117           /* Case 3: Create:
5118              s = extract_field <v_out2, 0>
5119              for (offset = element_size;
5120                   offset < vector_size;
5121                   offset += element_size;)
5122                {
5123                  Create:  s' = extract_field <v_out2, offset>
5124                  Create:  s = op <s, s'>  // For non SLP cases
5125                }  */
5126
5127           if (dump_enabled_p ())
5128             dump_printf_loc (MSG_NOTE, vect_location,
5129                              "Reduce using scalar code.\n");
5130
5131           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5132           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5133             {
5134               int bit_offset;
5135               if (gimple_code (new_phi) == GIMPLE_PHI)
5136                 vec_temp = PHI_RESULT (new_phi);
5137               else
5138                 vec_temp = gimple_assign_lhs (new_phi);
5139               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5140                                  bitsize_zero_node);
5141               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5142               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5143               gimple_assign_set_lhs (epilog_stmt, new_temp);
5144               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5145
5146               /* In SLP we don't need to apply reduction operation, so we just
5147                  collect s' values in SCALAR_RESULTS.  */
5148               if (slp_reduc)
5149                 scalar_results.safe_push (new_temp);
5150
5151               for (bit_offset = element_bitsize;
5152                    bit_offset < vec_size_in_bits;
5153                    bit_offset += element_bitsize)
5154                 {
5155                   tree bitpos = bitsize_int (bit_offset);
5156                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5157                                      bitsize, bitpos);
5158
5159                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5160                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5161                   gimple_assign_set_lhs (epilog_stmt, new_name);
5162                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5163
5164                   if (slp_reduc)
5165                     {
5166                       /* In SLP we don't need to apply reduction operation, so
5167                          we just collect s' values in SCALAR_RESULTS.  */
5168                       new_temp = new_name;
5169                       scalar_results.safe_push (new_name);
5170                     }
5171                   else
5172                     {
5173                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5174                                                          new_name, new_temp);
5175                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5176                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5177                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5178                     }
5179                 }
5180             }
5181
5182           /* The only case where we need to reduce scalar results in SLP, is
5183              unrolling.  If the size of SCALAR_RESULTS is greater than
5184              GROUP_SIZE, we reduce them combining elements modulo
5185              GROUP_SIZE.  */
5186           if (slp_reduc)
5187             {
5188               tree res, first_res, new_res;
5189               gimple *new_stmt;
5190
5191               /* Reduce multiple scalar results in case of SLP unrolling.  */
5192               for (j = group_size; scalar_results.iterate (j, &res);
5193                    j++)
5194                 {
5195                   first_res = scalar_results[j % group_size];
5196                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5197                                                   first_res, res);
5198                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5199                   gimple_assign_set_lhs (new_stmt, new_res);
5200                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5201                   scalar_results[j % group_size] = new_res;
5202                 }
5203             }
5204           else
5205             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5206             scalar_results.safe_push (new_temp);
5207         }
5208
5209       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5210            == INTEGER_INDUC_COND_REDUCTION)
5211           && !operand_equal_p (initial_def, induc_val, 0))
5212         {
5213           /* Earlier we set the initial value to be a vector if induc_val
5214              values.  Check the result and if it is induc_val then replace
5215              with the original initial value, unless induc_val is
5216              the same as initial_def already.  */
5217           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5218                                   induc_val);
5219
5220           tree tmp = make_ssa_name (new_scalar_dest);
5221           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5222                                              initial_def, new_temp);
5223           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5224           scalar_results[0] = tmp;
5225         }
5226     }
5227
5228 vect_finalize_reduction:
5229
5230   if (double_reduc)
5231     loop = loop->inner;
5232
5233   /* 2.5 Adjust the final result by the initial value of the reduction
5234          variable. (When such adjustment is not needed, then
5235          'adjustment_def' is zero).  For example, if code is PLUS we create:
5236          new_temp = loop_exit_def + adjustment_def  */
5237
5238   if (adjustment_def)
5239     {
5240       gcc_assert (!slp_reduc);
5241       if (nested_in_vect_loop)
5242         {
5243           new_phi = new_phis[0];
5244           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5245           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5246           new_dest = vect_create_destination_var (scalar_dest, vectype);
5247         }
5248       else
5249         {
5250           new_temp = scalar_results[0];
5251           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5252           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5253           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5254         }
5255
5256       epilog_stmt = gimple_build_assign (new_dest, expr);
5257       new_temp = make_ssa_name (new_dest, epilog_stmt);
5258       gimple_assign_set_lhs (epilog_stmt, new_temp);
5259       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5260       if (nested_in_vect_loop)
5261         {
5262           set_vinfo_for_stmt (epilog_stmt,
5263                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5264           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5265                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5266
5267           if (!double_reduc)
5268             scalar_results.quick_push (new_temp);
5269           else
5270             scalar_results[0] = new_temp;
5271         }
5272       else
5273         scalar_results[0] = new_temp;
5274
5275       new_phis[0] = epilog_stmt;
5276     }
5277
5278   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5279           phis with new adjusted scalar results, i.e., replace use <s_out0>
5280           with use <s_out4>.
5281
5282      Transform:
5283         loop_exit:
5284           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5285           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5286           v_out2 = reduce <v_out1>
5287           s_out3 = extract_field <v_out2, 0>
5288           s_out4 = adjust_result <s_out3>
5289           use <s_out0>
5290           use <s_out0>
5291
5292      into:
5293
5294         loop_exit:
5295           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5296           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5297           v_out2 = reduce <v_out1>
5298           s_out3 = extract_field <v_out2, 0>
5299           s_out4 = adjust_result <s_out3>
5300           use <s_out4>
5301           use <s_out4> */
5302
5303
5304   /* In SLP reduction chain we reduce vector results into one vector if
5305      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5306      the last stmt in the reduction chain, since we are looking for the loop
5307      exit phi node.  */
5308   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5309     {
5310       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5311       /* Handle reduction patterns.  */
5312       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5313         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5314
5315       scalar_dest = gimple_assign_lhs (dest_stmt);
5316       group_size = 1;
5317     }
5318
5319   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5320      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5321      need to match SCALAR_RESULTS with corresponding statements.  The first
5322      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5323      the first vector stmt, etc.
5324      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5325   if (group_size > new_phis.length ())
5326     {
5327       ratio = group_size / new_phis.length ();
5328       gcc_assert (!(group_size % new_phis.length ()));
5329     }
5330   else
5331     ratio = 1;
5332
5333   for (k = 0; k < group_size; k++)
5334     {
5335       if (k % ratio == 0)
5336         {
5337           epilog_stmt = new_phis[k / ratio];
5338           reduction_phi = reduction_phis[k / ratio];
5339           if (double_reduc)
5340             inner_phi = inner_phis[k / ratio];
5341         }
5342
5343       if (slp_reduc)
5344         {
5345           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5346
5347           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5348           /* SLP statements can't participate in patterns.  */
5349           gcc_assert (!orig_stmt);
5350           scalar_dest = gimple_assign_lhs (current_stmt);
5351         }
5352
5353       phis.create (3);
5354       /* Find the loop-closed-use at the loop exit of the original scalar
5355          result.  (The reduction result is expected to have two immediate uses -
5356          one at the latch block, and one at the loop exit).  */
5357       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5358         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5359             && !is_gimple_debug (USE_STMT (use_p)))
5360           phis.safe_push (USE_STMT (use_p));
5361
5362       /* While we expect to have found an exit_phi because of loop-closed-ssa
5363          form we can end up without one if the scalar cycle is dead.  */
5364
5365       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5366         {
5367           if (outer_loop)
5368             {
5369               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5370               gphi *vect_phi;
5371
5372               /* FORNOW. Currently not supporting the case that an inner-loop
5373                  reduction is not used in the outer-loop (but only outside the
5374                  outer-loop), unless it is double reduction.  */
5375               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5376                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5377                           || double_reduc);
5378
5379               if (double_reduc)
5380                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5381               else
5382                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5383               if (!double_reduc
5384                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5385                       != vect_double_reduction_def)
5386                 continue;
5387
5388               /* Handle double reduction:
5389
5390                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5391                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5392                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5393                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5394
5395                  At that point the regular reduction (stmt2 and stmt3) is
5396                  already vectorized, as well as the exit phi node, stmt4.
5397                  Here we vectorize the phi node of double reduction, stmt1, and
5398                  update all relevant statements.  */
5399
5400               /* Go through all the uses of s2 to find double reduction phi
5401                  node, i.e., stmt1 above.  */
5402               orig_name = PHI_RESULT (exit_phi);
5403               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5404                 {
5405                   stmt_vec_info use_stmt_vinfo;
5406                   stmt_vec_info new_phi_vinfo;
5407                   tree vect_phi_init, preheader_arg, vect_phi_res;
5408                   basic_block bb = gimple_bb (use_stmt);
5409                   gimple *use;
5410
5411                   /* Check that USE_STMT is really double reduction phi
5412                      node.  */
5413                   if (gimple_code (use_stmt) != GIMPLE_PHI
5414                       || gimple_phi_num_args (use_stmt) != 2
5415                       || bb->loop_father != outer_loop)
5416                     continue;
5417                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5418                   if (!use_stmt_vinfo
5419                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5420                           != vect_double_reduction_def)
5421                     continue;
5422
5423                   /* Create vector phi node for double reduction:
5424                      vs1 = phi <vs0, vs2>
5425                      vs1 was created previously in this function by a call to
5426                        vect_get_vec_def_for_operand and is stored in
5427                        vec_initial_def;
5428                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5429                      vs0 is created here.  */
5430
5431                   /* Create vector phi node.  */
5432                   vect_phi = create_phi_node (vec_initial_def, bb);
5433                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5434                                     loop_vec_info_for_loop (outer_loop));
5435                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5436
5437                   /* Create vs0 - initial def of the double reduction phi.  */
5438                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5439                                              loop_preheader_edge (outer_loop));
5440                   vect_phi_init = get_initial_def_for_reduction
5441                     (stmt, preheader_arg, NULL);
5442
5443                   /* Update phi node arguments with vs0 and vs2.  */
5444                   add_phi_arg (vect_phi, vect_phi_init,
5445                                loop_preheader_edge (outer_loop),
5446                                UNKNOWN_LOCATION);
5447                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5448                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5449                   if (dump_enabled_p ())
5450                     {
5451                       dump_printf_loc (MSG_NOTE, vect_location,
5452                                        "created double reduction phi node: ");
5453                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5454                     }
5455
5456                   vect_phi_res = PHI_RESULT (vect_phi);
5457
5458                   /* Replace the use, i.e., set the correct vs1 in the regular
5459                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5460                      loop is redundant.  */
5461                   use = reduction_phi;
5462                   for (j = 0; j < ncopies; j++)
5463                     {
5464                       edge pr_edge = loop_preheader_edge (loop);
5465                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5466                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5467                     }
5468                 }
5469             }
5470         }
5471
5472       phis.release ();
5473       if (nested_in_vect_loop)
5474         {
5475           if (double_reduc)
5476             loop = outer_loop;
5477           else
5478             continue;
5479         }
5480
5481       phis.create (3);
5482       /* Find the loop-closed-use at the loop exit of the original scalar
5483          result.  (The reduction result is expected to have two immediate uses,
5484          one at the latch block, and one at the loop exit).  For double
5485          reductions we are looking for exit phis of the outer loop.  */
5486       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5487         {
5488           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5489             {
5490               if (!is_gimple_debug (USE_STMT (use_p)))
5491                 phis.safe_push (USE_STMT (use_p));
5492             }
5493           else
5494             {
5495               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5496                 {
5497                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5498
5499                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5500                     {
5501                       if (!flow_bb_inside_loop_p (loop,
5502                                              gimple_bb (USE_STMT (phi_use_p)))
5503                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5504                         phis.safe_push (USE_STMT (phi_use_p));
5505                     }
5506                 }
5507             }
5508         }
5509
5510       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5511         {
5512           /* Replace the uses:  */
5513           orig_name = PHI_RESULT (exit_phi);
5514           scalar_result = scalar_results[k];
5515           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5516             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5517               SET_USE (use_p, scalar_result);
5518         }
5519
5520       phis.release ();
5521     }
5522 }
5523
5524
5525 /* Function is_nonwrapping_integer_induction.
5526
5527    Check if STMT (which is part of loop LOOP) both increments and
5528    does not cause overflow.  */
5529
5530 static bool
5531 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5532 {
5533   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5534   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5535   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5536   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5537   widest_int ni, max_loop_value, lhs_max;
5538   bool overflow = false;
5539
5540   /* Make sure the loop is integer based.  */
5541   if (TREE_CODE (base) != INTEGER_CST
5542       || TREE_CODE (step) != INTEGER_CST)
5543     return false;
5544
5545   /* Check that the max size of the loop will not wrap.  */
5546
5547   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5548     return true;
5549
5550   if (! max_stmt_executions (loop, &ni))
5551     return false;
5552
5553   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5554                             &overflow);
5555   if (overflow)
5556     return false;
5557
5558   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5559                             TYPE_SIGN (lhs_type), &overflow);
5560   if (overflow)
5561     return false;
5562
5563   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5564           <= TYPE_PRECISION (lhs_type));
5565 }
5566
5567 /* Function vectorizable_reduction.
5568
5569    Check if STMT performs a reduction operation that can be vectorized.
5570    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5571    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5572    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5573
5574    This function also handles reduction idioms (patterns) that have been
5575    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5576    of this form:
5577      X = pattern_expr (arg0, arg1, ..., X)
5578    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
5579    sequence that had been detected and replaced by the pattern-stmt (STMT).
5580
5581    This function also handles reduction of condition expressions, for example:
5582      for (int i = 0; i < N; i++)
5583        if (a[i] < value)
5584          last = a[i];
5585    This is handled by vectorising the loop and creating an additional vector
5586    containing the loop indexes for which "a[i] < value" was true.  In the
5587    function epilogue this is reduced to a single max value and then used to
5588    index into the vector of results.
5589
5590    In some cases of reduction patterns, the type of the reduction variable X is
5591    different than the type of the other arguments of STMT.
5592    In such cases, the vectype that is used when transforming STMT into a vector
5593    stmt is different than the vectype that is used to determine the
5594    vectorization factor, because it consists of a different number of elements
5595    than the actual number of elements that are being operated upon in parallel.
5596
5597    For example, consider an accumulation of shorts into an int accumulator.
5598    On some targets it's possible to vectorize this pattern operating on 8
5599    shorts at a time (hence, the vectype for purposes of determining the
5600    vectorization factor should be V8HI); on the other hand, the vectype that
5601    is used to create the vector form is actually V4SI (the type of the result).
5602
5603    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5604    indicates what is the actual level of parallelism (V8HI in the example), so
5605    that the right vectorization factor would be derived.  This vectype
5606    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5607    be used to create the vectorized stmt.  The right vectype for the vectorized
5608    stmt is obtained from the type of the result X:
5609         get_vectype_for_scalar_type (TREE_TYPE (X))
5610
5611    This means that, contrary to "regular" reductions (or "regular" stmts in
5612    general), the following equation:
5613       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5614    does *NOT* necessarily hold for reduction patterns.  */
5615
5616 bool
5617 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5618                         gimple **vec_stmt, slp_tree slp_node,
5619                         slp_instance slp_node_instance)
5620 {
5621   tree vec_dest;
5622   tree scalar_dest;
5623   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5624   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5625   tree vectype_in = NULL_TREE;
5626   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5627   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5628   enum tree_code code, orig_code;
5629   internal_fn reduc_fn;
5630   machine_mode vec_mode;
5631   int op_type;
5632   optab optab;
5633   tree new_temp = NULL_TREE;
5634   gimple *def_stmt;
5635   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5636   gimple *cond_reduc_def_stmt = NULL;
5637   enum tree_code cond_reduc_op_code = ERROR_MARK;
5638   tree scalar_type;
5639   bool is_simple_use;
5640   gimple *orig_stmt;
5641   stmt_vec_info orig_stmt_info = NULL;
5642   int i;
5643   int ncopies;
5644   int epilog_copies;
5645   stmt_vec_info prev_stmt_info, prev_phi_info;
5646   bool single_defuse_cycle = false;
5647   gimple *new_stmt = NULL;
5648   int j;
5649   tree ops[3];
5650   enum vect_def_type dts[3];
5651   bool nested_cycle = false, found_nested_cycle_def = false;
5652   bool double_reduc = false;
5653   basic_block def_bb;
5654   struct loop * def_stmt_loop, *outer_loop = NULL;
5655   tree def_arg;
5656   gimple *def_arg_stmt;
5657   auto_vec<tree> vec_oprnds0;
5658   auto_vec<tree> vec_oprnds1;
5659   auto_vec<tree> vec_oprnds2;
5660   auto_vec<tree> vect_defs;
5661   auto_vec<gimple *> phis;
5662   int vec_num;
5663   tree def0, tem;
5664   bool first_p = true;
5665   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5666   tree cond_reduc_val = NULL_TREE;
5667
5668   /* Make sure it was already recognized as a reduction computation.  */
5669   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5670       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5671     return false;
5672
5673   if (nested_in_vect_loop_p (loop, stmt))
5674     {
5675       outer_loop = loop;
5676       loop = loop->inner;
5677       nested_cycle = true;
5678     }
5679
5680   /* In case of reduction chain we switch to the first stmt in the chain, but
5681      we don't update STMT_INFO, since only the last stmt is marked as reduction
5682      and has reduction properties.  */
5683   if (GROUP_FIRST_ELEMENT (stmt_info)
5684       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5685     {
5686       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5687       first_p = false;
5688     }
5689
5690   if (gimple_code (stmt) == GIMPLE_PHI)
5691     {
5692       /* Analysis is fully done on the reduction stmt invocation.  */
5693       if (! vec_stmt)
5694         {
5695           if (slp_node)
5696             slp_node_instance->reduc_phis = slp_node;
5697
5698           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5699           return true;
5700         }
5701
5702       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5703       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5704         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5705
5706       gcc_assert (is_gimple_assign (reduc_stmt));
5707       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5708         {
5709           tree op = gimple_op (reduc_stmt, k);
5710           if (op == gimple_phi_result (stmt))
5711             continue;
5712           if (k == 1
5713               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5714             continue;
5715           tem = get_vectype_for_scalar_type (TREE_TYPE (op));
5716           if (! vectype_in
5717               || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
5718             vectype_in = tem;
5719           break;
5720         }
5721       gcc_assert (vectype_in);
5722
5723       if (slp_node)
5724         ncopies = 1;
5725       else
5726         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5727
5728       use_operand_p use_p;
5729       gimple *use_stmt;
5730       if (ncopies > 1
5731           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5732               <= vect_used_only_live)
5733           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5734           && (use_stmt == reduc_stmt
5735               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5736                   == reduc_stmt)))
5737         single_defuse_cycle = true;
5738
5739       /* Create the destination vector  */
5740       scalar_dest = gimple_assign_lhs (reduc_stmt);
5741       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5742
5743       if (slp_node)
5744         /* The size vect_schedule_slp_instance computes is off for us.  */
5745         vec_num = ((LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5746                     * SLP_TREE_SCALAR_STMTS (slp_node).length ())
5747                    / TYPE_VECTOR_SUBPARTS (vectype_in));
5748       else
5749         vec_num = 1;
5750
5751       /* Generate the reduction PHIs upfront.  */
5752       prev_phi_info = NULL;
5753       for (j = 0; j < ncopies; j++)
5754         {
5755           if (j == 0 || !single_defuse_cycle)
5756             {
5757               for (i = 0; i < vec_num; i++)
5758                 {
5759                   /* Create the reduction-phi that defines the reduction
5760                      operand.  */
5761                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
5762                   set_vinfo_for_stmt (new_phi,
5763                                       new_stmt_vec_info (new_phi, loop_vinfo));
5764
5765                   if (slp_node)
5766                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5767                   else
5768                     {
5769                       if (j == 0)
5770                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5771                       else
5772                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5773                       prev_phi_info = vinfo_for_stmt (new_phi);
5774                     }
5775                 }
5776             }
5777         }
5778
5779       return true;
5780     }
5781
5782   /* 1. Is vectorizable reduction?  */
5783   /* Not supportable if the reduction variable is used in the loop, unless
5784      it's a reduction chain.  */
5785   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5786       && !GROUP_FIRST_ELEMENT (stmt_info))
5787     return false;
5788
5789   /* Reductions that are not used even in an enclosing outer-loop,
5790      are expected to be "live" (used out of the loop).  */
5791   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5792       && !STMT_VINFO_LIVE_P (stmt_info))
5793     return false;
5794
5795   /* 2. Has this been recognized as a reduction pattern?
5796
5797      Check if STMT represents a pattern that has been recognized
5798      in earlier analysis stages.  For stmts that represent a pattern,
5799      the STMT_VINFO_RELATED_STMT field records the last stmt in
5800      the original sequence that constitutes the pattern.  */
5801
5802   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5803   if (orig_stmt)
5804     {
5805       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5806       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5807       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5808     }
5809
5810   /* 3. Check the operands of the operation.  The first operands are defined
5811         inside the loop body. The last operand is the reduction variable,
5812         which is defined by the loop-header-phi.  */
5813
5814   gcc_assert (is_gimple_assign (stmt));
5815
5816   /* Flatten RHS.  */
5817   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5818     {
5819     case GIMPLE_BINARY_RHS:
5820       code = gimple_assign_rhs_code (stmt);
5821       op_type = TREE_CODE_LENGTH (code);
5822       gcc_assert (op_type == binary_op);
5823       ops[0] = gimple_assign_rhs1 (stmt);
5824       ops[1] = gimple_assign_rhs2 (stmt);
5825       break;
5826
5827     case GIMPLE_TERNARY_RHS:
5828       code = gimple_assign_rhs_code (stmt);
5829       op_type = TREE_CODE_LENGTH (code);
5830       gcc_assert (op_type == ternary_op);
5831       ops[0] = gimple_assign_rhs1 (stmt);
5832       ops[1] = gimple_assign_rhs2 (stmt);
5833       ops[2] = gimple_assign_rhs3 (stmt);
5834       break;
5835
5836     case GIMPLE_UNARY_RHS:
5837       return false;
5838
5839     default:
5840       gcc_unreachable ();
5841     }
5842
5843   if (code == COND_EXPR && slp_node)
5844     return false;
5845
5846   scalar_dest = gimple_assign_lhs (stmt);
5847   scalar_type = TREE_TYPE (scalar_dest);
5848   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5849       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5850     return false;
5851
5852   /* Do not try to vectorize bit-precision reductions.  */
5853   if (!type_has_mode_precision_p (scalar_type))
5854     return false;
5855
5856   /* All uses but the last are expected to be defined in the loop.
5857      The last use is the reduction variable.  In case of nested cycle this
5858      assumption is not true: we use reduc_index to record the index of the
5859      reduction variable.  */
5860   gimple *reduc_def_stmt = NULL;
5861   int reduc_index = -1;
5862   for (i = 0; i < op_type; i++)
5863     {
5864       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5865       if (i == 0 && code == COND_EXPR)
5866         continue;
5867
5868       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5869                                           &def_stmt, &dts[i], &tem);
5870       dt = dts[i];
5871       gcc_assert (is_simple_use);
5872       if (dt == vect_reduction_def)
5873         {
5874           reduc_def_stmt = def_stmt;
5875           reduc_index = i;
5876           continue;
5877         }
5878       else if (tem)
5879         {
5880           /* To properly compute ncopies we are interested in the widest
5881              input type in case we're looking at a widening accumulation.  */
5882           if (!vectype_in
5883               || TYPE_VECTOR_SUBPARTS (vectype_in) > TYPE_VECTOR_SUBPARTS (tem))
5884             vectype_in = tem;
5885         }
5886
5887       if (dt != vect_internal_def
5888           && dt != vect_external_def
5889           && dt != vect_constant_def
5890           && dt != vect_induction_def
5891           && !(dt == vect_nested_cycle && nested_cycle))
5892         return false;
5893
5894       if (dt == vect_nested_cycle)
5895         {
5896           found_nested_cycle_def = true;
5897           reduc_def_stmt = def_stmt;
5898           reduc_index = i;
5899         }
5900
5901       if (i == 1 && code == COND_EXPR)
5902         {
5903           /* Record how value of COND_EXPR is defined.  */
5904           if (dt == vect_constant_def)
5905             {
5906               cond_reduc_dt = dt;
5907               cond_reduc_val = ops[i];
5908             }
5909           if (dt == vect_induction_def
5910               && def_stmt != NULL
5911               && is_nonwrapping_integer_induction (def_stmt, loop))
5912             {
5913               cond_reduc_dt = dt;
5914               cond_reduc_def_stmt = def_stmt;
5915             }
5916         }
5917     }
5918
5919   if (!vectype_in)
5920     vectype_in = vectype_out;
5921
5922   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5923      directy used in stmt.  */
5924   if (reduc_index == -1)
5925     {
5926       if (orig_stmt)
5927         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5928       else
5929         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5930     }
5931
5932   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5933     return false;
5934
5935   if (!(reduc_index == -1
5936         || dts[reduc_index] == vect_reduction_def
5937         || dts[reduc_index] == vect_nested_cycle
5938         || ((dts[reduc_index] == vect_internal_def
5939              || dts[reduc_index] == vect_external_def
5940              || dts[reduc_index] == vect_constant_def
5941              || dts[reduc_index] == vect_induction_def)
5942             && nested_cycle && found_nested_cycle_def)))
5943     {
5944       /* For pattern recognized stmts, orig_stmt might be a reduction,
5945          but some helper statements for the pattern might not, or
5946          might be COND_EXPRs with reduction uses in the condition.  */
5947       gcc_assert (orig_stmt);
5948       return false;
5949     }
5950
5951   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5952   enum vect_reduction_type v_reduc_type
5953     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5954   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5955
5956   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5957   /* If we have a condition reduction, see if we can simplify it further.  */
5958   if (v_reduc_type == COND_REDUCTION)
5959     {
5960       if (cond_reduc_dt == vect_induction_def)
5961         {
5962           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
5963           tree base
5964             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
5965           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
5966
5967           gcc_assert (TREE_CODE (base) == INTEGER_CST
5968                       && TREE_CODE (step) == INTEGER_CST);
5969           cond_reduc_val = NULL_TREE;
5970           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
5971              above base; punt if base is the minimum value of the type for
5972              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
5973           if (tree_int_cst_sgn (step) == -1)
5974             {
5975               cond_reduc_op_code = MIN_EXPR;
5976               if (tree_int_cst_sgn (base) == -1)
5977                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5978               else if (tree_int_cst_lt (base,
5979                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
5980                 cond_reduc_val
5981                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
5982             }
5983           else
5984             {
5985               cond_reduc_op_code = MAX_EXPR;
5986               if (tree_int_cst_sgn (base) == 1)
5987                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
5988               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
5989                                         base))
5990                 cond_reduc_val
5991                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
5992             }
5993           if (cond_reduc_val)
5994             {
5995               if (dump_enabled_p ())
5996                 dump_printf_loc (MSG_NOTE, vect_location,
5997                                  "condition expression based on "
5998                                  "integer induction.\n");
5999               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6000                 = INTEGER_INDUC_COND_REDUCTION;
6001             }
6002         }
6003
6004       /* Loop peeling modifies initial value of reduction PHI, which
6005          makes the reduction stmt to be transformed different to the
6006          original stmt analyzed.  We need to record reduction code for
6007          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6008          it can be used directly at transform stage.  */
6009       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6010           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6011         {
6012           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6013           gcc_assert (cond_reduc_dt == vect_constant_def);
6014           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6015         }
6016       else if (cond_reduc_dt == vect_constant_def)
6017         {
6018           enum vect_def_type cond_initial_dt;
6019           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6020           tree cond_initial_val
6021             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6022
6023           gcc_assert (cond_reduc_val != NULL_TREE);
6024           vect_is_simple_use (cond_initial_val, loop_vinfo,
6025                               &def_stmt, &cond_initial_dt);
6026           if (cond_initial_dt == vect_constant_def
6027               && types_compatible_p (TREE_TYPE (cond_initial_val),
6028                                      TREE_TYPE (cond_reduc_val)))
6029             {
6030               tree e = fold_binary (LE_EXPR, boolean_type_node,
6031                                     cond_initial_val, cond_reduc_val);
6032               if (e && (integer_onep (e) || integer_zerop (e)))
6033                 {
6034                   if (dump_enabled_p ())
6035                     dump_printf_loc (MSG_NOTE, vect_location,
6036                                      "condition expression based on "
6037                                      "compile time constant.\n");
6038                   /* Record reduction code at analysis stage.  */
6039                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6040                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6041                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6042                     = CONST_COND_REDUCTION;
6043                 }
6044             }
6045         }
6046     }
6047
6048   if (orig_stmt)
6049     gcc_assert (tmp == orig_stmt
6050                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6051   else
6052     /* We changed STMT to be the first stmt in reduction chain, hence we
6053        check that in this case the first element in the chain is STMT.  */
6054     gcc_assert (stmt == tmp
6055                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6056
6057   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6058     return false;
6059
6060   if (slp_node)
6061     ncopies = 1;
6062   else
6063     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6064
6065   gcc_assert (ncopies >= 1);
6066
6067   vec_mode = TYPE_MODE (vectype_in);
6068
6069   if (code == COND_EXPR)
6070     {
6071       /* Only call during the analysis stage, otherwise we'll lose
6072          STMT_VINFO_TYPE.  */
6073       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6074                                                 ops[reduc_index], 0, NULL))
6075         {
6076           if (dump_enabled_p ())
6077             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6078                              "unsupported condition in reduction\n");
6079           return false;
6080         }
6081     }
6082   else
6083     {
6084       /* 4. Supportable by target?  */
6085
6086       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6087           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6088         {
6089           /* Shifts and rotates are only supported by vectorizable_shifts,
6090              not vectorizable_reduction.  */
6091           if (dump_enabled_p ())
6092             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6093                              "unsupported shift or rotation.\n");
6094           return false;
6095         }
6096
6097       /* 4.1. check support for the operation in the loop  */
6098       optab = optab_for_tree_code (code, vectype_in, optab_default);
6099       if (!optab)
6100         {
6101           if (dump_enabled_p ())
6102             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6103                              "no optab.\n");
6104
6105           return false;
6106         }
6107
6108       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6109         {
6110           if (dump_enabled_p ())
6111             dump_printf (MSG_NOTE, "op not supported by target.\n");
6112
6113           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6114               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6115             return false;
6116
6117           if (dump_enabled_p ())
6118             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6119         }
6120
6121       /* Worthwhile without SIMD support?  */
6122       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6123           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6124         {
6125           if (dump_enabled_p ())
6126             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6127                              "not worthwhile without SIMD support.\n");
6128
6129           return false;
6130         }
6131     }
6132
6133   /* 4.2. Check support for the epilog operation.
6134
6135           If STMT represents a reduction pattern, then the type of the
6136           reduction variable may be different than the type of the rest
6137           of the arguments.  For example, consider the case of accumulation
6138           of shorts into an int accumulator; The original code:
6139                         S1: int_a = (int) short_a;
6140           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6141
6142           was replaced with:
6143                         STMT: int_acc = widen_sum <short_a, int_acc>
6144
6145           This means that:
6146           1. The tree-code that is used to create the vector operation in the
6147              epilog code (that reduces the partial results) is not the
6148              tree-code of STMT, but is rather the tree-code of the original
6149              stmt from the pattern that STMT is replacing.  I.e, in the example
6150              above we want to use 'widen_sum' in the loop, but 'plus' in the
6151              epilog.
6152           2. The type (mode) we use to check available target support
6153              for the vector operation to be created in the *epilog*, is
6154              determined by the type of the reduction variable (in the example
6155              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6156              However the type (mode) we use to check available target support
6157              for the vector operation to be created *inside the loop*, is
6158              determined by the type of the other arguments to STMT (in the
6159              example we'd check this: optab_handler (widen_sum_optab,
6160              vect_short_mode)).
6161
6162           This is contrary to "regular" reductions, in which the types of all
6163           the arguments are the same as the type of the reduction variable.
6164           For "regular" reductions we can therefore use the same vector type
6165           (and also the same tree-code) when generating the epilog code and
6166           when generating the code inside the loop.  */
6167
6168   if (orig_stmt)
6169     {
6170       /* This is a reduction pattern: get the vectype from the type of the
6171          reduction variable, and get the tree-code from orig_stmt.  */
6172       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6173                   == TREE_CODE_REDUCTION);
6174       orig_code = gimple_assign_rhs_code (orig_stmt);
6175       gcc_assert (vectype_out);
6176       vec_mode = TYPE_MODE (vectype_out);
6177     }
6178   else
6179     {
6180       /* Regular reduction: use the same vectype and tree-code as used for
6181          the vector code inside the loop can be used for the epilog code. */
6182       orig_code = code;
6183
6184       if (code == MINUS_EXPR)
6185         orig_code = PLUS_EXPR;
6186
6187       /* For simple condition reductions, replace with the actual expression
6188          we want to base our reduction around.  */
6189       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6190         {
6191           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6192           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6193         }
6194       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6195                == INTEGER_INDUC_COND_REDUCTION)
6196         orig_code = cond_reduc_op_code;
6197     }
6198
6199   if (nested_cycle)
6200     {
6201       def_bb = gimple_bb (reduc_def_stmt);
6202       def_stmt_loop = def_bb->loop_father;
6203       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6204                                        loop_preheader_edge (def_stmt_loop));
6205       if (TREE_CODE (def_arg) == SSA_NAME
6206           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6207           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6208           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6209           && vinfo_for_stmt (def_arg_stmt)
6210           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6211               == vect_double_reduction_def)
6212         double_reduc = true;
6213     }
6214
6215   reduc_fn = IFN_LAST;
6216
6217   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6218     {
6219       if (reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6220         {
6221           if (reduc_fn != IFN_LAST
6222               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6223                                                   OPTIMIZE_FOR_SPEED))
6224             {
6225               if (dump_enabled_p ())
6226                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6227                                  "reduc op not supported by target.\n");
6228
6229               reduc_fn = IFN_LAST;
6230             }
6231         }
6232       else
6233         {
6234           if (!nested_cycle || double_reduc)
6235             {
6236               if (dump_enabled_p ())
6237                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6238                                  "no reduc code for scalar code.\n");
6239
6240               return false;
6241             }
6242         }
6243     }
6244   else
6245     {
6246       int scalar_precision
6247         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6248       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6249       cr_index_vector_type = build_vector_type
6250         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
6251
6252       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6253                                           OPTIMIZE_FOR_SPEED))
6254         reduc_fn = IFN_REDUC_MAX;
6255     }
6256
6257   if ((double_reduc
6258        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6259       && ncopies > 1)
6260     {
6261       if (dump_enabled_p ())
6262         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6263                          "multiple types in double reduction or condition "
6264                          "reduction.\n");
6265       return false;
6266     }
6267
6268   /* In case of widenning multiplication by a constant, we update the type
6269      of the constant to be the type of the other operand.  We check that the
6270      constant fits the type in the pattern recognition pass.  */
6271   if (code == DOT_PROD_EXPR
6272       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6273     {
6274       if (TREE_CODE (ops[0]) == INTEGER_CST)
6275         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6276       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6277         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6278       else
6279         {
6280           if (dump_enabled_p ())
6281             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6282                              "invalid types in dot-prod\n");
6283
6284           return false;
6285         }
6286     }
6287
6288   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6289     {
6290       widest_int ni;
6291
6292       if (! max_loop_iterations (loop, &ni))
6293         {
6294           if (dump_enabled_p ())
6295             dump_printf_loc (MSG_NOTE, vect_location,
6296                              "loop count not known, cannot create cond "
6297                              "reduction.\n");
6298           return false;
6299         }
6300       /* Convert backedges to iterations.  */
6301       ni += 1;
6302
6303       /* The additional index will be the same type as the condition.  Check
6304          that the loop can fit into this less one (because we'll use up the
6305          zero slot for when there are no matches).  */
6306       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6307       if (wi::geu_p (ni, wi::to_widest (max_index)))
6308         {
6309           if (dump_enabled_p ())
6310             dump_printf_loc (MSG_NOTE, vect_location,
6311                              "loop size is greater than data size.\n");
6312           return false;
6313         }
6314     }
6315
6316   /* In case the vectorization factor (VF) is bigger than the number
6317      of elements that we can fit in a vectype (nunits), we have to generate
6318      more than one vector stmt - i.e - we need to "unroll" the
6319      vector stmt by a factor VF/nunits.  For more details see documentation
6320      in vectorizable_operation.  */
6321
6322   /* If the reduction is used in an outer loop we need to generate
6323      VF intermediate results, like so (e.g. for ncopies=2):
6324         r0 = phi (init, r0)
6325         r1 = phi (init, r1)
6326         r0 = x0 + r0;
6327         r1 = x1 + r1;
6328     (i.e. we generate VF results in 2 registers).
6329     In this case we have a separate def-use cycle for each copy, and therefore
6330     for each copy we get the vector def for the reduction variable from the
6331     respective phi node created for this copy.
6332
6333     Otherwise (the reduction is unused in the loop nest), we can combine
6334     together intermediate results, like so (e.g. for ncopies=2):
6335         r = phi (init, r)
6336         r = x0 + r;
6337         r = x1 + r;
6338    (i.e. we generate VF/2 results in a single register).
6339    In this case for each copy we get the vector def for the reduction variable
6340    from the vectorized reduction operation generated in the previous iteration.
6341
6342    This only works when we see both the reduction PHI and its only consumer
6343    in vectorizable_reduction and there are no intermediate stmts
6344    participating.  */
6345   use_operand_p use_p;
6346   gimple *use_stmt;
6347   if (ncopies > 1
6348       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6349       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6350       && (use_stmt == stmt
6351           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6352     {
6353       single_defuse_cycle = true;
6354       epilog_copies = 1;
6355     }
6356   else
6357     epilog_copies = ncopies;
6358
6359   /* If the reduction stmt is one of the patterns that have lane
6360      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6361   if ((ncopies > 1
6362        && ! single_defuse_cycle)
6363       && (code == DOT_PROD_EXPR
6364           || code == WIDEN_SUM_EXPR
6365           || code == SAD_EXPR))
6366     {
6367       if (dump_enabled_p ())
6368         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6369                          "multi def-use cycle not possible for lane-reducing "
6370                          "reduction operation\n");
6371       return false;
6372     }
6373
6374   if (!vec_stmt) /* transformation not required.  */
6375     {
6376       if (first_p)
6377         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
6378       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6379       return true;
6380     }
6381
6382   /* Transform.  */
6383
6384   if (dump_enabled_p ())
6385     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6386
6387   /* FORNOW: Multiple types are not supported for condition.  */
6388   if (code == COND_EXPR)
6389     gcc_assert (ncopies == 1);
6390
6391   /* Create the destination vector  */
6392   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6393
6394   prev_stmt_info = NULL;
6395   prev_phi_info = NULL;
6396   if (slp_node)
6397     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6398   else
6399     {
6400       vec_num = 1;
6401       vec_oprnds0.create (1);
6402       vec_oprnds1.create (1);
6403       if (op_type == ternary_op)
6404         vec_oprnds2.create (1);
6405     }
6406
6407   phis.create (vec_num);
6408   vect_defs.create (vec_num);
6409   if (!slp_node)
6410     vect_defs.quick_push (NULL_TREE);
6411
6412   if (slp_node)
6413     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6414   else
6415     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6416
6417   for (j = 0; j < ncopies; j++)
6418     {
6419       if (code == COND_EXPR)
6420         {
6421           gcc_assert (!slp_node);
6422           vectorizable_condition (stmt, gsi, vec_stmt,
6423                                   PHI_RESULT (phis[0]),
6424                                   reduc_index, NULL);
6425           /* Multiple types are not supported for condition.  */
6426           break;
6427         }
6428
6429       /* Handle uses.  */
6430       if (j == 0)
6431         {
6432           if (slp_node)
6433             {
6434               /* Get vec defs for all the operands except the reduction index,
6435                  ensuring the ordering of the ops in the vector is kept.  */
6436               auto_vec<tree, 3> slp_ops;
6437               auto_vec<vec<tree>, 3> vec_defs;
6438
6439               slp_ops.quick_push (ops[0]);
6440               slp_ops.quick_push (ops[1]);
6441               if (op_type == ternary_op)
6442                 slp_ops.quick_push (ops[2]);
6443
6444               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6445
6446               vec_oprnds0.safe_splice (vec_defs[0]);
6447               vec_defs[0].release ();
6448               vec_oprnds1.safe_splice (vec_defs[1]);
6449               vec_defs[1].release ();
6450               if (op_type == ternary_op)
6451                 {
6452                   vec_oprnds2.safe_splice (vec_defs[2]);
6453                   vec_defs[2].release ();
6454                 }
6455             }
6456           else
6457             {
6458               vec_oprnds0.quick_push
6459                 (vect_get_vec_def_for_operand (ops[0], stmt));
6460               vec_oprnds1.quick_push
6461                 (vect_get_vec_def_for_operand (ops[1], stmt));
6462               if (op_type == ternary_op)
6463                 vec_oprnds2.quick_push
6464                   (vect_get_vec_def_for_operand (ops[2], stmt));
6465             }
6466         }
6467       else
6468         {
6469           if (!slp_node)
6470             {
6471               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6472
6473               if (single_defuse_cycle && reduc_index == 0)
6474                 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6475               else
6476                 vec_oprnds0[0]
6477                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6478               if (single_defuse_cycle && reduc_index == 1)
6479                 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6480               else
6481                 vec_oprnds1[0]
6482                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6483               if (op_type == ternary_op)
6484                 {
6485                   if (single_defuse_cycle && reduc_index == 2)
6486                     vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6487                   else
6488                     vec_oprnds2[0]
6489                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6490                 }
6491             }
6492         }
6493
6494       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6495         {
6496           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6497           if (op_type == ternary_op)
6498             vop[2] = vec_oprnds2[i];
6499
6500           new_temp = make_ssa_name (vec_dest, new_stmt);
6501           new_stmt = gimple_build_assign (new_temp, code,
6502                                           vop[0], vop[1], vop[2]);
6503           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6504
6505           if (slp_node)
6506             {
6507               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6508               vect_defs.quick_push (new_temp);
6509             }
6510           else
6511             vect_defs[0] = new_temp;
6512         }
6513
6514       if (slp_node)
6515         continue;
6516
6517       if (j == 0)
6518         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6519       else
6520         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6521
6522       prev_stmt_info = vinfo_for_stmt (new_stmt);
6523     }
6524
6525   /* Finalize the reduction-phi (set its arguments) and create the
6526      epilog reduction code.  */
6527   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6528     vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6529
6530   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6531                                     epilog_copies, reduc_fn, phis,
6532                                     double_reduc, slp_node, slp_node_instance,
6533                                     cond_reduc_val, cond_reduc_op_code);
6534
6535   return true;
6536 }
6537
6538 /* Function vect_min_worthwhile_factor.
6539
6540    For a loop where we could vectorize the operation indicated by CODE,
6541    return the minimum vectorization factor that makes it worthwhile
6542    to use generic vectors.  */
6543 int
6544 vect_min_worthwhile_factor (enum tree_code code)
6545 {
6546   switch (code)
6547     {
6548     case PLUS_EXPR:
6549     case MINUS_EXPR:
6550     case NEGATE_EXPR:
6551       return 4;
6552
6553     case BIT_AND_EXPR:
6554     case BIT_IOR_EXPR:
6555     case BIT_XOR_EXPR:
6556     case BIT_NOT_EXPR:
6557       return 2;
6558
6559     default:
6560       return INT_MAX;
6561     }
6562 }
6563
6564 /* Return true if VINFO indicates we are doing loop vectorization and if
6565    it is worth decomposing CODE operations into scalar operations for
6566    that loop's vectorization factor.  */
6567
6568 bool
6569 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6570 {
6571   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6572   return (loop_vinfo
6573           && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6574               >= vect_min_worthwhile_factor (code)));
6575 }
6576
6577 /* Function vectorizable_induction
6578
6579    Check if PHI performs an induction computation that can be vectorized.
6580    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6581    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6582    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6583
6584 bool
6585 vectorizable_induction (gimple *phi,
6586                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6587                         gimple **vec_stmt, slp_tree slp_node)
6588 {
6589   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6590   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6591   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6592   unsigned ncopies;
6593   bool nested_in_vect_loop = false;
6594   struct loop *iv_loop;
6595   tree vec_def;
6596   edge pe = loop_preheader_edge (loop);
6597   basic_block new_bb;
6598   tree new_vec, vec_init, vec_step, t;
6599   tree new_name;
6600   gimple *new_stmt;
6601   gphi *induction_phi;
6602   tree induc_def, vec_dest;
6603   tree init_expr, step_expr;
6604   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6605   unsigned i;
6606   tree expr;
6607   gimple_seq stmts;
6608   imm_use_iterator imm_iter;
6609   use_operand_p use_p;
6610   gimple *exit_phi;
6611   edge latch_e;
6612   tree loop_arg;
6613   gimple_stmt_iterator si;
6614   basic_block bb = gimple_bb (phi);
6615
6616   if (gimple_code (phi) != GIMPLE_PHI)
6617     return false;
6618
6619   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6620     return false;
6621
6622   /* Make sure it was recognized as induction computation.  */
6623   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6624     return false;
6625
6626   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6627   unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6628
6629   if (slp_node)
6630     ncopies = 1;
6631   else
6632     ncopies = vect_get_num_copies (loop_vinfo, vectype);
6633   gcc_assert (ncopies >= 1);
6634
6635   /* FORNOW. These restrictions should be relaxed.  */
6636   if (nested_in_vect_loop_p (loop, phi))
6637     {
6638       imm_use_iterator imm_iter;
6639       use_operand_p use_p;
6640       gimple *exit_phi;
6641       edge latch_e;
6642       tree loop_arg;
6643
6644       if (ncopies > 1)
6645         {
6646           if (dump_enabled_p ())
6647             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6648                              "multiple types in nested loop.\n");
6649           return false;
6650         }
6651
6652       /* FORNOW: outer loop induction with SLP not supported.  */
6653       if (STMT_SLP_TYPE (stmt_info))
6654         return false;
6655
6656       exit_phi = NULL;
6657       latch_e = loop_latch_edge (loop->inner);
6658       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6659       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6660         {
6661           gimple *use_stmt = USE_STMT (use_p);
6662           if (is_gimple_debug (use_stmt))
6663             continue;
6664
6665           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6666             {
6667               exit_phi = use_stmt;
6668               break;
6669             }
6670         }
6671       if (exit_phi)
6672         {
6673           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6674           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6675                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6676             {
6677               if (dump_enabled_p ())
6678                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6679                                  "inner-loop induction only used outside "
6680                                  "of the outer vectorized loop.\n");
6681               return false;
6682             }
6683         }
6684
6685       nested_in_vect_loop = true;
6686       iv_loop = loop->inner;
6687     }
6688   else
6689     iv_loop = loop;
6690   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6691
6692   if (!vec_stmt) /* transformation not required.  */
6693     {
6694       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6695       if (dump_enabled_p ())
6696         dump_printf_loc (MSG_NOTE, vect_location,
6697                          "=== vectorizable_induction ===\n");
6698       vect_model_induction_cost (stmt_info, ncopies);
6699       return true;
6700     }
6701
6702   /* Transform.  */
6703
6704   /* Compute a vector variable, initialized with the first VF values of
6705      the induction variable.  E.g., for an iv with IV_PHI='X' and
6706      evolution S, for a vector of 4 units, we want to compute:
6707      [X, X + S, X + 2*S, X + 3*S].  */
6708
6709   if (dump_enabled_p ())
6710     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6711
6712   latch_e = loop_latch_edge (iv_loop);
6713   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6714
6715   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6716   gcc_assert (step_expr != NULL_TREE);
6717
6718   pe = loop_preheader_edge (iv_loop);
6719   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6720                                      loop_preheader_edge (iv_loop));
6721
6722   /* Convert the step to the desired type.  */
6723   stmts = NULL;
6724   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6725   if (stmts)
6726     {
6727       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6728       gcc_assert (!new_bb);
6729     }
6730
6731   /* Find the first insertion point in the BB.  */
6732   si = gsi_after_labels (bb);
6733
6734   /* For SLP induction we have to generate several IVs as for example
6735      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6736      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
6737      [VF*S, VF*S, VF*S, VF*S] for all.  */
6738   if (slp_node)
6739     {
6740       /* Convert the init to the desired type.  */
6741       stmts = NULL;
6742       init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6743       if (stmts)
6744         {
6745           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6746           gcc_assert (!new_bb);
6747         }
6748
6749       /* Generate [VF*S, VF*S, ... ].  */
6750       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6751         {
6752           expr = build_int_cst (integer_type_node, vf);
6753           expr = fold_convert (TREE_TYPE (step_expr), expr);
6754         }
6755       else
6756         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6757       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6758                               expr, step_expr);
6759       if (! CONSTANT_CLASS_P (new_name))
6760         new_name = vect_init_vector (phi, new_name,
6761                                      TREE_TYPE (step_expr), NULL);
6762       new_vec = build_vector_from_val (vectype, new_name);
6763       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6764
6765       /* Now generate the IVs.  */
6766       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6767       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6768       unsigned elts = nunits * nvects;
6769       unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6770       gcc_assert (elts % group_size == 0);
6771       tree elt = init_expr;
6772       unsigned ivn;
6773       for (ivn = 0; ivn < nivs; ++ivn)
6774         {
6775           tree_vector_builder elts (vectype, nunits, 1);
6776           stmts = NULL;
6777           for (unsigned eltn = 0; eltn < nunits; ++eltn)
6778             {
6779               if (ivn*nunits + eltn >= group_size
6780                   && (ivn*nunits + eltn) % group_size == 0)
6781                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6782                                     elt, step_expr);
6783               elts.quick_push (elt);
6784             }
6785           vec_init = gimple_build_vector (&stmts, &elts);
6786           if (stmts)
6787             {
6788               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6789               gcc_assert (!new_bb);
6790             }
6791
6792           /* Create the induction-phi that defines the induction-operand.  */
6793           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6794           induction_phi = create_phi_node (vec_dest, iv_loop->header);
6795           set_vinfo_for_stmt (induction_phi,
6796                               new_stmt_vec_info (induction_phi, loop_vinfo));
6797           induc_def = PHI_RESULT (induction_phi);
6798
6799           /* Create the iv update inside the loop  */
6800           vec_def = make_ssa_name (vec_dest);
6801           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6802           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6803           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6804
6805           /* Set the arguments of the phi node:  */
6806           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6807           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6808                        UNKNOWN_LOCATION);
6809
6810           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6811         }
6812
6813       /* Re-use IVs when we can.  */
6814       if (ivn < nvects)
6815         {
6816           unsigned vfp
6817             = least_common_multiple (group_size, nunits) / group_size;
6818           /* Generate [VF'*S, VF'*S, ... ].  */
6819           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6820             {
6821               expr = build_int_cst (integer_type_node, vfp);
6822               expr = fold_convert (TREE_TYPE (step_expr), expr);
6823             }
6824           else
6825             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6826           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6827                                   expr, step_expr);
6828           if (! CONSTANT_CLASS_P (new_name))
6829             new_name = vect_init_vector (phi, new_name,
6830                                          TREE_TYPE (step_expr), NULL);
6831           new_vec = build_vector_from_val (vectype, new_name);
6832           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6833           for (; ivn < nvects; ++ivn)
6834             {
6835               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6836               tree def;
6837               if (gimple_code (iv) == GIMPLE_PHI)
6838                 def = gimple_phi_result (iv);
6839               else
6840                 def = gimple_assign_lhs (iv);
6841               new_stmt = gimple_build_assign (make_ssa_name (vectype),
6842                                               PLUS_EXPR,
6843                                               def, vec_step);
6844               if (gimple_code (iv) == GIMPLE_PHI)
6845                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6846               else
6847                 {
6848                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6849                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6850                 }
6851               set_vinfo_for_stmt (new_stmt,
6852                                   new_stmt_vec_info (new_stmt, loop_vinfo));
6853               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6854             }
6855         }
6856
6857       return true;
6858     }
6859
6860   /* Create the vector that holds the initial_value of the induction.  */
6861   if (nested_in_vect_loop)
6862     {
6863       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
6864          been created during vectorization of previous stmts.  We obtain it
6865          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
6866       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6867       /* If the initial value is not of proper type, convert it.  */
6868       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6869         {
6870           new_stmt
6871             = gimple_build_assign (vect_get_new_ssa_name (vectype,
6872                                                           vect_simple_var,
6873                                                           "vec_iv_"),
6874                                    VIEW_CONVERT_EXPR,
6875                                    build1 (VIEW_CONVERT_EXPR, vectype,
6876                                            vec_init));
6877           vec_init = gimple_assign_lhs (new_stmt);
6878           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6879                                                  new_stmt);
6880           gcc_assert (!new_bb);
6881           set_vinfo_for_stmt (new_stmt,
6882                               new_stmt_vec_info (new_stmt, loop_vinfo));
6883         }
6884     }
6885   else
6886     {
6887       /* iv_loop is the loop to be vectorized. Create:
6888          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
6889       stmts = NULL;
6890       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6891
6892       tree_vector_builder elts (vectype, nunits, 1);
6893       elts.quick_push (new_name);
6894       for (i = 1; i < nunits; i++)
6895         {
6896           /* Create: new_name_i = new_name + step_expr  */
6897           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6898                                    new_name, step_expr);
6899           elts.quick_push (new_name);
6900         }
6901       /* Create a vector from [new_name_0, new_name_1, ...,
6902          new_name_nunits-1]  */
6903       vec_init = gimple_build_vector (&stmts, &elts);
6904       if (stmts)
6905         {
6906           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6907           gcc_assert (!new_bb);
6908         }
6909     }
6910
6911
6912   /* Create the vector that holds the step of the induction.  */
6913   if (nested_in_vect_loop)
6914     /* iv_loop is nested in the loop to be vectorized. Generate:
6915        vec_step = [S, S, S, S]  */
6916     new_name = step_expr;
6917   else
6918     {
6919       /* iv_loop is the loop to be vectorized. Generate:
6920           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
6921       gimple_seq seq = NULL;
6922       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6923         {
6924           expr = build_int_cst (integer_type_node, vf);
6925           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6926         }
6927       else
6928         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6929       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6930                                expr, step_expr);
6931       if (seq)
6932         {
6933           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6934           gcc_assert (!new_bb);
6935         }
6936     }
6937
6938   t = unshare_expr (new_name);
6939   gcc_assert (CONSTANT_CLASS_P (new_name)
6940               || TREE_CODE (new_name) == SSA_NAME);
6941   new_vec = build_vector_from_val (vectype, t);
6942   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6943
6944
6945   /* Create the following def-use cycle:
6946      loop prolog:
6947          vec_init = ...
6948          vec_step = ...
6949      loop:
6950          vec_iv = PHI <vec_init, vec_loop>
6951          ...
6952          STMT
6953          ...
6954          vec_loop = vec_iv + vec_step;  */
6955
6956   /* Create the induction-phi that defines the induction-operand.  */
6957   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6958   induction_phi = create_phi_node (vec_dest, iv_loop->header);
6959   set_vinfo_for_stmt (induction_phi,
6960                       new_stmt_vec_info (induction_phi, loop_vinfo));
6961   induc_def = PHI_RESULT (induction_phi);
6962
6963   /* Create the iv update inside the loop  */
6964   vec_def = make_ssa_name (vec_dest);
6965   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6966   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6967   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6968
6969   /* Set the arguments of the phi node:  */
6970   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6971   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6972                UNKNOWN_LOCATION);
6973
6974   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
6975
6976   /* In case that vectorization factor (VF) is bigger than the number
6977      of elements that we can fit in a vectype (nunits), we have to generate
6978      more than one vector stmt - i.e - we need to "unroll" the
6979      vector stmt by a factor VF/nunits.  For more details see documentation
6980      in vectorizable_operation.  */
6981
6982   if (ncopies > 1)
6983     {
6984       gimple_seq seq = NULL;
6985       stmt_vec_info prev_stmt_vinfo;
6986       /* FORNOW. This restriction should be relaxed.  */
6987       gcc_assert (!nested_in_vect_loop);
6988
6989       /* Create the vector that holds the step of the induction.  */
6990       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6991         {
6992           expr = build_int_cst (integer_type_node, nunits);
6993           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6994         }
6995       else
6996         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
6997       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6998                                expr, step_expr);
6999       if (seq)
7000         {
7001           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7002           gcc_assert (!new_bb);
7003         }
7004
7005       t = unshare_expr (new_name);
7006       gcc_assert (CONSTANT_CLASS_P (new_name)
7007                   || TREE_CODE (new_name) == SSA_NAME);
7008       new_vec = build_vector_from_val (vectype, t);
7009       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7010
7011       vec_def = induc_def;
7012       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7013       for (i = 1; i < ncopies; i++)
7014         {
7015           /* vec_i = vec_prev + vec_step  */
7016           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7017                                           vec_def, vec_step);
7018           vec_def = make_ssa_name (vec_dest, new_stmt);
7019           gimple_assign_set_lhs (new_stmt, vec_def);
7020
7021           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7022           set_vinfo_for_stmt (new_stmt,
7023                               new_stmt_vec_info (new_stmt, loop_vinfo));
7024           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7025           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7026         }
7027     }
7028
7029   if (nested_in_vect_loop)
7030     {
7031       /* Find the loop-closed exit-phi of the induction, and record
7032          the final vector of induction results:  */
7033       exit_phi = NULL;
7034       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7035         {
7036           gimple *use_stmt = USE_STMT (use_p);
7037           if (is_gimple_debug (use_stmt))
7038             continue;
7039
7040           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7041             {
7042               exit_phi = use_stmt;
7043               break;
7044             }
7045         }
7046       if (exit_phi)
7047         {
7048           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7049           /* FORNOW. Currently not supporting the case that an inner-loop induction
7050              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7051           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7052                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7053
7054           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7055           if (dump_enabled_p ())
7056             {
7057               dump_printf_loc (MSG_NOTE, vect_location,
7058                                "vector of inductions after inner-loop:");
7059               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7060             }
7061         }
7062     }
7063
7064
7065   if (dump_enabled_p ())
7066     {
7067       dump_printf_loc (MSG_NOTE, vect_location,
7068                        "transform induction: created def-use cycle: ");
7069       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7070       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7071                         SSA_NAME_DEF_STMT (vec_def), 0);
7072     }
7073
7074   return true;
7075 }
7076
7077 /* Function vectorizable_live_operation.
7078
7079    STMT computes a value that is used outside the loop.  Check if
7080    it can be supported.  */
7081
7082 bool
7083 vectorizable_live_operation (gimple *stmt,
7084                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7085                              slp_tree slp_node, int slp_index,
7086                              gimple **vec_stmt)
7087 {
7088   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7089   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7090   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7091   imm_use_iterator imm_iter;
7092   tree lhs, lhs_type, bitsize, vec_bitsize;
7093   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7094   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
7095   int ncopies;
7096   gimple *use_stmt;
7097   auto_vec<tree> vec_oprnds;
7098
7099   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7100
7101   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7102     return false;
7103
7104   /* FORNOW.  CHECKME.  */
7105   if (nested_in_vect_loop_p (loop, stmt))
7106     return false;
7107
7108   /* If STMT is not relevant and it is a simple assignment and its inputs are
7109      invariant then it can remain in place, unvectorized.  The original last
7110      scalar value that it computes will be used.  */
7111   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7112     {
7113       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7114       if (dump_enabled_p ())
7115         dump_printf_loc (MSG_NOTE, vect_location,
7116                          "statement is simple and uses invariant.  Leaving in "
7117                          "place.\n");
7118       return true;
7119     }
7120
7121   if (slp_node)
7122     ncopies = 1;
7123   else
7124     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7125
7126   if (!vec_stmt)
7127     /* No transformation required.  */
7128     return true;
7129
7130   /* If stmt has a related stmt, then use that for getting the lhs.  */
7131   if (is_pattern_stmt_p (stmt_info))
7132     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7133
7134   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7135         : gimple_get_lhs (stmt);
7136   lhs_type = TREE_TYPE (lhs);
7137
7138   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7139              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7140              : TYPE_SIZE (TREE_TYPE (vectype)));
7141   vec_bitsize = TYPE_SIZE (vectype);
7142
7143   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7144   tree vec_lhs, bitstart;
7145   if (slp_node)
7146     {
7147       gcc_assert (slp_index >= 0);
7148
7149       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7150       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7151
7152       /* Get the last occurrence of the scalar index from the concatenation of
7153          all the slp vectors. Calculate which slp vector it is and the index
7154          within.  */
7155       int pos = (num_vec * nunits) - num_scalar + slp_index;
7156       int vec_entry = pos / nunits;
7157       int vec_index = pos % nunits;
7158
7159       /* Get the correct slp vectorized stmt.  */
7160       vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7161
7162       /* Get entry to use.  */
7163       bitstart = bitsize_int (vec_index);
7164       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7165     }
7166   else
7167     {
7168       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7169       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7170
7171       /* For multiple copies, get the last copy.  */
7172       for (int i = 1; i < ncopies; ++i)
7173         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7174                                                   vec_lhs);
7175
7176       /* Get the last lane in the vector.  */
7177       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7178     }
7179
7180   /* Create a new vectorized stmt for the uses of STMT and insert outside the
7181      loop.  */
7182   gimple_seq stmts = NULL;
7183   tree bftype = TREE_TYPE (vectype);
7184   if (VECTOR_BOOLEAN_TYPE_P (vectype))
7185     bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7186   tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7187   new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7188                                    true, NULL_TREE);
7189   if (stmts)
7190     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7191
7192   /* Replace use of lhs with newly computed result.  If the use stmt is a
7193      single arg PHI, just replace all uses of PHI result.  It's necessary
7194      because lcssa PHI defining lhs may be before newly inserted stmt.  */
7195   use_operand_p use_p;
7196   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7197     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7198         && !is_gimple_debug (use_stmt))
7199     {
7200       if (gimple_code (use_stmt) == GIMPLE_PHI
7201           && gimple_phi_num_args (use_stmt) == 1)
7202         {
7203           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7204         }
7205       else
7206         {
7207           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7208             SET_USE (use_p, new_tree);
7209         }
7210       update_stmt (use_stmt);
7211     }
7212
7213   return true;
7214 }
7215
7216 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
7217
7218 static void
7219 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7220 {
7221   ssa_op_iter op_iter;
7222   imm_use_iterator imm_iter;
7223   def_operand_p def_p;
7224   gimple *ustmt;
7225
7226   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7227     {
7228       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7229         {
7230           basic_block bb;
7231
7232           if (!is_gimple_debug (ustmt))
7233             continue;
7234
7235           bb = gimple_bb (ustmt);
7236
7237           if (!flow_bb_inside_loop_p (loop, bb))
7238             {
7239               if (gimple_debug_bind_p (ustmt))
7240                 {
7241                   if (dump_enabled_p ())
7242                     dump_printf_loc (MSG_NOTE, vect_location,
7243                                      "killing debug use\n");
7244
7245                   gimple_debug_bind_reset_value (ustmt);
7246                   update_stmt (ustmt);
7247                 }
7248               else
7249                 gcc_unreachable ();
7250             }
7251         }
7252     }
7253 }
7254
7255 /* Given loop represented by LOOP_VINFO, return true if computation of
7256    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7257    otherwise.  */
7258
7259 static bool
7260 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7261 {
7262   /* Constant case.  */
7263   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7264     {
7265       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7266       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7267
7268       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7269       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7270       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7271         return true;
7272     }
7273
7274   widest_int max;
7275   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7276   /* Check the upper bound of loop niters.  */
7277   if (get_max_loop_iterations (loop, &max))
7278     {
7279       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7280       signop sgn = TYPE_SIGN (type);
7281       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7282       if (max < type_max)
7283         return true;
7284     }
7285   return false;
7286 }
7287
7288 /* Scale profiling counters by estimation for LOOP which is vectorized
7289    by factor VF.  */
7290
7291 static void
7292 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7293 {
7294   edge preheader = loop_preheader_edge (loop);
7295   /* Reduce loop iterations by the vectorization factor.  */
7296   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7297   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7298
7299   if (freq_h.nonzero_p ())
7300     {
7301       profile_probability p;
7302
7303       /* Avoid dropping loop body profile counter to 0 because of zero count
7304          in loop's preheader.  */
7305       if (!(freq_e == profile_count::zero ()))
7306         freq_e = freq_e.force_nonzero ();
7307       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7308       scale_loop_frequencies (loop, p);
7309     }
7310
7311   edge exit_e = single_exit (loop);
7312   exit_e->probability = profile_probability::always ()
7313                                  .apply_scale (1, new_est_niter + 1);
7314
7315   edge exit_l = single_pred_edge (loop->latch);
7316   profile_probability prob = exit_l->probability;
7317   exit_l->probability = exit_e->probability.invert ();
7318   if (prob.initialized_p () && exit_l->probability.initialized_p ())
7319     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7320 }
7321
7322 /* Function vect_transform_loop.
7323
7324    The analysis phase has determined that the loop is vectorizable.
7325    Vectorize the loop - created vectorized stmts to replace the scalar
7326    stmts in the loop, and update the loop exit condition.
7327    Returns scalar epilogue loop if any.  */
7328
7329 struct loop *
7330 vect_transform_loop (loop_vec_info loop_vinfo)
7331 {
7332   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7333   struct loop *epilogue = NULL;
7334   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7335   int nbbs = loop->num_nodes;
7336   int i;
7337   tree niters_vector = NULL;
7338   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7339   bool grouped_store;
7340   bool slp_scheduled = false;
7341   gimple *stmt, *pattern_stmt;
7342   gimple_seq pattern_def_seq = NULL;
7343   gimple_stmt_iterator pattern_def_si = gsi_none ();
7344   bool transform_pattern_stmt = false;
7345   bool check_profitability = false;
7346   int th;
7347
7348   if (dump_enabled_p ())
7349     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
7350
7351   /* Use the more conservative vectorization threshold.  If the number
7352      of iterations is constant assume the cost check has been performed
7353      by our caller.  If the threshold makes all loops profitable that
7354      run at least the vectorization factor number of times checking
7355      is pointless, too.  */
7356   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7357   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7358       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7359     {
7360       if (dump_enabled_p ())
7361         dump_printf_loc (MSG_NOTE, vect_location,
7362                          "Profitability threshold is %d loop iterations.\n",
7363                          th);
7364       check_profitability = true;
7365     }
7366
7367   /* Make sure there exists a single-predecessor exit bb.  Do this before
7368      versioning.   */
7369   edge e = single_exit (loop);
7370   if (! single_pred_p (e->dest))
7371     {
7372       split_loop_exit_edge (e);
7373       if (dump_enabled_p ())
7374         dump_printf (MSG_NOTE, "split exit edge\n");
7375     }
7376
7377   /* Version the loop first, if required, so the profitability check
7378      comes first.  */
7379
7380   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7381     {
7382       poly_uint64 versioning_threshold
7383         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
7384       if (check_profitability
7385           && ordered_p (poly_uint64 (th), versioning_threshold))
7386         {
7387           versioning_threshold = ordered_max (poly_uint64 (th),
7388                                               versioning_threshold);
7389           check_profitability = false;
7390         }
7391       vect_loop_versioning (loop_vinfo, th, check_profitability,
7392                             versioning_threshold);
7393       check_profitability = false;
7394     }
7395
7396   /* Make sure there exists a single-predecessor exit bb also on the
7397      scalar loop copy.  Do this after versioning but before peeling
7398      so CFG structure is fine for both scalar and if-converted loop
7399      to make slpeel_duplicate_current_defs_from_edges face matched
7400      loop closed PHI nodes on the exit.  */
7401   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7402     {
7403       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7404       if (! single_pred_p (e->dest))
7405         {
7406           split_loop_exit_edge (e);
7407           if (dump_enabled_p ())
7408             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7409         }
7410     }
7411
7412   tree niters = vect_build_loop_niters (loop_vinfo);
7413   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7414   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7415   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7416   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
7417                               check_profitability, niters_no_overflow);
7418   if (niters_vector == NULL_TREE)
7419     {
7420       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7421         niters_vector
7422           = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7423                            LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
7424       else
7425         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7426                                      niters_no_overflow);
7427     }
7428
7429   /* 1) Make sure the loop header has exactly two entries
7430      2) Make sure we have a preheader basic block.  */
7431
7432   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7433
7434   split_edge (loop_preheader_edge (loop));
7435
7436   /* FORNOW: the vectorizer supports only loops which body consist
7437      of one basic block (header + empty latch). When the vectorizer will
7438      support more involved loop forms, the order by which the BBs are
7439      traversed need to be reconsidered.  */
7440
7441   for (i = 0; i < nbbs; i++)
7442     {
7443       basic_block bb = bbs[i];
7444       stmt_vec_info stmt_info;
7445
7446       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7447            gsi_next (&si))
7448         {
7449           gphi *phi = si.phi ();
7450           if (dump_enabled_p ())
7451             {
7452               dump_printf_loc (MSG_NOTE, vect_location,
7453                                "------>vectorizing phi: ");
7454               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7455             }
7456           stmt_info = vinfo_for_stmt (phi);
7457           if (!stmt_info)
7458             continue;
7459
7460           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7461             vect_loop_kill_debug_uses (loop, phi);
7462
7463           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7464               && !STMT_VINFO_LIVE_P (stmt_info))
7465             continue;
7466
7467           if (STMT_VINFO_VECTYPE (stmt_info)
7468               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7469                   != (unsigned HOST_WIDE_INT) vf)
7470               && dump_enabled_p ())
7471             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7472
7473           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7474                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7475                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7476               && ! PURE_SLP_STMT (stmt_info))
7477             {
7478               if (dump_enabled_p ())
7479                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7480               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7481             }
7482         }
7483
7484       pattern_stmt = NULL;
7485       for (gimple_stmt_iterator si = gsi_start_bb (bb);
7486            !gsi_end_p (si) || transform_pattern_stmt;)
7487         {
7488           bool is_store;
7489
7490           if (transform_pattern_stmt)
7491             stmt = pattern_stmt;
7492           else
7493             {
7494               stmt = gsi_stmt (si);
7495               /* During vectorization remove existing clobber stmts.  */
7496               if (gimple_clobber_p (stmt))
7497                 {
7498                   unlink_stmt_vdef (stmt);
7499                   gsi_remove (&si, true);
7500                   release_defs (stmt);
7501                   continue;
7502                 }
7503             }
7504
7505           if (dump_enabled_p ())
7506             {
7507               dump_printf_loc (MSG_NOTE, vect_location,
7508                                "------>vectorizing statement: ");
7509               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7510             }
7511
7512           stmt_info = vinfo_for_stmt (stmt);
7513
7514           /* vector stmts created in the outer-loop during vectorization of
7515              stmts in an inner-loop may not have a stmt_info, and do not
7516              need to be vectorized.  */
7517           if (!stmt_info)
7518             {
7519               gsi_next (&si);
7520               continue;
7521             }
7522
7523           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7524             vect_loop_kill_debug_uses (loop, stmt);
7525
7526           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7527               && !STMT_VINFO_LIVE_P (stmt_info))
7528             {
7529               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7530                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7531                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7532                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7533                 {
7534                   stmt = pattern_stmt;
7535                   stmt_info = vinfo_for_stmt (stmt);
7536                 }
7537               else
7538                 {
7539                   gsi_next (&si);
7540                   continue;
7541                 }
7542             }
7543           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7544                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7545                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7546                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7547             transform_pattern_stmt = true;
7548
7549           /* If pattern statement has def stmts, vectorize them too.  */
7550           if (is_pattern_stmt_p (stmt_info))
7551             {
7552               if (pattern_def_seq == NULL)
7553                 {
7554                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7555                   pattern_def_si = gsi_start (pattern_def_seq);
7556                 }
7557               else if (!gsi_end_p (pattern_def_si))
7558                 gsi_next (&pattern_def_si);
7559               if (pattern_def_seq != NULL)
7560                 {
7561                   gimple *pattern_def_stmt = NULL;
7562                   stmt_vec_info pattern_def_stmt_info = NULL;
7563
7564                   while (!gsi_end_p (pattern_def_si))
7565                     {
7566                       pattern_def_stmt = gsi_stmt (pattern_def_si);
7567                       pattern_def_stmt_info
7568                         = vinfo_for_stmt (pattern_def_stmt);
7569                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7570                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7571                         break;
7572                       gsi_next (&pattern_def_si);
7573                     }
7574
7575                   if (!gsi_end_p (pattern_def_si))
7576                     {
7577                       if (dump_enabled_p ())
7578                         {
7579                           dump_printf_loc (MSG_NOTE, vect_location,
7580                                            "==> vectorizing pattern def "
7581                                            "stmt: ");
7582                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7583                                             pattern_def_stmt, 0);
7584                         }
7585
7586                       stmt = pattern_def_stmt;
7587                       stmt_info = pattern_def_stmt_info;
7588                     }
7589                   else
7590                     {
7591                       pattern_def_si = gsi_none ();
7592                       transform_pattern_stmt = false;
7593                     }
7594                 }
7595               else
7596                 transform_pattern_stmt = false;
7597             }
7598
7599           if (STMT_VINFO_VECTYPE (stmt_info))
7600             {
7601               unsigned int nunits
7602                 = (unsigned int)
7603                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7604               if (!STMT_SLP_TYPE (stmt_info)
7605                   && nunits != (unsigned int) vf
7606                   && dump_enabled_p ())
7607                   /* For SLP VF is set according to unrolling factor, and not
7608                      to vector size, hence for SLP this print is not valid.  */
7609                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7610             }
7611
7612           /* SLP. Schedule all the SLP instances when the first SLP stmt is
7613              reached.  */
7614           if (STMT_SLP_TYPE (stmt_info))
7615             {
7616               if (!slp_scheduled)
7617                 {
7618                   slp_scheduled = true;
7619
7620                   if (dump_enabled_p ())
7621                     dump_printf_loc (MSG_NOTE, vect_location,
7622                                      "=== scheduling SLP instances ===\n");
7623
7624                   vect_schedule_slp (loop_vinfo);
7625                 }
7626
7627               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
7628               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7629                 {
7630                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7631                     {
7632                       pattern_def_seq = NULL;
7633                       gsi_next (&si);
7634                     }
7635                   continue;
7636                 }
7637             }
7638
7639           /* -------- vectorize statement ------------ */
7640           if (dump_enabled_p ())
7641             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7642
7643           grouped_store = false;
7644           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7645           if (is_store)
7646             {
7647               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7648                 {
7649                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7650                      interleaving chain was completed - free all the stores in
7651                      the chain.  */
7652                   gsi_next (&si);
7653                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7654                 }
7655               else
7656                 {
7657                   /* Free the attached stmt_vec_info and remove the stmt.  */
7658                   gimple *store = gsi_stmt (si);
7659                   free_stmt_vec_info (store);
7660                   unlink_stmt_vdef (store);
7661                   gsi_remove (&si, true);
7662                   release_defs (store);
7663                 }
7664
7665               /* Stores can only appear at the end of pattern statements.  */
7666               gcc_assert (!transform_pattern_stmt);
7667               pattern_def_seq = NULL;
7668             }
7669           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7670             {
7671               pattern_def_seq = NULL;
7672               gsi_next (&si);
7673             }
7674         }                       /* stmts in BB */
7675     }                           /* BBs in loop */
7676
7677   slpeel_make_loop_iterate_ntimes (loop, niters_vector);
7678
7679   scale_profile_for_vect_loop (loop, vf);
7680
7681   /* The minimum number of iterations performed by the epilogue.  This
7682      is 1 when peeling for gaps because we always need a final scalar
7683      iteration.  */
7684   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7685   /* +1 to convert latch counts to loop iteration counts,
7686      -min_epilogue_iters to remove iterations that cannot be performed
7687        by the vector code.  */
7688   int bias = 1 - min_epilogue_iters;
7689   /* In these calculations the "- 1" converts loop iteration counts
7690      back to latch counts.  */
7691   if (loop->any_upper_bound)
7692     loop->nb_iterations_upper_bound
7693       = wi::udiv_floor (loop->nb_iterations_upper_bound + bias, vf) - 1;
7694   if (loop->any_likely_upper_bound)
7695     loop->nb_iterations_likely_upper_bound
7696       = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, vf) - 1;
7697   if (loop->any_estimate)
7698     loop->nb_iterations_estimate
7699       = wi::udiv_floor (loop->nb_iterations_estimate + bias, vf) - 1;
7700
7701   if (dump_enabled_p ())
7702     {
7703       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7704         {
7705           dump_printf_loc (MSG_NOTE, vect_location,
7706                            "LOOP VECTORIZED\n");
7707           if (loop->inner)
7708             dump_printf_loc (MSG_NOTE, vect_location,
7709                              "OUTER LOOP VECTORIZED\n");
7710           dump_printf (MSG_NOTE, "\n");
7711         }
7712       else
7713         dump_printf_loc (MSG_NOTE, vect_location,
7714                          "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
7715                          current_vector_size);
7716     }
7717
7718   /* Free SLP instances here because otherwise stmt reference counting
7719      won't work.  */
7720   slp_instance instance;
7721   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7722     vect_free_slp_instance (instance);
7723   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7724   /* Clear-up safelen field since its value is invalid after vectorization
7725      since vectorized loop can have loop-carried dependencies.  */
7726   loop->safelen = 0;
7727
7728   /* Don't vectorize epilogue for epilogue.  */
7729   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7730     epilogue = NULL;
7731
7732   if (epilogue)
7733     {
7734         unsigned int vector_sizes
7735           = targetm.vectorize.autovectorize_vector_sizes ();
7736         vector_sizes &= current_vector_size - 1;
7737
7738         if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7739           epilogue = NULL;
7740         else if (!vector_sizes)
7741           epilogue = NULL;
7742         else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7743                  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
7744           {
7745             int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
7746             int ratio = current_vector_size / smallest_vec_size;
7747             int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
7748               - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
7749             eiters = eiters % vf;
7750
7751             epilogue->nb_iterations_upper_bound = eiters - 1;
7752
7753             if (eiters < vf / ratio)
7754               epilogue = NULL;
7755             }
7756     }
7757
7758   if (epilogue)
7759     {
7760       epilogue->force_vectorize = loop->force_vectorize;
7761       epilogue->safelen = loop->safelen;
7762       epilogue->dont_vectorize = false;
7763
7764       /* We may need to if-convert epilogue to vectorize it.  */
7765       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7766         tree_if_conversion (epilogue);
7767     }
7768
7769   return epilogue;
7770 }
7771
7772 /* The code below is trying to perform simple optimization - revert
7773    if-conversion for masked stores, i.e. if the mask of a store is zero
7774    do not perform it and all stored value producers also if possible.
7775    For example,
7776      for (i=0; i<n; i++)
7777        if (c[i])
7778         {
7779           p1[i] += 1;
7780           p2[i] = p3[i] +2;
7781         }
7782    this transformation will produce the following semi-hammock:
7783
7784    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7785      {
7786        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7787        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7788        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7789        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7790        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7791        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7792      }
7793 */
7794
7795 void
7796 optimize_mask_stores (struct loop *loop)
7797 {
7798   basic_block *bbs = get_loop_body (loop);
7799   unsigned nbbs = loop->num_nodes;
7800   unsigned i;
7801   basic_block bb;
7802   struct loop *bb_loop;
7803   gimple_stmt_iterator gsi;
7804   gimple *stmt;
7805   auto_vec<gimple *> worklist;
7806
7807   vect_location = find_loop_location (loop);
7808   /* Pick up all masked stores in loop if any.  */
7809   for (i = 0; i < nbbs; i++)
7810     {
7811       bb = bbs[i];
7812       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7813            gsi_next (&gsi))
7814         {
7815           stmt = gsi_stmt (gsi);
7816           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7817             worklist.safe_push (stmt);
7818         }
7819     }
7820
7821   free (bbs);
7822   if (worklist.is_empty ())
7823     return;
7824
7825   /* Loop has masked stores.  */
7826   while (!worklist.is_empty ())
7827     {
7828       gimple *last, *last_store;
7829       edge e, efalse;
7830       tree mask;
7831       basic_block store_bb, join_bb;
7832       gimple_stmt_iterator gsi_to;
7833       tree vdef, new_vdef;
7834       gphi *phi;
7835       tree vectype;
7836       tree zero;
7837
7838       last = worklist.pop ();
7839       mask = gimple_call_arg (last, 2);
7840       bb = gimple_bb (last);
7841       /* Create then_bb and if-then structure in CFG, then_bb belongs to
7842          the same loop as if_bb.  It could be different to LOOP when two
7843          level loop-nest is vectorized and mask_store belongs to the inner
7844          one.  */
7845       e = split_block (bb, last);
7846       bb_loop = bb->loop_father;
7847       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7848       join_bb = e->dest;
7849       store_bb = create_empty_bb (bb);
7850       add_bb_to_loop (store_bb, bb_loop);
7851       e->flags = EDGE_TRUE_VALUE;
7852       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7853       /* Put STORE_BB to likely part.  */
7854       efalse->probability = profile_probability::unlikely ();
7855       store_bb->count = efalse->count ();
7856       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7857       if (dom_info_available_p (CDI_DOMINATORS))
7858         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7859       if (dump_enabled_p ())
7860         dump_printf_loc (MSG_NOTE, vect_location,
7861                          "Create new block %d to sink mask stores.",
7862                          store_bb->index);
7863       /* Create vector comparison with boolean result.  */
7864       vectype = TREE_TYPE (mask);
7865       zero = build_zero_cst (vectype);
7866       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7867       gsi = gsi_last_bb (bb);
7868       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7869       /* Create new PHI node for vdef of the last masked store:
7870          .MEM_2 = VDEF <.MEM_1>
7871          will be converted to
7872          .MEM.3 = VDEF <.MEM_1>
7873          and new PHI node will be created in join bb
7874          .MEM_2 = PHI <.MEM_1, .MEM_3>
7875       */
7876       vdef = gimple_vdef (last);
7877       new_vdef = make_ssa_name (gimple_vop (cfun), last);
7878       gimple_set_vdef (last, new_vdef);
7879       phi = create_phi_node (vdef, join_bb);
7880       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7881
7882       /* Put all masked stores with the same mask to STORE_BB if possible.  */
7883       while (true)
7884         {
7885           gimple_stmt_iterator gsi_from;
7886           gimple *stmt1 = NULL;
7887
7888           /* Move masked store to STORE_BB.  */
7889           last_store = last;
7890           gsi = gsi_for_stmt (last);
7891           gsi_from = gsi;
7892           /* Shift GSI to the previous stmt for further traversal.  */
7893           gsi_prev (&gsi);
7894           gsi_to = gsi_start_bb (store_bb);
7895           gsi_move_before (&gsi_from, &gsi_to);
7896           /* Setup GSI_TO to the non-empty block start.  */
7897           gsi_to = gsi_start_bb (store_bb);
7898           if (dump_enabled_p ())
7899             {
7900               dump_printf_loc (MSG_NOTE, vect_location,
7901                                "Move stmt to created bb\n");
7902               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7903             }
7904           /* Move all stored value producers if possible.  */
7905           while (!gsi_end_p (gsi))
7906             {
7907               tree lhs;
7908               imm_use_iterator imm_iter;
7909               use_operand_p use_p;
7910               bool res;
7911
7912               /* Skip debug statements.  */
7913               if (is_gimple_debug (gsi_stmt (gsi)))
7914                 {
7915                   gsi_prev (&gsi);
7916                   continue;
7917                 }
7918               stmt1 = gsi_stmt (gsi);
7919               /* Do not consider statements writing to memory or having
7920                  volatile operand.  */
7921               if (gimple_vdef (stmt1)
7922                   || gimple_has_volatile_ops (stmt1))
7923                 break;
7924               gsi_from = gsi;
7925               gsi_prev (&gsi);
7926               lhs = gimple_get_lhs (stmt1);
7927               if (!lhs)
7928                 break;
7929
7930               /* LHS of vectorized stmt must be SSA_NAME.  */
7931               if (TREE_CODE (lhs) != SSA_NAME)
7932                 break;
7933
7934               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7935                 {
7936                   /* Remove dead scalar statement.  */
7937                   if (has_zero_uses (lhs))
7938                     {
7939                       gsi_remove (&gsi_from, true);
7940                       continue;
7941                     }
7942                 }
7943
7944               /* Check that LHS does not have uses outside of STORE_BB.  */
7945               res = true;
7946               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7947                 {
7948                   gimple *use_stmt;
7949                   use_stmt = USE_STMT (use_p);
7950                   if (is_gimple_debug (use_stmt))
7951                     continue;
7952                   if (gimple_bb (use_stmt) != store_bb)
7953                     {
7954                       res = false;
7955                       break;
7956                     }
7957                 }
7958               if (!res)
7959                 break;
7960
7961               if (gimple_vuse (stmt1)
7962                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
7963                 break;
7964
7965               /* Can move STMT1 to STORE_BB.  */
7966               if (dump_enabled_p ())
7967                 {
7968                   dump_printf_loc (MSG_NOTE, vect_location,
7969                                    "Move stmt to created bb\n");
7970                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7971                 }
7972               gsi_move_before (&gsi_from, &gsi_to);
7973               /* Shift GSI_TO for further insertion.  */
7974               gsi_prev (&gsi_to);
7975             }
7976           /* Put other masked stores with the same mask to STORE_BB.  */
7977           if (worklist.is_empty ()
7978               || gimple_call_arg (worklist.last (), 2) != mask
7979               || worklist.last () != stmt1)
7980             break;
7981           last = worklist.pop ();
7982         }
7983       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
7984     }
7985 }