gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55
  56 /* Loop Vectorization Pass.
  57
  58    This pass tries to vectorize loops.
  59
  60    For example, the vectorizer transforms the following simple loop:
  61
  62         short a[N]; short b[N]; short c[N]; int i;
  63
  64         for (i=0; i<N; i++){
  65           a[i] = b[i] + c[i];
  66         }
  67
  68    as if it was manually vectorized by rewriting the source code into:
  69
  70         typedef int __attribute__((mode(V8HI))) v8hi;
  71         short a[N];  short b[N]; short c[N];   int i;
  72         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  73         v8hi va, vb, vc;
  74
  75         for (i=0; i<N/8; i++){
  76           vb = pb[i];
  77           vc = pc[i];
  78           va = vb + vc;
  79           pa[i] = va;
  80         }
  81
  82         The main entry to this pass is vectorize_loops(), in which
  83    the vectorizer applies a set of analyses on a given set of loops,
  84    followed by the actual vectorization transformation for the loops that
  85    had successfully passed the analysis phase.
  86         Throughout this pass we make a distinction between two types of
  87    data: scalars (which are represented by SSA_NAMES), and memory references
  88    ("data-refs").  These two types of data require different handling both
  89    during analysis and transformation. The types of data-refs that the
  90    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  91    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  92    accesses are required to have a simple (consecutive) access pattern.
  93
  94    Analysis phase:
  95    ===============
  96         The driver for the analysis phase is vect_analyze_loop().
  97    It applies a set of analyses, some of which rely on the scalar evolution
  98    analyzer (scev) developed by Sebastian Pop.
  99
 100         During the analysis phase the vectorizer records some information
 101    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 102    loop, as well as general information about the loop as a whole, which is
 103    recorded in a "loop_vec_info" struct attached to each loop.
 104
 105    Transformation phase:
 106    =====================
 107         The loop transformation phase scans all the stmts in the loop, and
 108    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 109    the loop that needs to be vectorized.  It inserts the vector code sequence
 110    just before the scalar stmt S, and records a pointer to the vector code
 111    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 112    attached to S).  This pointer will be used for the vectorization of following
 113    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 114    otherwise, we rely on dead code elimination for removing it.
 115
 116         For example, say stmt S1 was vectorized into stmt VS1:
 117
 118    VS1: vb = px[i];
 119    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 120    S2:  a = b;
 121
 122    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 123    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 124    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 125    resulting sequence would be:
 126
 127    VS1: vb = px[i];
 128    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 129    VS2: va = vb;
 130    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 131
 132         Operands that are not SSA_NAMEs, are data-refs that appear in
 133    load/store operations (like 'x[i]' in S1), and are handled differently.
 134
 135    Target modeling:
 136    =================
 137         Currently the only target specific information that is used is the
 138    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 139    Targets that can support different sizes of vectors, for now will need
 140    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 141    flexibility will be added in the future.
 142
 143         Since we only vectorize operations whose vector form can be
 144    expressed using existing tree codes, to verify that an operation is
 145    supported, the vectorizer checks the relevant optab at the relevant
 146    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 147    the value found is CODE_FOR_nothing, then there's no target support, and
 148    we can't vectorize the stmt.
 149
 150    For additional information on this project see:
 151    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 152 */
 153
 154 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 155
 156 /* Function vect_determine_vectorization_factor
 157
 158    Determine the vectorization factor (VF).  VF is the number of data elements
 159    that are operated upon in parallel in a single iteration of the vectorized
 160    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 161    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 162    elements can fit in a single vector register.
 163
 164    VF is the largest number of vector elements (nunits) needed by any relevant
 165    or live phi or stmt in the loop.  On success, store VF in LOOP_VINFO, record
 166    the chosen vectypes in the stmt infos and return true.  Return false if a
 167    stmt cannot be given a usable vector type or if VF would be <= 1.
 168
 169    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 170    original loop:
 171         for (i=0; i<N; i++){
 172           a[i] = b[i] + c[i];
 173         }
 174
 175    vectorized loop:
 176         for (i=0; i<N; i+=VF){
 177           a[i:VF] = b[i:VF] + c[i:VF];
 178         }
 179 */
 180
 181 static bool
 182 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 183 {
 184   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 185   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 186   unsigned nbbs = loop->num_nodes;
 187   unsigned int vectorization_factor = 0;
 188   tree scalar_type = NULL_TREE;
 189   gphi *phi;
 190   tree vectype;
 191   unsigned int nunits;
 192   stmt_vec_info stmt_info;
 193   unsigned i;
 194   HOST_WIDE_INT dummy;
 195   gimple *stmt, *pattern_stmt = NULL;
 196   gimple_seq pattern_def_seq = NULL;
 197   gimple_stmt_iterator pattern_def_si = gsi_none ();
 198   bool analyze_pattern_stmt = false;
 199   bool bool_result;
 200   auto_vec<stmt_vec_info> mask_producers;
 201
 202   if (dump_enabled_p ())
 203     dump_printf_loc (MSG_NOTE, vect_location,
 204                      "=== vect_determine_vectorization_factor ===\n");
 205
 206   for (i = 0; i < nbbs; i++)
 207     {
 208       basic_block bb = bbs[i];
 209
           /* Give every relevant or live PHI a vectype derived from the type
              of its result, and let its nunits contribute to VF.  */
 210       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 211            gsi_next (&si))
 212         {
 213           phi = si.phi ();
 214           stmt_info = vinfo_for_stmt (phi);
 215           if (dump_enabled_p ())
 216             {
 217               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 218               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 219             }
 220
 221           gcc_assert (stmt_info);
 222
 223           if (STMT_VINFO_RELEVANT_P (stmt_info)
 224               || STMT_VINFO_LIVE_P (stmt_info))
 225             {
 226               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 227               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 228
 229               if (dump_enabled_p ())
 230                 {
 231                   dump_printf_loc (MSG_NOTE, vect_location,
 232                                    "get vectype for scalar type:  ");
 233                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 234                   dump_printf (MSG_NOTE, "\n");
 235                 }
 236
 237               vectype = get_vectype_for_scalar_type (scalar_type);
 238               if (!vectype)
 239                 {
 240                   if (dump_enabled_p ())
 241                     {
 242                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 243                                        "not vectorized: unsupported "
 244                                        "data-type ");
 245                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 246                                          scalar_type);
 247                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 248                     }
 249                   return false;
 250                 }
 251               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 252
 253               if (dump_enabled_p ())
 254                 {
 255                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 256                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 257                   dump_printf (MSG_NOTE, "\n");
 258                 }
 259
 260               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 261               if (dump_enabled_p ())
 262                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 263                                  nunits);
 264
 265               if (!vectorization_factor
 266                   || (nunits > vectorization_factor))
 267                 vectorization_factor = nunits;
 268             }
 269         }
 270
           /* SI is advanced explicitly at the end of an iteration, and only when
              no pattern stmt or pattern def stmt of the current stmt is still
              waiting to be analyzed.  */
 271       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 272            !gsi_end_p (si) || analyze_pattern_stmt;)
 273         {
 274           tree vf_vectype;
 275
 276           if (analyze_pattern_stmt)
 277             stmt = pattern_stmt;
 278           else
 279             stmt = gsi_stmt (si);
 280
 281           stmt_info = vinfo_for_stmt (stmt);
 282
 283           if (dump_enabled_p ())
 284             {
 285               dump_printf_loc (MSG_NOTE, vect_location,
 286                                "==> examining statement: ");
 287               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 288             }
 289
 290           gcc_assert (stmt_info);
 291
 292           /* Skip stmts which do not need to be vectorized.  */
 293           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 294                && !STMT_VINFO_LIVE_P (stmt_info))
 295               || gimple_clobber_p (stmt))
 296             {
 297               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 298                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 299                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 300                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 301                 {
 302                   stmt = pattern_stmt;
 303                   stmt_info = vinfo_for_stmt (pattern_stmt);
 304                   if (dump_enabled_p ())
 305                     {
 306                       dump_printf_loc (MSG_NOTE, vect_location,
 307                                        "==> examining pattern statement: ");
 308                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 309                     }
 310                 }
 311               else
 312                 {
 313                   if (dump_enabled_p ())
 314                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 315                   gsi_next (&si);
 316                   continue;
 317                 }
 318             }
               /* STMT itself is analyzed now; remember to analyze its relevant or
                  live pattern stmt in the next iteration.  */
 319           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 320                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 321                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 322                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 323             analyze_pattern_stmt = true;
 324
 325           /* If a pattern statement has def stmts, analyze them too.  */
 326           if (is_pattern_stmt_p (stmt_info))
 327             {
 328               if (pattern_def_seq == NULL)
 329                 {
 330                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 331                   pattern_def_si = gsi_start (pattern_def_seq);
 332                 }
 333               else if (!gsi_end_p (pattern_def_si))
 334                 gsi_next (&pattern_def_si);
 335               if (pattern_def_seq != NULL)
 336                 {
 337                   gimple *pattern_def_stmt = NULL;
 338                   stmt_vec_info pattern_def_stmt_info = NULL;
 339
 340                   while (!gsi_end_p (pattern_def_si))
 341                     {
 342                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 343                       pattern_def_stmt_info
 344                         = vinfo_for_stmt (pattern_def_stmt);
 345                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 346                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 347                         break;
 348                       gsi_next (&pattern_def_si);
 349                     }
 350
 351                   if (!gsi_end_p (pattern_def_si))
 352                     {
 353                       if (dump_enabled_p ())
 354                         {
 355                           dump_printf_loc (MSG_NOTE, vect_location,
 356                                            "==> examining pattern def stmt: ");
 357                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 358                                             pattern_def_stmt, 0);
 359                         }
 360
 361                       stmt = pattern_def_stmt;
 362                       stmt_info = pattern_def_stmt_info;
 363                     }
 364                   else
 365                     {
 366                       pattern_def_si = gsi_none ();
 367                       analyze_pattern_stmt = false;
 368                     }
 369                 }
 370               else
 371                 analyze_pattern_stmt = false;
 372             }
 373
 374           if (gimple_get_lhs (stmt) == NULL_TREE
 375               /* MASK_STORE has no lhs, but is ok.  */
 376               && (!is_gimple_call (stmt)
 377                   || !gimple_call_internal_p (stmt)
 378                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 379             {
 380               if (is_gimple_call (stmt))
 381                 {
 382                   /* Ignore calls with no lhs.  These must be calls to
 383                      #pragma omp simd functions, and what vectorization factor
 384                      it really needs can't be determined until
 385                      vectorizable_simd_clone_call.  */
 386                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 387                     {
 388                       pattern_def_seq = NULL;
 389                       gsi_next (&si);
 390                     }
 391                   continue;
 392                 }
 393               if (dump_enabled_p ())
 394                 {
 395                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 396                                    "not vectorized: irregular stmt.");
 397                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 398                                     0);
 399                 }
 400               return false;
 401             }
 402
 403           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 404             {
 405               if (dump_enabled_p ())
 406                 {
 407                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 408                                    "not vectorized: vector stmt in loop:");
 409                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 410                 }
 411               return false;
 412             }
 413
 414           bool_result = false;
 415
 416           if (STMT_VINFO_VECTYPE (stmt_info))
 417             {
 418               /* The only case when a vectype had been already set is for stmts
 419                  that contain a dataref, or for "pattern-stmts" (stmts
 420                  generated by the vectorizer to represent/replace a certain
 421                  idiom).  */
 422               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 423                           || is_pattern_stmt_p (stmt_info)
 424                           || !gsi_end_p (pattern_def_si));
 425               vectype = STMT_VINFO_VECTYPE (stmt_info);
 426             }
 427           else
 428             {
 429               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 430               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 431                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 432               else
 433                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 434
 435               /* Bool ops don't participate in vectorization factor
 436                  computation.  For comparison use compared types to
 437                  compute a factor.  */
 438               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
 439                   && is_gimple_assign (stmt)
 440                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 441                 {
 442                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 443                       || STMT_VINFO_LIVE_P (stmt_info))
 444                     mask_producers.safe_push (stmt_info);
 445                   bool_result = true;
 446
 447                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 448                       == tcc_comparison
 449                       && !VECT_SCALAR_BOOLEAN_TYPE_P
 450                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 451                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 452                   else
 453                     {
 454                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 455                         {
 456                           pattern_def_seq = NULL;
 457                           gsi_next (&si);
 458                         }
 459                       continue;
 460                     }
 461                 }
 462
 463               if (dump_enabled_p ())
 464                 {
 465                   dump_printf_loc (MSG_NOTE, vect_location,
 466                                    "get vectype for scalar type:  ");
 467                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 468                   dump_printf (MSG_NOTE, "\n");
 469                 }
 470               vectype = get_vectype_for_scalar_type (scalar_type);
 471               if (!vectype)
 472                 {
 473                   if (dump_enabled_p ())
 474                     {
 475                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 476                                        "not vectorized: unsupported "
 477                                        "data-type ");
 478                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 479                                          scalar_type);
 480                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 481                     }
 482                   return false;
 483                 }
 484
                   /* For a boolean result the vectype is not set here; recorded
                      mask producers get theirs in the final loop of this
                      function.  */
 485               if (!bool_result)
 486                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 487
 488               if (dump_enabled_p ())
 489                 {
 490                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 491                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 492                   dump_printf (MSG_NOTE, "\n");
 493                 }
 494             }
 495
 496           /* Don't try to compute VF from scalar types if the stmt
 497              produces a boolean vector.  Use the result vectype instead.  */
 498           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 499             vf_vectype = vectype;
 500           else
 501             {
 502               /* The vectorization factor is according to the smallest
 503                  scalar type (or the largest vector size, but we only
 504                  support one vector size per loop).  */
 505               if (!bool_result)
 506                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 507                                                              &dummy);
 508               if (dump_enabled_p ())
 509                 {
 510                   dump_printf_loc (MSG_NOTE, vect_location,
 511                                    "get vectype for scalar type:  ");
 512                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 513                   dump_printf (MSG_NOTE, "\n");
 514                 }
 515               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 516             }
 517           if (!vf_vectype)
 518             {
 519               if (dump_enabled_p ())
 520                 {
 521                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 522                                    "not vectorized: unsupported data-type ");
 523                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 524                                      scalar_type);
 525                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 526                 }
 527               return false;
 528             }
 529
 530           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 531                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 532             {
 533               if (dump_enabled_p ())
 534                 {
 535                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 536                                    "not vectorized: different sized vector "
 537                                    "types in statement, ");
 538                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 539                                      vectype);
 540                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 541                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 542                                      vf_vectype);
 543                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 544                 }
 545               return false;
 546             }
 547
 548           if (dump_enabled_p ())
 549             {
 550               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 551               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 552               dump_printf (MSG_NOTE, "\n");
 553             }
 554
 555           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 556           if (dump_enabled_p ())
 557             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 558           if (!vectorization_factor
 559               || (nunits > vectorization_factor))
 560             vectorization_factor = nunits;
 561
 562           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 563             {
 564               pattern_def_seq = NULL;
 565               gsi_next (&si);
 566             }
 567         }
 568     }
 569
 570   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 571   if (dump_enabled_p ())
 572     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 573                      vectorization_factor);
 574   if (vectorization_factor <= 1)
 575     {
 576       if (dump_enabled_p ())
 577         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 578                          "not vectorized: unsupported data-type\n");
 579       return false;
 580     }
 581   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 582
       /* Compute and record a mask vectype for each mask producer collected
          above.  */
 583   for (i = 0; i < mask_producers.length (); i++)
 584     {
 585       tree mask_type = NULL;
 586
 587       stmt = STMT_VINFO_STMT (mask_producers[i]);
 588
 589       if (is_gimple_assign (stmt)
 590           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 591           && !VECT_SCALAR_BOOLEAN_TYPE_P
 592                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 593         {
 594           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 595           mask_type = get_mask_type_for_scalar_type (scalar_type);
 596
 597           if (!mask_type)
 598             {
 599               if (dump_enabled_p ())
 600                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 601                                  "not vectorized: unsupported mask\n");
 602               return false;
 603             }
 604         }
 605       else
 606         {
 607           tree rhs;
 608           ssa_op_iter iter;
 609           gimple *def_stmt;
 610           enum vect_def_type dt;
 611
 612           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 613             {
 614               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 615                                        &def_stmt, &dt, &vectype))
 616                 {
 617                   if (dump_enabled_p ())
 618                     {
 619                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 620                                        "not vectorized: can't compute mask type "
 621                                        "for statement, ");
 622                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 623                                         0);
 624                     }
 625                   return false;
 626                 }
 627
 628               /* No vectype probably means external definition.
 629                  Allow it in case there is another operand which
 630                  allows to determine mask type.  */
 631               if (!vectype)
 632                 continue;
 633
 634               if (!mask_type)
 635                 mask_type = vectype;
 636               else if (TYPE_VECTOR_SUBPARTS (mask_type)
 637                        != TYPE_VECTOR_SUBPARTS (vectype))
 638                 {
 639                   if (dump_enabled_p ())
 640                     {
 641                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 642                                        "not vectorized: different sized masks "
 643                                        "types in statement, ");
 644                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 645                                          mask_type);
 646                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 647                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 648                                          vectype);
 649                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 650                     }
 651                   return false;
 652                 }
 653               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 654                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 655                 {
 656                   if (dump_enabled_p ())
 657                     {
 658                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 659                                        "not vectorized: mixed mask and "
 660                                        "nonmask vector types in statement, ");
 661                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 662                                          mask_type);
 663                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 664                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 665                                          vectype);
 666                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 667                     }
 668                   return false;
 669                 }
 670             }
 671
 672           /* We may compare boolean value loaded as vector of integers.
 673              Fix mask_type in such case.  */
 674           if (mask_type
 675               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 676               && gimple_code (stmt) == GIMPLE_ASSIGN
 677               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 678             mask_type = build_same_sized_truth_vector_type (mask_type);
 679         }
 680
 681       /* No mask_type should mean loop invariant predicate.
 682          This is probably a subject for optimization in
 683          if-conversion.  */
 684       if (!mask_type)
 685         {
 686           if (dump_enabled_p ())
 687             {
 688               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 689                                "not vectorized: can't compute mask type "
 690                                "for statement, ");
 691               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 692                                 0);
 693             }
 694           return false;
 695         }
 696
 697       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 698     }
 699
 700   return true;
 701 }
 702
 703
 704 /* Function vect_is_simple_iv_evolution.
 705
 706    Return true if ACCESS_FN has a "simple" evolution in loop number LOOP_NB.
 707    *INIT and *STEP are set to its initial value and step as soon as a
 708    non-chrec step is found, even if that step is then rejected.  */
 709
 710 static bool
 711 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 712                              tree * step)
 713 {
 714   tree step_expr = evolution_part_in_loop_num (access_fn, loop_nb);
 715
 716   /* No evolution in this loop, or an evolution part that is itself a chrec
 717      (a polynomial of degree >= 2), is not "simple".  */
 718   if (step_expr == NULL_TREE || tree_is_chrec (step_expr))
 719     return false;
 720
 721   tree init_expr
 722     = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 723
 724   if (dump_enabled_p ())
 725     {
 726       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 727       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 728       dump_printf (MSG_NOTE, ",  init: ");
 729       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 730       dump_printf (MSG_NOTE, "\n");
 731     }
 732
 733   *init = init_expr;
 734   *step = step_expr;
 735
 736   /* The step is usable if it is an integer constant, a real constant when
 737      flag_associative_math is set, or an SSA name of suitable type that is
 738      not defined inside the loop.  */
 739   bool step_known = false;
 740   if (TREE_CODE (step_expr) == INTEGER_CST)
 741     step_known = true;
 742   else if (TREE_CODE (step_expr) == REAL_CST)
 743     step_known = flag_associative_math != 0;
 744   else if (TREE_CODE (step_expr) == SSA_NAME)
 745     {
 746       basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr));
 747       tree step_type = TREE_TYPE (step_expr);
 748       if (!def_bb || !flow_bb_inside_loop_p (get_loop (cfun, loop_nb), def_bb))
 749         step_known = (INTEGRAL_TYPE_P (step_type)
 750                       || (SCALAR_FLOAT_TYPE_P (step_type)
 751                           && flag_associative_math));
 752     }
 753   if (step_known)
 754     return true;
 755
 756   if (dump_enabled_p ())
 757     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 758                      "step unknown.\n");
 759   return false;
 760 }
 761
 762 /* Function vect_analyze_scalar_cycles_1.
 763
 764    Examine the cross iteration def-use cycles of scalar variables
 765    in LOOP.  LOOP_VINFO represents the loop that is now being
 766    considered for vectorization (can be LOOP, or an outer-loop
 767    enclosing LOOP).  */
 768
 769 static void
 770 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 771 {
 772   basic_block bb = loop->header;
 773   tree init, step;
 774   auto_vec<gimple *, 64> worklist;
 775   gphi_iterator gsi;
 776   bool double_reduc;
 777
 778   if (dump_enabled_p ())
 779     dump_printf_loc (MSG_NOTE, vect_location,
 780                      "=== vect_analyze_scalar_cycles ===\n");
 781
 782   /* First - identify all inductions.  Reduction detection assumes that all the
 783      inductions have been identified, therefore, this order must not be
 784      changed.  */
 785   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 786     {
 787       gphi *phi = gsi.phi ();
 788       tree access_fn = NULL;
 789       tree def = PHI_RESULT (phi);
 790       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 791
 792       if (dump_enabled_p ())
 793         {
 794           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 795           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 796         }
 797
 798       /* Skip virtual phi's.  The data dependences that are associated with
 799          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 800       if (virtual_operand_p (def))
 801         continue;
 802
 803       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 804
 805       /* Analyze the evolution function.  */
 806       access_fn = analyze_scalar_evolution (loop, def);
 807       if (access_fn)
 808         {
 809           STRIP_NOPS (access_fn);
 810           if (dump_enabled_p ())
 811             {
 812               dump_printf_loc (MSG_NOTE, vect_location,
 813                                "Access function of PHI: ");
 814               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 815               dump_printf (MSG_NOTE, "\n");
 816             }
 817           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 818             = initial_condition_in_loop_num (access_fn, loop->num);
 819           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 820             = evolution_part_in_loop_num (access_fn, loop->num);
 821         }
 822
 823       if (!access_fn
 824           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 825           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 826               && TREE_CODE (step) != INTEGER_CST))
 827         {
 828           worklist.safe_push (phi);
 829           continue;
 830         }
 831
 832       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 833                   != NULL_TREE);
 834       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 835
 836       if (dump_enabled_p ())
 837         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 838       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 839     }
 840
 841
 842   /* Second - identify all reductions and nested cycles.  */
 843   while (worklist.length () > 0)
 844     {
 845       gimple *phi = worklist.pop ();
 846       tree def = PHI_RESULT (phi);
 847       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 848       gimple *reduc_stmt;
 849
 850       if (dump_enabled_p ())
 851         {
 852           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 853           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 854         }
 855
 856       gcc_assert (!virtual_operand_p (def)
 857                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 858
 859       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 860                                                 &double_reduc, false);
 861       if (reduc_stmt)
 862         {
 863           if (double_reduc)
 864             {
 865               if (dump_enabled_p ())
 866                 dump_printf_loc (MSG_NOTE, vect_location,
 867                                  "Detected double reduction.\n");
 868
 869               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 870               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 871                                                     vect_double_reduction_def;
 872             }
 873           else
 874             {
 875               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 876                 {
 877                   if (dump_enabled_p ())
 878                     dump_printf_loc (MSG_NOTE, vect_location,
 879                                      "Detected vectorizable nested cycle.\n");
 880
 881                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 882                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 883                                                              vect_nested_cycle;
 884                 }
 885               else
 886                 {
 887                   if (dump_enabled_p ())
 888                     dump_printf_loc (MSG_NOTE, vect_location,
 889                                      "Detected reduction.\n");
 890
 891                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 892                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 893                                                            vect_reduction_def;
 894                   /* Store the reduction cycles for possible vectorization in
 895                      loop-aware SLP if it was not detected as reduction
 896                      chain.  */
 897                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 898                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 899                 }
 900             }
 901         }
 902       else
 903         if (dump_enabled_p ())
 904           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 905                            "Unknown def-use cycle pattern.\n");
 906     }
 907 }
 908
 909
 910 /* Function vect_analyze_scalar_cycles.
 911
 912    Examine the cross iteration def-use cycles of scalar variables, by
 913    analyzing the loop-header PHIs of scalar variables.  Classify each
 914    cycle as one of the following: invariant, induction, reduction, unknown.
 915    We do that for the loop represented by LOOP_VINFO, and also to its
 916    inner-loop, if exists.
 917    Examples for scalar cycles:
 918
 919    Example1: reduction:
 920
 921               loop1:
 922               for (i=0; i<N; i++)
 923                  sum += a[i];
 924
 925    Example2: induction:
 926
 927               loop2:
 928               for (i=0; i<N; i++)
 929                  a[i] = i;  */
 930
 931 static void
 932 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 933 {
 934   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 935
 936   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 937
 938   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 939      Reductions in such inner-loop therefore have different properties than
 940      the reductions in the nest that gets vectorized:
 941      1. When vectorized, they are executed in the same order as in the original
 942         scalar loop, so we can't change the order of computation when
 943         vectorizing them.
 944      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 945         current checks are too strict.  */
 946
 947   if (loop->inner)
 948     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 949 }
 950
 951 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 952
 953 static void
 954 vect_fixup_reduc_chain (gimple *stmt)
 955 {
 956   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 957   gimple *stmtp;
 958   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 959               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 960   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 961   do
 962     {
 963       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 964       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 965       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 966       if (stmt)
 967         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 968           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 969     }
 970   while (stmt);
 971   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 972 }
 973
 974 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 975
 976 static void
 977 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 978 {
 979   gimple *first;
 980   unsigned i;
 981
 982   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 983     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 984       {
 985         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 986         while (next)
 987           {
 988             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 989               break;
 990             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 991           }
 992         /* If not all stmt in the chain are patterns try to handle
 993            the chain without patterns.  */
 994         if (! next)
 995           {
 996             vect_fixup_reduc_chain (first);
 997             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 998               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 999           }
1000       }
1001 }
1002
1003 /* Function vect_get_loop_niters.
1004
1005    Determine how many iterations the loop is executed and place it
1006    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1007    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1008    niter information holds in ASSUMPTIONS.
1009
1010    Return the loop exit condition.  */
1011
1012
1013 static gcond *
1014 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1015                       tree *number_of_iterations, tree *number_of_iterationsm1)
1016 {
1017   edge exit = single_exit (loop);
1018   struct tree_niter_desc niter_desc;
1019   tree niter_assumptions, niter, may_be_zero;
1020   gcond *cond = get_loop_exit_condition (loop);
1021
1022   *assumptions = boolean_true_node;
1023   *number_of_iterationsm1 = chrec_dont_know;
1024   *number_of_iterations = chrec_dont_know;
1025   if (dump_enabled_p ())
1026     dump_printf_loc (MSG_NOTE, vect_location,
1027                      "=== get_loop_niters ===\n");
1028
1029   if (!exit)
1030     return cond;
1031
1032   niter = chrec_dont_know;
1033   may_be_zero = NULL_TREE;
1034   niter_assumptions = boolean_true_node;
1035   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1036       || chrec_contains_undetermined (niter_desc.niter))
1037     return cond;
1038
1039   niter_assumptions = niter_desc.assumptions;
1040   may_be_zero = niter_desc.may_be_zero;
1041   niter = niter_desc.niter;
1042
1043   if (may_be_zero && integer_zerop (may_be_zero))
1044     may_be_zero = NULL_TREE;
1045
1046   if (may_be_zero)
1047     {
1048       if (COMPARISON_CLASS_P (may_be_zero))
1049         {
1050           /* Try to combine may_be_zero with assumptions, this can simplify
1051              computation of niter expression.  */
1052           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1053             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1054                                              niter_assumptions,
1055                                              fold_build1 (TRUTH_NOT_EXPR,
1056                                                           boolean_type_node,
1057                                                           may_be_zero));
1058           else
1059             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1060                                  build_int_cst (TREE_TYPE (niter), 0), niter);
1061
1062           may_be_zero = NULL_TREE;
1063         }
1064       else if (integer_nonzerop (may_be_zero))
1065         {
1066           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1067           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1068           return cond;
1069         }
1070       else
1071         return cond;
1072     }
1073
1074   *assumptions = niter_assumptions;
1075   *number_of_iterationsm1 = niter;
1076
1077   /* We want the number of loop header executions which is the number
1078      of latch executions plus one.
1079      ???  For UINT_MAX latch executions this number overflows to zero
1080      for loops like do { n++; } while (n != 0);  */
1081   if (niter && !chrec_contains_undetermined (niter))
1082     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1083                           build_int_cst (TREE_TYPE (niter), 1));
1084   *number_of_iterations = niter;
1085
1086   return cond;
1087 }
1088
1089 /* Function bb_in_loop_p
1090
1091    Used as predicate for dfs order traversal of the loop bbs.  */
1092
1093 static bool
1094 bb_in_loop_p (const_basic_block bb, const void *data)
1095 {
1096   const struct loop *const loop = (const struct loop *)data;
1097   if (flow_bb_inside_loop_p (loop, bb))
1098     return true;
1099   return false;
1100 }
1101
1102
1103 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1104    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1105
1106 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1107   : vec_info (vec_info::loop, init_cost (loop_in)),
1108     loop (loop_in),
1109     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1110     num_itersm1 (NULL_TREE),
1111     num_iters (NULL_TREE),
1112     num_iters_unchanged (NULL_TREE),
1113     num_iters_assumptions (NULL_TREE),
1114     th (0),
1115     vectorization_factor (0),
1116     max_vectorization_factor (0),
1117     unaligned_dr (NULL),
1118     peeling_for_alignment (0),
1119     ptr_mask (0),
1120     slp_unrolling_factor (1),
1121     single_scalar_iteration_cost (0),
1122     vectorizable (false),
1123     peeling_for_gaps (false),
1124     peeling_for_niter (false),
1125     operands_swapped (false),
1126     no_data_dependencies (false),
1127     has_mask_store (false),
1128     scalar_loop (NULL),
1129     orig_loop_info (NULL)
1130 {
1131   /* Create/Update stmt_info for all stmts in the loop.  */
1132   basic_block *body = get_loop_body (loop);
1133   for (unsigned int i = 0; i < loop->num_nodes; i++)
1134     {
1135       basic_block bb = body[i];
1136       gimple_stmt_iterator si;
1137
1138       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1139         {
1140           gimple *phi = gsi_stmt (si);
1141           gimple_set_uid (phi, 0);
1142           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1143         }
1144
1145       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1146         {
1147           gimple *stmt = gsi_stmt (si);
1148           gimple_set_uid (stmt, 0);
1149           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1150         }
1151     }
1152   free (body);
1153
1154   /* CHECKME: We want to visit all BBs before their successors (except for
1155      latch blocks, for which this assertion wouldn't hold).  In the simple
1156      case of the loop forms we allow, a dfs order of the BBs would the same
1157      as reversed postorder traversal, so we are safe.  */
1158
1159   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1160                                           bbs, loop->num_nodes, loop);
1161   gcc_assert (nbbs == loop->num_nodes);
1162 }
1163
1164
1165 /* Free all memory used by the _loop_vec_info, as well as all the
1166    stmt_vec_info structs of all the stmts in the loop.  */
1167
1168 _loop_vec_info::~_loop_vec_info ()
1169 {
1170   int nbbs;
1171   gimple_stmt_iterator si;
1172   int j;
1173
1174   nbbs = loop->num_nodes;
1175   for (j = 0; j < nbbs; j++)
1176     {
1177       basic_block bb = bbs[j];
1178       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1179         free_stmt_vec_info (gsi_stmt (si));
1180
1181       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1182         {
1183           gimple *stmt = gsi_stmt (si);
1184
1185           /* We may have broken canonical form by moving a constant
1186              into RHS1 of a commutative op.  Fix such occurrences.  */
1187           if (operands_swapped && is_gimple_assign (stmt))
1188             {
1189               enum tree_code code = gimple_assign_rhs_code (stmt);
1190
1191               if ((code == PLUS_EXPR
1192                    || code == POINTER_PLUS_EXPR
1193                    || code == MULT_EXPR)
1194                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1195                 swap_ssa_operands (stmt,
1196                                    gimple_assign_rhs1_ptr (stmt),
1197                                    gimple_assign_rhs2_ptr (stmt));
1198               else if (code == COND_EXPR
1199                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1200                 {
1201                   tree cond_expr = gimple_assign_rhs1 (stmt);
1202                   enum tree_code cond_code = TREE_CODE (cond_expr);
1203
1204                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1205                     {
1206                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1207                                                                   0));
1208                       cond_code = invert_tree_comparison (cond_code,
1209                                                           honor_nans);
1210                       if (cond_code != ERROR_MARK)
1211                         {
1212                           TREE_SET_CODE (cond_expr, cond_code);
1213                           swap_ssa_operands (stmt,
1214                                              gimple_assign_rhs2_ptr (stmt),
1215                                              gimple_assign_rhs3_ptr (stmt));
1216                         }
1217                     }
1218                 }
1219             }
1220
1221           /* Free stmt_vec_info.  */
1222           free_stmt_vec_info (stmt);
1223           gsi_next (&si);
1224         }
1225     }
1226
1227   free (bbs);
1228
1229   loop->aux = NULL;
1230 }
1231
1232
1233 /* Calculate the cost of one scalar iteration of the loop.  */
1234 static void
1235 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1236 {
1237   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1238   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1239   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1240   int innerloop_iters, i;
1241
1242   /* Count statements in scalar loop.  Using this as scalar cost for a single
1243      iteration for now.
1244
1245      TODO: Add outer loop support.
1246
1247      TODO: Consider assigning different costs to different scalar
1248      statements.  */
1249
1250   /* FORNOW.  */
1251   innerloop_iters = 1;
1252   if (loop->inner)
1253     innerloop_iters = 50; /* FIXME */
1254
1255   for (i = 0; i < nbbs; i++)
1256     {
1257       gimple_stmt_iterator si;
1258       basic_block bb = bbs[i];
1259
1260       if (bb->loop_father == loop->inner)
1261         factor = innerloop_iters;
1262       else
1263         factor = 1;
1264
1265       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1266         {
1267           gimple *stmt = gsi_stmt (si);
1268           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1269
1270           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1271             continue;
1272
1273           /* Skip stmts that are not vectorized inside the loop.  */
1274           if (stmt_info
1275               && !STMT_VINFO_RELEVANT_P (stmt_info)
1276               && (!STMT_VINFO_LIVE_P (stmt_info)
1277                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1278               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1279             continue;
1280
1281           vect_cost_for_stmt kind;
1282           if (STMT_VINFO_DATA_REF (stmt_info))
1283             {
1284               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1285                kind = scalar_load;
1286              else
1287                kind = scalar_store;
1288             }
1289           else
1290             kind = scalar_stmt;
1291
1292           scalar_single_iter_cost
1293             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1294                                  factor, kind, stmt_info, 0, vect_prologue);
1295         }
1296     }
1297   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1298     = scalar_single_iter_cost;
1299 }
1300
1301
1302 /* Function vect_analyze_loop_form_1.
1303
1304    Verify that certain CFG restrictions hold, including:
1305    - the loop has a pre-header
1306    - the loop has a single entry and exit
1307    - the loop exit condition is simple enough
1308    - the number of iterations can be analyzed, i.e, a countable loop.  The
1309      niter could be analyzed under some assumptions.  */
1310
1311 bool
1312 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1313                           tree *assumptions, tree *number_of_iterationsm1,
1314                           tree *number_of_iterations, gcond **inner_loop_cond)
1315 {
1316   if (dump_enabled_p ())
1317     dump_printf_loc (MSG_NOTE, vect_location,
1318                      "=== vect_analyze_loop_form ===\n");
1319
1320   /* Different restrictions apply when we are considering an inner-most loop,
1321      vs. an outer (nested) loop.
1322      (FORNOW. May want to relax some of these restrictions in the future).  */
1323
1324   if (!loop->inner)
1325     {
1326       /* Inner-most loop.  We currently require that the number of BBs is
1327          exactly 2 (the header and latch).  Vectorizable inner-most loops
1328          look like this:
1329
1330                         (pre-header)
1331                            |
1332                           header <--------+
1333                            | |            |
1334                            | +--> latch --+
1335                            |
1336                         (exit-bb)  */
1337
1338       if (loop->num_nodes != 2)
1339         {
1340           if (dump_enabled_p ())
1341             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1342                              "not vectorized: control flow in loop.\n");
1343           return false;
1344         }
1345
1346       if (empty_block_p (loop->header))
1347         {
1348           if (dump_enabled_p ())
1349             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1350                              "not vectorized: empty loop.\n");
1351           return false;
1352         }
1353     }
1354   else
1355     {
1356       struct loop *innerloop = loop->inner;
1357       edge entryedge;
1358
1359       /* Nested loop. We currently require that the loop is doubly-nested,
1360          contains a single inner loop, and the number of BBs is exactly 5.
1361          Vectorizable outer-loops look like this:
1362
1363                         (pre-header)
1364                            |
1365                           header <---+
1366                            |         |
1367                           inner-loop |
1368                            |         |
1369                           tail ------+
1370                            |
1371                         (exit-bb)
1372
1373          The inner-loop has the properties expected of inner-most loops
1374          as described above.  */
1375
1376       if ((loop->inner)->inner || (loop->inner)->next)
1377         {
1378           if (dump_enabled_p ())
1379             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1380                              "not vectorized: multiple nested loops.\n");
1381           return false;
1382         }
1383
1384       if (loop->num_nodes != 5)
1385         {
1386           if (dump_enabled_p ())
1387             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1388                              "not vectorized: control flow in loop.\n");
1389           return false;
1390         }
1391
1392       entryedge = loop_preheader_edge (innerloop);
1393       if (entryedge->src != loop->header
1394           || !single_exit (innerloop)
1395           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1396         {
1397           if (dump_enabled_p ())
1398             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1399                              "not vectorized: unsupported outerloop form.\n");
1400           return false;
1401         }
1402
1403       /* Analyze the inner-loop.  */
1404       tree inner_niterm1, inner_niter, inner_assumptions;
1405       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1406                                       &inner_assumptions, &inner_niterm1,
1407                                       &inner_niter, NULL)
1408           /* Don't support analyzing niter under assumptions for inner
1409              loop.  */
1410           || !integer_onep (inner_assumptions))
1411         {
1412           if (dump_enabled_p ())
1413             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1414                              "not vectorized: Bad inner loop.\n");
1415           return false;
1416         }
1417
1418       if (!expr_invariant_in_loop_p (loop, inner_niter))
1419         {
1420           if (dump_enabled_p ())
1421             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1422                              "not vectorized: inner-loop count not"
1423                              " invariant.\n");
1424           return false;
1425         }
1426
1427       if (dump_enabled_p ())
1428         dump_printf_loc (MSG_NOTE, vect_location,
1429                          "Considering outer-loop vectorization.\n");
1430     }
1431
1432   if (!single_exit (loop)
1433       || EDGE_COUNT (loop->header->preds) != 2)
1434     {
1435       if (dump_enabled_p ())
1436         {
1437           if (!single_exit (loop))
1438             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1439                              "not vectorized: multiple exits.\n");
1440           else if (EDGE_COUNT (loop->header->preds) != 2)
1441             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1442                              "not vectorized: too many incoming edges.\n");
1443         }
1444       return false;
1445     }
1446
1447   /* We assume that the loop exit condition is at the end of the loop. i.e,
1448      that the loop is represented as a do-while (with a proper if-guard
1449      before the loop if needed), where the loop header contains all the
1450      executable statements, and the latch is empty.  */
1451   if (!empty_block_p (loop->latch)
1452       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1453     {
1454       if (dump_enabled_p ())
1455         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1456                          "not vectorized: latch block not empty.\n");
1457       return false;
1458     }
1459
1460   /* Make sure the exit is not abnormal.  */
1461   edge e = single_exit (loop);
1462   if (e->flags & EDGE_ABNORMAL)
1463     {
1464       if (dump_enabled_p ())
1465         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1466                          "not vectorized: abnormal loop exit edge.\n");
1467       return false;
1468     }
1469
1470   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1471                                      number_of_iterationsm1);
1472   if (!*loop_cond)
1473     {
1474       if (dump_enabled_p ())
1475         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1476                          "not vectorized: complicated exit condition.\n");
1477       return false;
1478     }
1479
1480   if (integer_zerop (*assumptions)
1481       || !*number_of_iterations
1482       || chrec_contains_undetermined (*number_of_iterations))
1483     {
1484       if (dump_enabled_p ())
1485         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1486                          "not vectorized: number of iterations cannot be "
1487                          "computed.\n");
1488       return false;
1489     }
1490
1491   if (integer_zerop (*number_of_iterations))
1492     {
1493       if (dump_enabled_p ())
1494         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1495                          "not vectorized: number of iterations = 0.\n");
1496       return false;
1497     }
1498
1499   return true;
1500 }
1501
1502 /* Return a new loop_vec_info for LOOP (also stored in LOOP->aux) if LOOP has
1503    a form the vectorizer can handle, otherwise return NULL.  */
1504
1505 loop_vec_info
1506 vect_analyze_loop_form (struct loop *loop)
1507 {
1508   gcond *main_cond, *inner_cond = NULL;
1509   tree niters, nitersm1, niter_assumptions;
1510
1511   bool form_ok = vect_analyze_loop_form_1 (loop, &main_cond,
1512                                            &niter_assumptions, &nitersm1,
1513                                            &niters, &inner_cond);
1514   if (!form_ok)
1515     return NULL;
1516
1517   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1518   LOOP_VINFO_NITERSM1 (loop_vinfo) = nitersm1;
1519   LOOP_VINFO_NITERS (loop_vinfo) = niters;
1520   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
1521   if (!integer_onep (niter_assumptions))
1522     {
1523       /* The iteration count is only valid under NITER_ASSUMPTIONS, so
1524          vectorizing means versioning the loop on them.  Discard whatever
1525          scev and the niter analyzer have computed so far.  */
1526       scev_reset_htab ();
1527       free_numbers_of_iterations_estimates (loop);
1528       /* Flag the loop so that subsequent scev and niter analyses are
1529          carried out under the assumptions.  */
1530       loop_constraint_set (loop, LOOP_C_FINITE);
1531       /* Remember the assumptions; they are needed for versioning.  */
1532       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = niter_assumptions;
1533     }
1534
1535   if (dump_enabled_p () && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1536     {
1537       dump_printf_loc (MSG_NOTE, vect_location,
1538                        "Symbolic number of iterations is ");
1539       dump_generic_expr (MSG_NOTE, TDF_DETAILS, niters);
1540       dump_printf (MSG_NOTE, "\n");
1541     }
1542
1543   stmt_vec_info cond_info = vinfo_for_stmt (main_cond);
1544   STMT_VINFO_TYPE (cond_info) = loop_exit_ctrl_vec_info_type;
1545   if (inner_cond)
1546     STMT_VINFO_TYPE (vinfo_for_stmt (inner_cond))
1547       = loop_exit_ctrl_vec_info_type;
1548
1549   gcc_assert (!loop->aux);
1550   loop->aux = loop_vinfo;
1551   return loop_vinfo;
1552 }
1553
1554
1555
1556 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1557    statements, update the vectorization factor.
     
        If every stmt that is relevant (or is a vectorizable cycle def) is a
        pure-SLP stmt, the vectorization factor becomes the SLP unrolling
        factor of the loop.  Otherwise it becomes the least common multiple of
        the current factor and the SLP unrolling factor.  The result is
        stored back into LOOP_VINFO.  */
1558
1559 static void
1560 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1561 {
1562   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1563   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1564   int nbbs = loop->num_nodes;
1565   unsigned int vectorization_factor;
1566   int i;
1567
1568   if (dump_enabled_p ())
1569     dump_printf_loc (MSG_NOTE, vect_location,
1570                      "=== vect_update_vf_for_slp ===\n");
1571
1572   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1573   gcc_assert (vectorization_factor != 0);
1574
1575   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1576      vectorization factor of the loop is the unrolling factor required by
1577      the SLP instances.  If that unrolling factor is 1, we say, that we
1578      perform pure SLP on loop - cross iteration parallelism is not
1579      exploited.  */
1580   bool only_slp_in_loop = true;
1581   for (i = 0; i < nbbs && only_slp_in_loop; i++)
1582     {
1583       basic_block bb = bbs[i];
1584       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1585            gsi_next (&si))
1586         {
1587           gimple *stmt = gsi_stmt (si);
1588           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
               /* If STMT is part of a pattern and has a related stmt, examine
                  the related stmt instead.  */
1589           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1590               && STMT_VINFO_RELATED_STMT (stmt_info))
1591             {
1592               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1593               stmt_info = vinfo_for_stmt (stmt);
1594             }
1595           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1596                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1597               && !PURE_SLP_STMT (stmt_info))
                 {
1598               /* STMT needs both SLP and loop-based vectorization.  One
                      such stmt settles the answer, so stop scanning.  */
1599               only_slp_in_loop = false;
                   break;
                 }
1600         }
1601     }
1602
1603   if (only_slp_in_loop)
1604     {
           if (dump_enabled_p ())
1605         dump_printf_loc (MSG_NOTE, vect_location,
1606                          "Loop contains only SLP stmts\n");
1607       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1608     }
1609   else
1610     {
           if (dump_enabled_p ())
1611         dump_printf_loc (MSG_NOTE, vect_location,
1612                          "Loop contains SLP and non-SLP stmts\n");
1613       vectorization_factor
1614         = least_common_multiple (vectorization_factor,
1615                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1616     }
1617
1618   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1619   if (dump_enabled_p ())
1620     dump_printf_loc (MSG_NOTE, vect_location,
1621                      "Updating vectorization factor to %d\n",
1622                      vectorization_factor);
1623 }
1624
1625 /* Function vect_analyze_loop_operations.
1626
1627    Scan the loop stmts and make sure they are all vectorizable.
     
        Every basic block of the loop in LOOP_VINFO is visited; its PHIs are
        checked first and then its statements (clobbers are skipped).  Returns
        false as soon as something unsupported is found, and also when nothing
        in the loop turned out to need vectorization; returns true
        otherwise.  */
1628
1629 static bool
1630 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1631 {
1632   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1633   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1634   int nbbs = loop->num_nodes;
1635   int i;
1636   stmt_vec_info stmt_info;
1637   bool need_to_vectorize = false;
1638   bool ok;
1639
1640   if (dump_enabled_p ())
1641     dump_printf_loc (MSG_NOTE, vect_location,
1642                      "=== vect_analyze_loop_operations ===\n");
1643
1644   for (i = 0; i < nbbs; i++)
1645     {
1646       basic_block bb = bbs[i];
1647
           /* First pass over BB: its PHI nodes.  */
1648       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1649            gsi_next (&si))
1650         {
1651           gphi *phi = si.phi ();
1652           ok = true;
1653
1654           stmt_info = vinfo_for_stmt (phi);
1655           if (dump_enabled_p ())
1656             {
1657               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1658               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1659             }
               /* PHIs of virtual operands are not checked here.  */
1660           if (virtual_operand_p (gimple_phi_result (phi)))
1661             continue;
1662
1663           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1664              (i.e., a phi in the tail of the outer-loop).  */
1665           if (! is_loop_header_bb_p (bb))
1666             {
1667               /* FORNOW: we currently don't support the case that these phis
1668                  are not used in the outerloop (unless it is double reduction,
1669                  i.e., this phi is vect_reduction_def), cause this case
1670                  requires to actually do something here.  */
1671               if (STMT_VINFO_LIVE_P (stmt_info)
1672                   && STMT_VINFO_DEF_TYPE (stmt_info)
1673                      != vect_double_reduction_def)
1674                 {
1675                   if (dump_enabled_p ())
1676                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1677                                      "Unsupported loop-closed phi in "
1678                                      "outer-loop.\n");
1679                   return false;
1680                 }
1681
1682               /* If PHI is used in the outer loop, we check that its operand
1683                  is defined in the inner loop.  */
1684               if (STMT_VINFO_RELEVANT_P (stmt_info))
1685                 {
1686                   tree phi_op;
1687                   gimple *op_def_stmt;
1688
                       /* Only a single-argument PHI whose argument is an
                          SSA name is accepted.  */
1689                   if (gimple_phi_num_args (phi) != 1)
1690                     return false;
1691
1692                   phi_op = PHI_ARG_DEF (phi, 0);
1693                   if (TREE_CODE (phi_op) != SSA_NAME)
1694                     return false;
1695
                       /* The defining stmt must be a real stmt inside LOOP
                          that has a stmt_vec_info.  */
1696                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1697                   if (gimple_nop_p (op_def_stmt)
1698                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1699                       || !vinfo_for_stmt (op_def_stmt))
1700                     return false;
1701
                       /* ... and it must be marked as used in the outer loop,
                          either directly or by a reduction.  */
1702                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1703                         != vect_used_in_outer
1704                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1705                            != vect_used_in_outer_by_reduction)
1706                     return false;
1707                 }
1708
1709               continue;
1710             }
1711
               /* From here on BB satisfies is_loop_header_bb_p.  */
1712           gcc_assert (stmt_info);
1713
1714           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1715                || STMT_VINFO_LIVE_P (stmt_info))
1716               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1717             {
1718               /* A scalar-dependence cycle that we don't support.  */
1719               if (dump_enabled_p ())
1720                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1721                                  "not vectorized: scalar dependence cycle.\n");
1722               return false;
1723             }
1724
               /* A relevant PHI means the loop needs vectorizing.  Inductions,
                  reductions and nested cycles that are not pure SLP are
                  checked by their vectorizable_* routine; for any other
                  relevant PHI OK stays true.  */
1725           if (STMT_VINFO_RELEVANT_P (stmt_info))
1726             {
1727               need_to_vectorize = true;
1728               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1729                   && ! PURE_SLP_STMT (stmt_info))
1730                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1731               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1732                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1733                        && ! PURE_SLP_STMT (stmt_info))
1734                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1735             }
1736
               /* A live PHI must additionally pass the live-operation check.  */
1737           if (ok && STMT_VINFO_LIVE_P (stmt_info))
1738             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1739
1740           if (!ok)
1741             {
1742               if (dump_enabled_p ())
1743                 {
1744                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1745                                    "not vectorized: relevant phi not "
1746                                    "supported: ");
1747                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1748                 }
1749               return false;
1750             }
1751         }
1752
           /* Second pass over BB: its statements.  Clobbers are skipped; any
              other stmt must pass vect_analyze_stmt, which is also handed
              NEED_TO_VECTORIZE so that it can update it.  */
1753       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1754            gsi_next (&si))
1755         {
1756           gimple *stmt = gsi_stmt (si);
1757           if (!gimple_clobber_p (stmt)
1758               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1759             return false;
1760         }
1761     } /* bbs */
1762
1763   /* All operations in the loop are either irrelevant (deal with loop
1764      control, or dead), or only used outside the loop and can be moved
1765      out of the loop (e.g. invariants, inductions).  The loop can be
1766      optimized away by scalar optimizations.  We're better off not
1767      touching this loop.  */
1768   if (!need_to_vectorize)
1769     {
1770       if (dump_enabled_p ())
1771         dump_printf_loc (MSG_NOTE, vect_location,
1772                          "All the computation can be taken out of the loop.\n");
1773       if (dump_enabled_p ())
1774         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1775                          "not vectorized: redundant loop. no profit to "
1776                          "vectorize.\n");
1777       return false;
1778     }
1779
1780   return true;
1781 }
1782
1783
1784 /* Function vect_analyze_loop_2.
1785
1786    Apply a set of analyses on the loop described by LOOP_VINFO.  The
1787    different analyses will record information in the loop_vec_info
1788    struct.
     
        Returns true if the loop is OK to vectorize and false otherwise.
        FATAL is set to true on entry and reset to false once the first
        group of checks (the ones described below as independent of the
        vector size) has succeeded, so after a false return it tells whether
        the failure happened within that first group.
     
        When a later check fails via 'goto again' and SLP had been chosen,
        the SLP state may be rolled back and the analysis restarted from
        'start_over' with SLP disabled.  */
1789 static bool
1790 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1791 {
1792   bool ok;
1793   int max_vf = MAX_VECTORIZATION_FACTOR;
1794   int min_vf = 2;
1795   unsigned int n_stmts = 0;
1796
1797   /* The first group of checks is independent of the vector size.  */
1798   fatal = true;
1799
1800   /* Find all data references in the loop (which correspond to vdefs/vuses)
1801      and analyze their evolution in the loop.  */
1802
1803   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1804
1805   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1806   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1807     {
1808       if (dump_enabled_p ())
1809         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1810                          "not vectorized: loop nest containing two "
1811                          "or more consecutive inner loops cannot be "
1812                          "vectorized\n");
1813       return false;
1814     }
1815
       /* Collect the data references of every non-debug stmt and count those
          stmts in N_STMTS (passed to vect_analyze_slp below).  */
1816   for (unsigned i = 0; i < loop->num_nodes; i++)
1817     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1818          !gsi_end_p (gsi); gsi_next (&gsi))
1819       {
1820         gimple *stmt = gsi_stmt (gsi);
1821         if (is_gimple_debug (stmt))
1822           continue;
1823         ++n_stmts;
1824         if (!find_data_references_in_stmt (loop, stmt,
1825                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1826           {
                 /* The failure is tolerated only for a call, in a loop with
                    safelen set, to a function that has simd clones and whose
                    arguments and lhs carry no data references themselves.  */
1827             if (is_gimple_call (stmt) && loop->safelen)
1828               {
1829                 tree fndecl = gimple_call_fndecl (stmt), op;
1830                 if (fndecl != NULL_TREE)
1831                   {
1832                     cgraph_node *node = cgraph_node::get (fndecl);
1833                     if (node != NULL && node->simd_clones != NULL)
1834                       {
1835                         unsigned int j, n = gimple_call_num_args (stmt);
1836                         for (j = 0; j < n; j++)
1837                           {
1838                             op = gimple_call_arg (stmt, j);
1839                             if (DECL_P (op)
1840                                 || (REFERENCE_CLASS_P (op)
1841                                     && get_base_address (op)))
1842                               break;
1843                           }
1844                         op = gimple_call_lhs (stmt);
1845                         /* Ignore #pragma omp declare simd functions
1846                            if they don't have data references in the
1847                            call stmt itself.  */
1848                         if (j == n
1849                             && !(op
1850                                  && (DECL_P (op)
1851                                      || (REFERENCE_CLASS_P (op)
1852                                          && get_base_address (op)))))
1853                           continue;
1854                       }
1855                   }
1856               }
1857             if (dump_enabled_p ())
1858               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1859                                "not vectorized: loop contains function "
1860                                "calls or data references that cannot "
1861                                "be analyzed\n");
1862             return false;
1863           }
1864       }
1865
1866   /* Analyze the data references and also adjust the minimal
1867      vectorization factor according to the loads and stores.  */
1868
1869   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1870   if (!ok)
1871     {
1872       if (dump_enabled_p ())
1873         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1874                          "bad data references.\n");
1875       return false;
1876     }
1877
1878   /* Classify all cross-iteration scalar data-flow cycles.
1879      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1880   vect_analyze_scalar_cycles (loop_vinfo);
1881
1882   vect_pattern_recog (loop_vinfo);
1883
1884   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1885
1886   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1887      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1888
1889   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1890   if (!ok)
1891     {
1892       if (dump_enabled_p ())
1893         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1894                          "bad data access.\n");
1895       return false;
1896     }
1897
1898   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1899
1900   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1901   if (!ok)
1902     {
1903       if (dump_enabled_p ())
1904         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1905                          "unexpected pattern.\n");
1906       return false;
1907     }
1908
1909   /* The rest of the analysis below depends on the vector size in some
          way, so failures from here on are no longer flagged as fatal.  */
1910   fatal = false;
1911
1912   /* Analyze data dependences between the data-refs in the loop
1913      and adjust the maximum vectorization factor according to
1914      the dependences.
1915      FORNOW: fail at the first data dependence that we encounter.  */
1916
1917   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1918   if (!ok
1919       || max_vf < min_vf)
1920     {
1921       if (dump_enabled_p ())
1922             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1923                              "bad data dependence.\n");
1924       return false;
1925     }
1926   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1927
1928   ok = vect_determine_vectorization_factor (loop_vinfo);
1929   if (!ok)
1930     {
1931       if (dump_enabled_p ())
1932         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1933                          "can't determine vectorization factor.\n");
1934       return false;
1935     }
1936   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1937     {
1938       if (dump_enabled_p ())
1939         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1940                          "bad data dependence.\n");
1941       return false;
1942     }
1943
1944   /* Compute the scalar iteration cost.  */
1945   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1946
       /* Remember the vectorization factor as it is before any SLP decision;
          it is restored at 'again' when retrying with SLP disabled.  */
1947   int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
       /* NOTE(review): these are declared here without initializers,
          presumably so that the 'goto again' jumps below do not cross an
          initialized declaration - confirm.  */
1948   HOST_WIDE_INT estimated_niter;
1949   unsigned th;
1950   int min_scalar_loop_bound;
1951
1952   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1953   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1954   if (!ok)
1955     return false;
1956
1957   /* If there are any SLP instances mark them as pure_slp.  */
1958   bool slp = vect_make_slp_decision (loop_vinfo);
1959   if (slp)
1960     {
1961       /* Find stmts that need to be both vectorized and SLPed.  */
1962       vect_detect_hybrid_slp (loop_vinfo);
1963
1964       /* Update the vectorization factor based on the SLP decision.  */
1965       vect_update_vf_for_slp (loop_vinfo);
1966     }
1967
1968   /* This is the point where we can re-start analysis with SLP forced off.  */
1969 start_over:
1970
1971   /* Now the vectorization factor is final.  */
1972   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1973   gcc_assert (vectorization_factor != 0);
1974
1975   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1976     dump_printf_loc (MSG_NOTE, vect_location,
1977                      "vectorization_factor = %d, niters = "
1978                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1979                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1980
       /* Give up if the known iteration count, or the likely maximum when one
          is available (not -1), is below the vectorization factor.  */
1981   HOST_WIDE_INT max_niter
1982     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1983   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1984        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1985       || (max_niter != -1
1986           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1987     {
1988       if (dump_enabled_p ())
1989         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1990                          "not vectorized: iteration count smaller than "
1991                          "vectorization factor.\n");
1992       return false;
1993     }
1994
1995   /* Analyze the alignment of the data-refs in the loop.
1996      Fail if a data reference is found that cannot be vectorized.  */
1997
1998   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1999   if (!ok)
2000     {
2001       if (dump_enabled_p ())
2002         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2003                          "bad data alignment.\n");
2004       return false;
2005     }
2006
2007   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2008      It is important to call pruning after vect_analyze_data_ref_accesses,
2009      since we use grouping information gathered by interleaving analysis.  */
2010   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2011   if (!ok)
2012     return false;
2013
2014   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2015      vectorization.  */
2016   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2017     {
2018     /* This pass will decide on using loop versioning and/or loop peeling in
2019        order to enhance the alignment of data references in the loop.  */
2020     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2021     if (!ok)
2022       {
2023         if (dump_enabled_p ())
2024           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2025                            "bad data alignment.\n");
2026         return false;
2027       }
2028     }
2029
2030   if (slp)
2031     {
2032       /* Analyze operations in the SLP instances.  Note this may
2033          remove unsupported SLP instances which makes the above
2034          SLP kind detection invalid.  */
2035       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2036       vect_slp_analyze_operations (loop_vinfo);
2037       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2038         goto again;
2039     }
2040
2041   /* Scan all the remaining operations in the loop that are not subject
2042      to SLP and make sure they are vectorizable.  */
2043   ok = vect_analyze_loop_operations (loop_vinfo);
2044   if (!ok)
2045     {
2046       if (dump_enabled_p ())
2047         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2048                          "bad operation or unsupported loop bound.\n");
2049       return false;
2050     }
2051
2052   /* If epilog loop is required because of data accesses with gaps,
2053      one additional iteration needs to be peeled.  Check if there is
2054      enough iterations for vectorization.  */
2055   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2056       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2057     {
2058       int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2059       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2060
2061       if (wi::to_widest (scalar_niters) < vf)
2062         {
2063           if (dump_enabled_p ())
2064             dump_printf_loc (MSG_NOTE, vect_location,
2065                              "loop has no enough iterations to support"
2066                              " peeling for gaps.\n");
2067           return false;
2068         }
2069     }
2070
2071   /* Analyze cost.  Decide if worth while to vectorize.  */
2072   int min_profitable_estimate, min_profitable_iters;
2073   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2074                                       &min_profitable_estimate);
2075
2076   if (min_profitable_iters < 0)
2077     {
2078       if (dump_enabled_p ())
2079         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2080                          "not vectorized: vectorization not profitable.\n");
2081       if (dump_enabled_p ())
2082         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2083                          "not vectorized: vector version will never be "
2084                          "profitable.\n");
2085       goto again;
2086     }
2087
2088   min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2089                            * vectorization_factor);
2090
2091   /* Use the cost model only if it is more conservative than user specified
2092      threshold.  */
2093   th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2094
2095   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2096
2097   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2098       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2099     {
2100       if (dump_enabled_p ())
2101         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2102                          "not vectorized: vectorization not profitable.\n");
2103       if (dump_enabled_p ())
2104         dump_printf_loc (MSG_NOTE, vect_location,
2105                          "not vectorized: iteration count smaller than user "
2106                          "specified loop bound parameter or minimum profitable "
2107                          "iterations (whichever is more conservative).\n");
2108       goto again;
2109     }
2110
       /* Use the estimated iteration count, falling back to MAX_NITER when
          no estimate is available (-1).  */
2111   estimated_niter
2112     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2113   if (estimated_niter == -1)
2114     estimated_niter = max_niter;
2115   if (estimated_niter != -1
2116       && ((unsigned HOST_WIDE_INT) estimated_niter
2117           < MAX (th, (unsigned) min_profitable_estimate)))
2118     {
2119       if (dump_enabled_p ())
2120         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2121                          "not vectorized: estimated iteration count too "
2122                          "small.\n");
2123       if (dump_enabled_p ())
2124         dump_printf_loc (MSG_NOTE, vect_location,
2125                          "not vectorized: estimated iteration count smaller "
2126                          "than specified loop bound parameter or minimum "
2127                          "profitable iterations (whichever is more "
2128                          "conservative).\n");
2129       goto again;
2130     }
2131
2132   /* Decide whether we need to create an epilogue loop to handle
2133      remaining scalar iterations.  TH becomes the cost-model threshold
          rounded down to a multiple of the vectorization factor.  */
2134   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo)
2135          / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2136         * LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2137
2138   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2139       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2140     {
2141       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2142                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2143           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2144         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2145     }
2146   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2147            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2148                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2149                /* In case of versioning, check if the maximum number of
2150                   iterations is greater than th.  If they are identical,
2151                   the epilogue is unnecessary.  */
2152                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2153                    || (unsigned HOST_WIDE_INT) max_niter > th)))
2154     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2155
2156   /* If an epilogue loop is required make sure we can create one.  */
2157   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2158       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2159     {
2160       if (dump_enabled_p ())
2161         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2162       if (!vect_can_advance_ivs_p (loop_vinfo)
2163           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2164                                            single_exit (LOOP_VINFO_LOOP
2165                                                          (loop_vinfo))))
2166         {
2167           if (dump_enabled_p ())
2168             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2169                              "not vectorized: can't create required "
2170                              "epilog loop\n");
2171           goto again;
2172         }
2173     }
2174
2175   /* During peeling, we need to check if number of loop iterations is
2176      enough for both peeled prolog loop and vector loop.  This check
2177      can be merged along with threshold check of loop versioning, so
2178      increase threshold for this case if necessary.  */
2179   if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
2180       && (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2181           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2182     {
2183       unsigned niters_th;
2184
2185       /* Niters for peeled prolog loop.  */
2186       if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2187         {
               /* A negative peeling amount: use the number of elements of
                  the unaligned data-ref's vector type minus one.  */
2188           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2189           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2190
2191           niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2192         }
2193       else
2194         niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2195
2196       /* Niters for at least one iteration of vectorized loop.  */
2197       niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2198       /* One additional iteration because of peeling for gap.  */
2199       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2200         niters_th++;
2201       if (LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) < niters_th)
2202         LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = niters_th;
2203     }
2204
2205   gcc_assert (vectorization_factor
2206               == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2207
2208   /* Ok to vectorize!  */
2209   return true;
2210
2211 again:
2212   /* Try again with SLP forced off but if we didn't do any SLP there is
2213      no point in re-trying.  */
2214   if (!slp)
2215     return false;
2216
2217   /* If there are reduction chains re-trying will fail anyway.  */
2218   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2219     return false;
2220
2221   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2222      via interleaving or lane instructions.  */
2223   slp_instance instance;
2224   slp_tree node;
2225   unsigned i, j;
2226   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2227     {
2228       stmt_vec_info vinfo;
2229       vinfo = vinfo_for_stmt
2230           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
           /* Only instances whose first scalar stmt is a grouped access
              are checked.  */
2231       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2232         continue;
2233       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2234       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2235       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2236       if (! vect_store_lanes_supported (vectype, size)
2237           && ! vect_grouped_store_supported (vectype, size))
2238         return false;
2239       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2240         {
2241           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2242           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2243           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2244           size = STMT_VINFO_GROUP_SIZE (vinfo);
2245           vectype = STMT_VINFO_VECTYPE (vinfo);
2246           if (! vect_load_lanes_supported (vectype, size)
2247               && ! vect_grouped_load_supported (vectype, single_element_p,
2248                                                 size))
2249             return false;
2250         }
2251     }
2252
2253   if (dump_enabled_p ())
2254     dump_printf_loc (MSG_NOTE, vect_location,
2255                      "re-trying with SLP disabled\n");
2256
2257   /* Roll back state appropriately.  No SLP this time.  */
2258   slp = false;
2259   /* Restore vectorization factor as it were without SLP.  */
2260   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2261   /* Free the SLP instances.  */
2262   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2263     vect_free_slp_instance (instance);
2264   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2265   /* Reset SLP type to loop_vect on all stmts.  */
2266   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2267     {
2268       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2269       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2270            !gsi_end_p (si); gsi_next (&si))
2271         {
2272           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2273           STMT_SLP_TYPE (stmt_info) = loop_vect;
2274         }
2275       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2276            !gsi_end_p (si); gsi_next (&si))
2277         {
2278           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2279           STMT_SLP_TYPE (stmt_info) = loop_vect;
               /* For a stmt in a pattern also reset the related stmt and
                  every stmt of its pattern def sequence.  */
2280           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2281             {
2282               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2283               STMT_SLP_TYPE (stmt_info) = loop_vect;
2284               for (gimple_stmt_iterator pi
2285                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2286                    !gsi_end_p (pi); gsi_next (&pi))
2287                 {
2288                   gimple *pstmt = gsi_stmt (pi);
2289                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2290                 }
2291             }
2292         }
2293     }
2294   /* Free optimized alias test DDRS.  */
2295   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2296   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2297   /* Reset target cost data.  */
2298   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2299   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2300     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2301   /* Reset assorted flags.  */
2302   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2303   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2304   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2305
2306   goto start_over;
2307 }
2308
2309 /* Function vect_analyze_loop.
2310
2311    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2312    for it.  The different analyses will record information in the
2313    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2314    be vectorized.  */
2315 loop_vec_info
2316 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2317 {
2318   loop_vec_info loop_vinfo;
2319   unsigned int vector_sizes;
2320
2321   /* Autodetect first vector size we try.  */
2322   current_vector_size = 0;
2323   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2324
2325   if (dump_enabled_p ())
2326     dump_printf_loc (MSG_NOTE, vect_location,
2327                      "===== analyze_loop_nest =====\n");
2328
2329   if (loop_outer (loop)
2330       && loop_vec_info_for_loop (loop_outer (loop))
2331       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2332     {
2333       if (dump_enabled_p ())
2334         dump_printf_loc (MSG_NOTE, vect_location,
2335                          "outer-loop already vectorized.\n");
2336       return NULL;
2337     }
2338
2339   while (1)
2340     {
2341       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2342       loop_vinfo = vect_analyze_loop_form (loop);
2343       if (!loop_vinfo)
2344         {
2345           if (dump_enabled_p ())
2346             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2347                              "bad loop form.\n");
2348           return NULL;
2349         }
2350
2351       bool fatal = false;
2352
2353       if (orig_loop_vinfo)
2354         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2355
2356       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2357         {
2358           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2359
2360           return loop_vinfo;
2361         }
2362
2363       delete loop_vinfo;
2364
2365       vector_sizes &= ~current_vector_size;
2366       if (fatal
2367           || vector_sizes == 0
2368           || current_vector_size == 0)
2369         return NULL;
2370
2371       /* Try the next biggest vector size.  */
2372       current_vector_size = 1 << floor_log2 (vector_sizes);
2373       if (dump_enabled_p ())
2374         dump_printf_loc (MSG_NOTE, vect_location,
2375                          "***** Re-trying analysis with "
2376                          "vector size %d\n", current_vector_size);
2377     }
2378 }
2379
2380
2381 /* Function reduction_fn_for_scalar_code
2382
2383    Input:
2384    CODE - tree_code of a reduction operations.
2385
2386    Output:
2387    REDUC_FN - the corresponding internal function to be used to reduce the
2388       vector of partial results into a single scalar result, or IFN_LAST
2389       if the operation is a supported reduction operation, but does not have
2390       such an internal function.
2391
2392    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2393
2394 static bool
2395 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2396 {
2397   switch (code)
2398     {
2399       case MAX_EXPR:
2400         *reduc_fn = IFN_REDUC_MAX;
2401         return true;
2402
2403       case MIN_EXPR:
2404         *reduc_fn = IFN_REDUC_MIN;
2405         return true;
2406
2407       case PLUS_EXPR:
2408         *reduc_fn = IFN_REDUC_PLUS;
2409         return true;
2410
2411       case MULT_EXPR:
2412       case MINUS_EXPR:
2413       case BIT_IOR_EXPR:
2414       case BIT_XOR_EXPR:
2415       case BIT_AND_EXPR:
2416         *reduc_fn = IFN_LAST;
2417         return true;
2418
2419       default:
2420        return false;
2421     }
2422 }
2423
2424
2425 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2426    STMT is printed with a message MSG. */
2427
2428 static void
2429 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2430 {
2431   dump_printf_loc (msg_type, vect_location, "%s", msg);
2432   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2433 }
2434
2435
2436 /* Detect SLP reduction of the form:
2437
2438    #a1 = phi <a5, a0>
2439    a2 = operation (a1)
2440    a3 = operation (a2)
2441    a4 = operation (a3)
2442    a5 = operation (a4)
2443
2444    #a = phi <a5>
2445
2446    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2447    FIRST_STMT is the first reduction stmt in the chain
2448    (a2 = operation (a1)).
2449
2450    Return TRUE if a reduction chain was detected.  */
2451
2452 static bool
2453 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2454                        gimple *first_stmt)
2455 {
2456   struct loop *loop = (gimple_bb (phi))->loop_father;
2457   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2458   enum tree_code code;
2459   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2460   stmt_vec_info use_stmt_info, current_stmt_info;
2461   tree lhs;
2462   imm_use_iterator imm_iter;
2463   use_operand_p use_p;
2464   int nloop_uses, size = 0, n_out_of_loop_uses;
2465   bool found = false;
2466
2467   if (loop != vect_loop)
2468     return false;
2469
2470   lhs = PHI_RESULT (phi);
2471   code = gimple_assign_rhs_code (first_stmt);
2472   while (1)
2473     {
2474       nloop_uses = 0;
2475       n_out_of_loop_uses = 0;
2476       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2477         {
2478           gimple *use_stmt = USE_STMT (use_p);
2479           if (is_gimple_debug (use_stmt))
2480             continue;
2481
2482           /* Check if we got back to the reduction phi.  */
2483           if (use_stmt == phi)
2484             {
2485               loop_use_stmt = use_stmt;
2486               found = true;
2487               break;
2488             }
2489
2490           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2491             {
2492               loop_use_stmt = use_stmt;
2493               nloop_uses++;
2494             }
2495            else
2496              n_out_of_loop_uses++;
2497
2498            /* There are can be either a single use in the loop or two uses in
2499               phi nodes.  */
2500            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2501              return false;
2502         }
2503
2504       if (found)
2505         break;
2506
2507       /* We reached a statement with no loop uses.  */
2508       if (nloop_uses == 0)
2509         return false;
2510
2511       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2512       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2513         return false;
2514
2515       if (!is_gimple_assign (loop_use_stmt)
2516           || code != gimple_assign_rhs_code (loop_use_stmt)
2517           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2518         return false;
2519
2520       /* Insert USE_STMT into reduction chain.  */
2521       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2522       if (current_stmt)
2523         {
2524           current_stmt_info = vinfo_for_stmt (current_stmt);
2525           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2526           GROUP_FIRST_ELEMENT (use_stmt_info)
2527             = GROUP_FIRST_ELEMENT (current_stmt_info);
2528         }
2529       else
2530         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2531
2532       lhs = gimple_assign_lhs (loop_use_stmt);
2533       current_stmt = loop_use_stmt;
2534       size++;
2535    }
2536
2537   if (!found || loop_use_stmt != phi || size < 2)
2538     return false;
2539
2540   /* Swap the operands, if needed, to make the reduction operand be the second
2541      operand.  */
2542   lhs = PHI_RESULT (phi);
2543   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2544   while (next_stmt)
2545     {
2546       if (gimple_assign_rhs2 (next_stmt) == lhs)
2547         {
2548           tree op = gimple_assign_rhs1 (next_stmt);
2549           gimple *def_stmt = NULL;
2550
2551           if (TREE_CODE (op) == SSA_NAME)
2552             def_stmt = SSA_NAME_DEF_STMT (op);
2553
2554           /* Check that the other def is either defined in the loop
2555              ("vect_internal_def"), or it's an induction (defined by a
2556              loop-header phi-node).  */
2557           if (def_stmt
2558               && gimple_bb (def_stmt)
2559               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2560               && (is_gimple_assign (def_stmt)
2561                   || is_gimple_call (def_stmt)
2562                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2563                            == vect_induction_def
2564                   || (gimple_code (def_stmt) == GIMPLE_PHI
2565                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2566                                   == vect_internal_def
2567                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2568             {
2569               lhs = gimple_assign_lhs (next_stmt);
2570               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2571               continue;
2572             }
2573
2574           return false;
2575         }
2576       else
2577         {
2578           tree op = gimple_assign_rhs2 (next_stmt);
2579           gimple *def_stmt = NULL;
2580
2581           if (TREE_CODE (op) == SSA_NAME)
2582             def_stmt = SSA_NAME_DEF_STMT (op);
2583
2584           /* Check that the other def is either defined in the loop
2585             ("vect_internal_def"), or it's an induction (defined by a
2586             loop-header phi-node).  */
2587           if (def_stmt
2588               && gimple_bb (def_stmt)
2589               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2590               && (is_gimple_assign (def_stmt)
2591                   || is_gimple_call (def_stmt)
2592                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2593                               == vect_induction_def
2594                   || (gimple_code (def_stmt) == GIMPLE_PHI
2595                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2596                                   == vect_internal_def
2597                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2598             {
2599               if (dump_enabled_p ())
2600                 {
2601                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2602                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2603                 }
2604
2605               swap_ssa_operands (next_stmt,
2606                                  gimple_assign_rhs1_ptr (next_stmt),
2607                                  gimple_assign_rhs2_ptr (next_stmt));
2608               update_stmt (next_stmt);
2609
2610               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2611                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2612             }
2613           else
2614             return false;
2615         }
2616
2617       lhs = gimple_assign_lhs (next_stmt);
2618       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2619     }
2620
2621   /* Save the chain for further analysis in SLP detection.  */
2622   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2623   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2624   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2625
2626   return true;
2627 }
2628
2629
2630 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2631    reduction operation CODE has a handled computation expression.  */
2632
2633 bool
2634 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2635                       enum tree_code code)
2636 {
2637   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2638   auto_bitmap visited;
2639   tree lookfor = PHI_RESULT (phi);
2640   ssa_op_iter curri;
2641   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2642   while (USE_FROM_PTR (curr) != loop_arg)
2643     curr = op_iter_next_use (&curri);
2644   curri.i = curri.numops;
2645   do
2646     {
2647       path.safe_push (std::make_pair (curri, curr));
2648       tree use = USE_FROM_PTR (curr);
2649       if (use == lookfor)
2650         break;
2651       gimple *def = SSA_NAME_DEF_STMT (use);
2652       if (gimple_nop_p (def)
2653           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
2654         {
2655 pop:
2656           do
2657             {
2658               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
2659               curri = x.first;
2660               curr = x.second;
2661               do
2662                 curr = op_iter_next_use (&curri);
2663               /* Skip already visited or non-SSA operands (from iterating
2664                  over PHI args).  */
2665               while (curr != NULL_USE_OPERAND_P
2666                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2667                          || ! bitmap_set_bit (visited,
2668                                               SSA_NAME_VERSION
2669                                                 (USE_FROM_PTR (curr)))));
2670             }
2671           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
2672           if (curr == NULL_USE_OPERAND_P)
2673             break;
2674         }
2675       else
2676         {
2677           if (gimple_code (def) == GIMPLE_PHI)
2678             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
2679           else
2680             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
2681           while (curr != NULL_USE_OPERAND_P
2682                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
2683                      || ! bitmap_set_bit (visited,
2684                                           SSA_NAME_VERSION
2685                                             (USE_FROM_PTR (curr)))))
2686             curr = op_iter_next_use (&curri);
2687           if (curr == NULL_USE_OPERAND_P)
2688             goto pop;
2689         }
2690     }
2691   while (1);
2692   if (dump_file && (dump_flags & TDF_DETAILS))
2693     {
2694       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
2695       unsigned i;
2696       std::pair<ssa_op_iter, use_operand_p> *x;
2697       FOR_EACH_VEC_ELT (path, i, x)
2698         {
2699           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
2700           dump_printf (MSG_NOTE, " ");
2701         }
2702       dump_printf (MSG_NOTE, "\n");
2703     }
2704
2705   /* Check whether the reduction path detected is valid.  */
2706   bool fail = path.length () == 0;
2707   bool neg = false;
2708   for (unsigned i = 1; i < path.length (); ++i)
2709     {
2710       gimple *use_stmt = USE_STMT (path[i].second);
2711       tree op = USE_FROM_PTR (path[i].second);
2712       if (! has_single_use (op)
2713           || ! is_gimple_assign (use_stmt))
2714         {
2715           fail = true;
2716           break;
2717         }
2718       if (gimple_assign_rhs_code (use_stmt) != code)
2719         {
2720           if (code == PLUS_EXPR
2721               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
2722             {
2723               /* Track whether we negate the reduction value each iteration.  */
2724               if (gimple_assign_rhs2 (use_stmt) == op)
2725                 neg = ! neg;
2726             }
2727           else
2728             {
2729               fail = true;
2730               break;
2731             }
2732         }
2733     }
2734   return ! fail && ! neg;
2735 }
2736
2737
2738 /* Function vect_is_simple_reduction
2739
2740    (1) Detect a cross-iteration def-use cycle that represents a simple
2741    reduction computation.  We look for the following pattern:
2742
2743    loop_header:
2744      a1 = phi < a0, a2 >
2745      a3 = ...
2746      a2 = operation (a3, a1)
2747
2748    or
2749
2750    a3 = ...
2751    loop_header:
2752      a1 = phi < a0, a2 >
2753      a2 = operation (a3, a1)
2754
2755    such that:
2756    1. operation is commutative and associative and it is safe to
2757       change the order of the computation
2758    2. no uses for a2 in the loop (a2 is used out of the loop)
2759    3. no uses of a1 in the loop besides the reduction operation
2760    4. no uses of a1 outside the loop.
2761
2762    Conditions 1,4 are tested here.
2763    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2764
2765    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2766    nested cycles.
2767
2768    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2769    reductions:
2770
2771      a1 = phi < a0, a2 >
2772      inner loop (def of a3)
2773      a2 = phi < a3 >
2774
2775    (4) Detect condition expressions, ie:
2776      for (int i = 0; i < N; i++)
2777        if (a[i] < val)
2778         ret_val = a[i];
2779
2780 */
2781
2782 static gimple *
2783 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2784                           bool *double_reduc,
2785                           bool need_wrapping_integral_overflow,
2786                           enum vect_reduction_type *v_reduc_type)
2787 {
2788   struct loop *loop = (gimple_bb (phi))->loop_father;
2789   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2790   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2791   enum tree_code orig_code, code;
2792   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2793   tree type;
2794   int nloop_uses;
2795   tree name;
2796   imm_use_iterator imm_iter;
2797   use_operand_p use_p;
2798   bool phi_def;
2799
2800   *double_reduc = false;
2801   *v_reduc_type = TREE_CODE_REDUCTION;
2802
2803   tree phi_name = PHI_RESULT (phi);
2804   /* ???  If there are no uses of the PHI result the inner loop reduction
2805      won't be detected as possibly double-reduction by vectorizable_reduction
2806      because that tries to walk the PHI arg from the preheader edge which
2807      can be constant.  See PR60382.  */
2808   if (has_zero_uses (phi_name))
2809     return NULL;
2810   nloop_uses = 0;
2811   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2812     {
2813       gimple *use_stmt = USE_STMT (use_p);
2814       if (is_gimple_debug (use_stmt))
2815         continue;
2816
2817       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2818         {
2819           if (dump_enabled_p ())
2820             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2821                              "intermediate value used outside loop.\n");
2822
2823           return NULL;
2824         }
2825
2826       nloop_uses++;
2827       if (nloop_uses > 1)
2828         {
2829           if (dump_enabled_p ())
2830             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2831                              "reduction value used in loop.\n");
2832           return NULL;
2833         }
2834
2835       phi_use_stmt = use_stmt;
2836     }
2837
2838   edge latch_e = loop_latch_edge (loop);
2839   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2840   if (TREE_CODE (loop_arg) != SSA_NAME)
2841     {
2842       if (dump_enabled_p ())
2843         {
2844           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2845                            "reduction: not ssa_name: ");
2846           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2847           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2848         }
2849       return NULL;
2850     }
2851
2852   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2853   if (is_gimple_assign (def_stmt))
2854     {
2855       name = gimple_assign_lhs (def_stmt);
2856       phi_def = false;
2857     }
2858   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2859     {
2860       name = PHI_RESULT (def_stmt);
2861       phi_def = true;
2862     }
2863   else
2864     {
2865       if (dump_enabled_p ())
2866         {
2867           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2868                            "reduction: unhandled reduction operation: ");
2869           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2870         }
2871       return NULL;
2872     }
2873
2874   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2875     return NULL;
2876
2877   nloop_uses = 0;
2878   auto_vec<gphi *, 3> lcphis;
2879   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2880     {
2881       gimple *use_stmt = USE_STMT (use_p);
2882       if (is_gimple_debug (use_stmt))
2883         continue;
2884       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2885         nloop_uses++;
2886       else
2887         /* We can have more than one loop-closed PHI.  */
2888         lcphis.safe_push (as_a <gphi *> (use_stmt));
2889       if (nloop_uses > 1)
2890         {
2891           if (dump_enabled_p ())
2892             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2893                              "reduction used in loop.\n");
2894           return NULL;
2895         }
2896     }
2897
2898   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2899      defined in the inner loop.  */
2900   if (phi_def)
2901     {
2902       op1 = PHI_ARG_DEF (def_stmt, 0);
2903
2904       if (gimple_phi_num_args (def_stmt) != 1
2905           || TREE_CODE (op1) != SSA_NAME)
2906         {
2907           if (dump_enabled_p ())
2908             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2909                              "unsupported phi node definition.\n");
2910
2911           return NULL;
2912         }
2913
2914       def1 = SSA_NAME_DEF_STMT (op1);
2915       if (gimple_bb (def1)
2916           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2917           && loop->inner
2918           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2919           && is_gimple_assign (def1)
2920           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2921         {
2922           if (dump_enabled_p ())
2923             report_vect_op (MSG_NOTE, def_stmt,
2924                             "detected double reduction: ");
2925
2926           *double_reduc = true;
2927           return def_stmt;
2928         }
2929
2930       return NULL;
2931     }
2932
2933   /* If we are vectorizing an inner reduction we are executing that
2934      in the original order only in case we are not dealing with a
2935      double reduction.  */
2936   bool check_reduction = true;
2937   if (flow_loop_nested_p (vect_loop, loop))
2938     {
2939       gphi *lcphi;
2940       unsigned i;
2941       check_reduction = false;
2942       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2943         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2944           {
2945             gimple *use_stmt = USE_STMT (use_p);
2946             if (is_gimple_debug (use_stmt))
2947               continue;
2948             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2949               check_reduction = true;
2950           }
2951     }
2952
2953   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2954   code = orig_code = gimple_assign_rhs_code (def_stmt);
2955
2956   /* We can handle "res -= x[i]", which is non-associative by
2957      simply rewriting this into "res += -x[i]".  Avoid changing
2958      gimple instruction for the first simple tests and only do this
2959      if we're allowed to change code at all.  */
2960   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2961     code = PLUS_EXPR;
2962
2963   if (code == COND_EXPR)
2964     {
2965       if (! nested_in_vect_loop)
2966         *v_reduc_type = COND_REDUCTION;
2967
2968       op3 = gimple_assign_rhs1 (def_stmt);
2969       if (COMPARISON_CLASS_P (op3))
2970         {
2971           op4 = TREE_OPERAND (op3, 1);
2972           op3 = TREE_OPERAND (op3, 0);
2973         }
2974       if (op3 == phi_name || op4 == phi_name)
2975         {
2976           if (dump_enabled_p ())
2977             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2978                             "reduction: condition depends on previous"
2979                             " iteration: ");
2980           return NULL;
2981         }
2982
2983       op1 = gimple_assign_rhs2 (def_stmt);
2984       op2 = gimple_assign_rhs3 (def_stmt);
2985     }
2986   else if (!commutative_tree_code (code) || !associative_tree_code (code))
2987     {
2988       if (dump_enabled_p ())
2989         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2990                         "reduction: not commutative/associative: ");
2991       return NULL;
2992     }
2993   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2994     {
2995       op1 = gimple_assign_rhs1 (def_stmt);
2996       op2 = gimple_assign_rhs2 (def_stmt);
2997     }
2998   else
2999     {
3000       if (dump_enabled_p ())
3001         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3002                         "reduction: not handled operation: ");
3003       return NULL;
3004     }
3005
3006   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3007     {
3008       if (dump_enabled_p ())
3009         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3010                         "reduction: both uses not ssa_names: ");
3011
3012       return NULL;
3013     }
3014
3015   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3016   if ((TREE_CODE (op1) == SSA_NAME
3017        && !types_compatible_p (type,TREE_TYPE (op1)))
3018       || (TREE_CODE (op2) == SSA_NAME
3019           && !types_compatible_p (type, TREE_TYPE (op2)))
3020       || (op3 && TREE_CODE (op3) == SSA_NAME
3021           && !types_compatible_p (type, TREE_TYPE (op3)))
3022       || (op4 && TREE_CODE (op4) == SSA_NAME
3023           && !types_compatible_p (type, TREE_TYPE (op4))))
3024     {
3025       if (dump_enabled_p ())
3026         {
3027           dump_printf_loc (MSG_NOTE, vect_location,
3028                            "reduction: multiple types: operation type: ");
3029           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3030           dump_printf (MSG_NOTE, ", operands types: ");
3031           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3032                              TREE_TYPE (op1));
3033           dump_printf (MSG_NOTE, ",");
3034           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3035                              TREE_TYPE (op2));
3036           if (op3)
3037             {
3038               dump_printf (MSG_NOTE, ",");
3039               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3040                                  TREE_TYPE (op3));
3041             }
3042
3043           if (op4)
3044             {
3045               dump_printf (MSG_NOTE, ",");
3046               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3047                                  TREE_TYPE (op4));
3048             }
3049           dump_printf (MSG_NOTE, "\n");
3050         }
3051
3052       return NULL;
3053     }
3054
3055   /* Check that it's ok to change the order of the computation.
3056      Generally, when vectorizing a reduction we change the order of the
3057      computation.  This may change the behavior of the program in some
3058      cases, so we need to check that this is ok.  One exception is when
3059      vectorizing an outer-loop: the inner-loop is executed sequentially,
3060      and therefore vectorizing reductions in the inner-loop during
3061      outer-loop vectorization is safe.  */
3062
3063   if (*v_reduc_type != COND_REDUCTION
3064       && check_reduction)
3065     {
3066       /* CHECKME: check for !flag_finite_math_only too?  */
3067       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
3068         {
3069           /* Changing the order of operations changes the semantics.  */
3070           if (dump_enabled_p ())
3071             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3072                         "reduction: unsafe fp math optimization: ");
3073           return NULL;
3074         }
3075       else if (INTEGRAL_TYPE_P (type))
3076         {
3077           if (!operation_no_trapping_overflow (type, code))
3078             {
3079               /* Changing the order of operations changes the semantics.  */
3080               if (dump_enabled_p ())
3081                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3082                                 "reduction: unsafe int math optimization"
3083                                 " (overflow traps): ");
3084               return NULL;
3085             }
3086           if (need_wrapping_integral_overflow
3087               && !TYPE_OVERFLOW_WRAPS (type)
3088               && operation_can_overflow (code))
3089             {
3090               /* Changing the order of operations changes the semantics.  */
3091               if (dump_enabled_p ())
3092                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3093                                 "reduction: unsafe int math optimization"
3094                                 " (overflow doesn't wrap): ");
3095               return NULL;
3096             }
3097         }
3098       else if (SAT_FIXED_POINT_TYPE_P (type))
3099         {
3100           /* Changing the order of operations changes the semantics.  */
3101           if (dump_enabled_p ())
3102           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3103                           "reduction: unsafe fixed-point math optimization: ");
3104           return NULL;
3105         }
3106     }
3107
3108   /* Reduction is safe. We're dealing with one of the following:
3109      1) integer arithmetic and no trapv
3110      2) floating point arithmetic, and special flags permit this optimization
3111      3) nested cycle (i.e., outer loop vectorization).  */
3112   if (TREE_CODE (op1) == SSA_NAME)
3113     def1 = SSA_NAME_DEF_STMT (op1);
3114
3115   if (TREE_CODE (op2) == SSA_NAME)
3116     def2 = SSA_NAME_DEF_STMT (op2);
3117
3118   if (code != COND_EXPR
3119       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3120     {
3121       if (dump_enabled_p ())
3122         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3123       return NULL;
3124     }
3125
3126   /* Check that one def is the reduction def, defined by PHI,
3127      the other def is either defined in the loop ("vect_internal_def"),
3128      or it's an induction (defined by a loop-header phi-node).  */
3129
3130   if (def2 && def2 == phi
3131       && (code == COND_EXPR
3132           || !def1 || gimple_nop_p (def1)
3133           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3134           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3135               && (is_gimple_assign (def1)
3136                   || is_gimple_call (def1)
3137                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3138                       == vect_induction_def
3139                   || (gimple_code (def1) == GIMPLE_PHI
3140                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3141                           == vect_internal_def
3142                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3143     {
3144       if (dump_enabled_p ())
3145         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3146       return def_stmt;
3147     }
3148
3149   if (def1 && def1 == phi
3150       && (code == COND_EXPR
3151           || !def2 || gimple_nop_p (def2)
3152           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3153           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3154               && (is_gimple_assign (def2)
3155                   || is_gimple_call (def2)
3156                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3157                        == vect_induction_def
3158                   || (gimple_code (def2) == GIMPLE_PHI
3159                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3160                            == vect_internal_def
3161                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3162     {
3163       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3164         {
3165           /* Check if we can swap operands (just for simplicity - so that
3166              the rest of the code can assume that the reduction variable
3167              is always the last (second) argument).  */
3168           if (code == COND_EXPR)
3169             {
3170               /* Swap cond_expr by inverting the condition.  */
3171               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3172               enum tree_code invert_code = ERROR_MARK;
3173               enum tree_code cond_code = TREE_CODE (cond_expr);
3174
3175               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3176                 {
3177                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3178                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3179                 }
3180               if (invert_code != ERROR_MARK)
3181                 {
3182                   TREE_SET_CODE (cond_expr, invert_code);
3183                   swap_ssa_operands (def_stmt,
3184                                      gimple_assign_rhs2_ptr (def_stmt),
3185                                      gimple_assign_rhs3_ptr (def_stmt));
3186                 }
3187               else
3188                 {
3189                   if (dump_enabled_p ())
3190                     report_vect_op (MSG_NOTE, def_stmt,
3191                                     "detected reduction: cannot swap operands "
3192                                     "for cond_expr");
3193                   return NULL;
3194                 }
3195             }
3196           else
3197             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3198                                gimple_assign_rhs2_ptr (def_stmt));
3199
3200           if (dump_enabled_p ())
3201             report_vect_op (MSG_NOTE, def_stmt,
3202                             "detected reduction: need to swap operands: ");
3203
3204           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3205             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3206         }
3207       else
3208         {
3209           if (dump_enabled_p ())
3210             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3211         }
3212
3213       return def_stmt;
3214     }
3215
3216   /* Try to find SLP reduction chain.  */
3217   if (! nested_in_vect_loop
3218       && code != COND_EXPR
3219       && orig_code != MINUS_EXPR
3220       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3221     {
3222       if (dump_enabled_p ())
3223         report_vect_op (MSG_NOTE, def_stmt,
3224                         "reduction: detected reduction chain: ");
3225
3226       return def_stmt;
3227     }
3228
3229   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3230   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3231   while (first)
3232     {
3233       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3234       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3235       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3236       first = next;
3237     }
3238
3239   /* Look for the expression computing loop_arg from loop PHI result.  */
3240   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3241                             code))
3242     return def_stmt;
3243
3244   if (dump_enabled_p ())
3245     {
3246       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3247                       "reduction: unknown pattern: ");
3248     }
3249
3250   return NULL;
3251 }
3252
3253 /* Wrapper around vect_is_simple_reduction, which will modify code
3254    in-place if it enables detection of more reductions.  Arguments
3255    as there.  A detected reduction is recorded on PHI and on the result.  */
3256
3257 gimple *
3258 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3259                              bool *double_reduc,
3260                              bool need_wrapping_integral_overflow)
3261 {
3262   enum vect_reduction_type reduc_kind;
3263   gimple *reduc_stmt
3264     = vect_is_simple_reduction (loop_info, phi, double_reduc,
3265                                 need_wrapping_integral_overflow, &reduc_kind);
3266   if (!reduc_stmt)
3267     return NULL;
3268
3269   /* Make PHI and the reduction statement point at each other.  */
3270   stmt_vec_info phi_info = vinfo_for_stmt (phi);
3271   STMT_VINFO_REDUC_TYPE (phi_info) = reduc_kind;
3272   STMT_VINFO_REDUC_DEF (phi_info) = reduc_stmt;
3273   STMT_VINFO_REDUC_DEF (vinfo_for_stmt (reduc_stmt)) = phi;
3274   return reduc_stmt;
3275 }
3276
3277 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.

        LOOP_VINFO describes the loop.  *PEEL_ITERS_EPILOGUE is set to the
        number of epilogue iterations assumed: VF/2 when the number of
        iterations of the loop is not known, otherwise the iterations left
        after the prologue modulo VF (or VF if peeling for gaps is required
        and nothing would be left over).

        For each entry of SCALAR_COST_VEC a cost scaled by the number of peeled
        iterations is recorded in PROLOGUE_COST_VEC (if the prologue peels any
        iteration) and in EPILOGUE_COST_VEC (if the epilogue does).  Returns
        the sum of the values returned by record_stmt_cost.  */
3278 int
3279 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3280                              int *peel_iters_epilogue,
3281                              stmt_vector_for_cost *scalar_cost_vec,
3282                              stmt_vector_for_cost *prologue_cost_vec,
3283                              stmt_vector_for_cost *epilogue_cost_vec)
3284 {
3285   int retval = 0;
3286   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3287
3288   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3289     {
3290       *peel_iters_epilogue = vf/2;
3291       if (dump_enabled_p ())
3292         dump_printf_loc (MSG_NOTE, vect_location,
3293                          "cost model: epilogue peel iters set to vf/2 "
3294                          "because loop iterations are unknown .\n");
3295
3296       /* If peeled iterations are known but number of scalar loop
3297          iterations are unknown, count a taken branch per peeled loop.
              The epilogue branch belongs in EPILOGUE_COST_VEC, and both
              branches contribute to the returned cost.  */
3298       retval += record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3299                                   NULL, 0, vect_prologue);
3300       retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3301                                   NULL, 0, vect_epilogue);
3302     }
3303   else
3304     {
3305       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
          /* The prologue cannot peel more iterations than the loop has.  */
3306       peel_iters_prologue = niters < peel_iters_prologue ?
3307                             niters : peel_iters_prologue;
3308       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3309       /* If we need to peel for gaps, but no peeling is required, we have to
3310          peel VF iterations.  */
3311       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3312         *peel_iters_epilogue = vf;
3313     }
3314
      /* Each peeled iteration costs one scalar iteration.  */
3315   stmt_info_for_cost *si;
3316   int j;
3317   if (peel_iters_prologue)
3318     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3319         {
3320           stmt_vec_info stmt_info
3321             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3322           retval += record_stmt_cost (prologue_cost_vec,
3323                                       si->count * peel_iters_prologue,
3324                                       si->kind, stmt_info, si->misalign,
3325                                       vect_prologue);
3326         }
3327   if (*peel_iters_epilogue)
3328     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3329         {
3330           stmt_vec_info stmt_info
3331             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3332           retval += record_stmt_cost (epilogue_cost_vec,
3333                                       si->count * *peel_iters_epilogue,
3334                                       si->kind, stmt_info, si->misalign,
3335                                       vect_epilogue);
3336         }
3337
3338   return retval;
3339 }
3340
3341 /* Function vect_estimate_min_profitable_iters
3342
3343    Return the number of iterations required for the vector version of the
3344    loop to be profitable relative to the cost of the scalar version of the
3345    loop.
3346
3347    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3348    of iterations for vectorization.  -1 value means loop vectorization
3349    is not profitable.  This returned value may be used for dynamic
3350    profitability check.
3351
3352    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3353    for static check against estimated number of iterations.

        Both outputs are set to 0 when the cost model is unlimited for the loop
        and to -1 when the vector version can never be profitable.  The costs
        of the versioning checks, of the peeled iterations and of the guard
        branches are added to the target cost data of LOOP_VINFO on the way.  */
3354
3355 static void
3356 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3357                                     int *ret_min_profitable_niters,
3358                                     int *ret_min_profitable_estimate)
3359 {
3360   int min_profitable_iters;
3361   int min_profitable_estimate;
3362   int peel_iters_prologue;
3363   int peel_iters_epilogue;
3364   unsigned vec_inside_cost = 0;
3365   int vec_outside_cost = 0;
3366   unsigned vec_prologue_cost = 0;
3367   unsigned vec_epilogue_cost = 0;
3368   int scalar_single_iter_cost = 0;
3369   int scalar_outside_cost = 0;
3370   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3371   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3372   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3373
3374   /* Cost model disabled.  */
3375   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3376     {
3377       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3378       *ret_min_profitable_niters = 0;
3379       *ret_min_profitable_estimate = 0;
3380       return;
3381     }
3382
3383   /* Requires loop versioning tests to handle misalignment.  */
3384   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3385     {
3386       /*  FIXME: Make cost depend on complexity of individual check.  */
3387       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3388       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3389                             vect_prologue);
3390       dump_printf (MSG_NOTE,
3391                    "cost model: Adding cost of checks for loop "
3392                    "versioning to treat misalignment.\n");
3393     }
3394
3395   /* Requires loop versioning with alias checks.  */
3396   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3397     {
3398       /*  FIXME: Make cost depend on complexity of individual check.  */
3399       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3400       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3401                             vect_prologue);
3402       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3403       if (len)
3404         /* Count LEN - 1 ANDs and LEN comparisons.  */
3405         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3406                               NULL, 0, vect_prologue);
3407       dump_printf (MSG_NOTE,
3408                    "cost model: Adding cost of checks for loop "
3409                    "versioning aliasing.\n");
3410     }
3411
3412   /* Requires loop versioning with niter checks.  */
3413   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3414     {
3415       /*  FIXME: Make cost depend on complexity of individual check.  */
3416       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3417                             vect_prologue);
3418       dump_printf (MSG_NOTE,
3419                    "cost model: Adding cost of checks for loop "
3420                    "versioning niters.\n");
3421     }
3422
      /* Any kind of versioning additionally costs one taken branch.  */
3423   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3424     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3425                           vect_prologue);
3426
3427   /* Count statements in scalar loop.  Using this as scalar cost for a single
3428      iteration for now.
3429
3430      TODO: Add outer loop support.
3431
3432      TODO: Consider assigning different costs to different scalar
3433      statements.  */
3434
3435   scalar_single_iter_cost
3436     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3437
3438   /* Add additional cost for the peeled instructions in prologue and epilogue
3439      loop.
3440
3441      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3442      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3443
3444      TODO: Build an expression that represents peel_iters for prologue and
3445      epilogue to be used in a run-time test.  */
3446
3447   if (npeel  < 0)
3448     {
3449       peel_iters_prologue = vf/2;
3450       dump_printf (MSG_NOTE, "cost model: "
3451                    "prologue peel iters set to vf/2.\n");
3452
3453       /* If peeling for alignment is unknown, loop bound of main loop becomes
3454          unknown.  */
3455       peel_iters_epilogue = vf/2;
3456       dump_printf (MSG_NOTE, "cost model: "
3457                    "epilogue peel iters set to vf/2 because "
3458                    "peeling for alignment is unknown.\n");
3459
3460       /* If peeled iterations are unknown, count a taken branch and a not taken
3461          branch per peeled loop. Even if scalar loop iterations are known,
3462          vector iterations are not known since peeled prologue iterations are
3463          not known. Hence guards remain the same.  */
3464       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3465                             NULL, 0, vect_prologue);
3466       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3467                             NULL, 0, vect_prologue);
3468       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3469                             NULL, 0, vect_epilogue);
3470       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3471                             NULL, 0, vect_epilogue);
3472       stmt_info_for_cost *si;
3473       int j;
3474       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3475         {
3476           struct _stmt_vec_info *stmt_info
3477             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3478           (void) add_stmt_cost (target_cost_data,
3479                                 si->count * peel_iters_prologue,
3480                                 si->kind, stmt_info, si->misalign,
3481                                 vect_prologue);
3482           (void) add_stmt_cost (target_cost_data,
3483                                 si->count * peel_iters_epilogue,
3484                                 si->kind, stmt_info, si->misalign,
3485                                 vect_epilogue);
3486         }
3487     }
3488   else
3489     {
3490       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3491       stmt_info_for_cost *si;
3492       int j;
3493       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3494
3495       prologue_cost_vec.create (2);
3496       epilogue_cost_vec.create (2);
3497       peel_iters_prologue = npeel;
3498
          /* Only the cost vectors and *PEEL_ITERS_EPILOGUE are used here; the
             returned cost is ignored.  */
3499       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3500                                           &peel_iters_epilogue,
3501                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3502                                             (loop_vinfo),
3503                                           &prologue_cost_vec,
3504                                           &epilogue_cost_vec);
3505
3506       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3507         {
3508           struct _stmt_vec_info *stmt_info
3509             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3510           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3511                                 si->misalign, vect_prologue);
3512         }
3513
3514       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3515         {
3516           struct _stmt_vec_info *stmt_info
3517             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3518           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3519                                 si->misalign, vect_epilogue);
3520         }
3521
3522       prologue_cost_vec.release ();
3523       epilogue_cost_vec.release ();
3524     }
3525
3526   /* FORNOW: The scalar outside cost is incremented in one of the
3527      following ways:
3528
3529      1. The vectorizer checks for alignment and aliasing and generates
3530      a condition that allows dynamic vectorization.  A cost model
3531      check is ANDED with the versioning condition.  Hence scalar code
3532      path now has the added cost of the versioning check.
3533
3534        if (cost > th & versioning_check)
3535          jmp to vector code
3536
3537      Hence run-time scalar is incremented by not-taken branch cost.
3538
3539      2. The vectorizer then checks if a prologue is required.  If the
3540      cost model check was not done before during versioning, it has to
3541      be done before the prologue check.
3542
3543        if (cost <= th)
3544          prologue = scalar_iters
3545        if (prologue == 0)
3546          jmp to vector code
3547        else
3548          execute prologue
3549        if (prologue == num_iters)
3550          go to exit
3551
3552      Hence the run-time scalar cost is incremented by a taken branch,
3553      plus a not-taken branch, plus a taken branch cost.
3554
3555      3. The vectorizer then checks if an epilogue is required.  If the
3556      cost model check was not done before during prologue check, it
3557      has to be done with the epilogue check.
3558
3559        if (prologue == 0)
3560          jmp to vector code
3561        else
3562          execute prologue
3563        if (prologue == num_iters)
3564          go to exit
3565        vector code:
3566          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3567            jmp to epilogue
3568
3569      Hence the run-time scalar cost should be incremented by 2 taken
3570      branches.
3571
3572      TODO: The back end may reorder the BBS's differently and reverse
3573      conditions/branch directions.  Change the estimates below to
3574      something more reasonable.  */
3575
3576   /* If the number of iterations is known and we do not do versioning, we can
3577      decide whether to vectorize at compile time.  Hence the scalar version
3578      does not carry cost model guard costs.  */
3579   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3580       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3581     {
3582       /* Cost model check occurs at versioning.  */
3583       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3584         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3585       else
3586         {
3587           /* Cost model check occurs at prologue generation.  */
3588           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3589             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3590               + vect_get_stmt_cost (cond_branch_not_taken);
3591           /* Cost model check occurs at epilogue generation.  */
3592           else
3593             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3594         }
3595     }
3596
3597   /* Complete the target-specific cost calculations.  */
3598   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3599                &vec_inside_cost, &vec_epilogue_cost);
3600
3601   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3602
3603   if (dump_enabled_p ())
3604     {
3605       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3606       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3607                    vec_inside_cost);
3608       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3609                    vec_prologue_cost);
3610       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3611                    vec_epilogue_cost);
3612       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3613                    scalar_single_iter_cost);
3614       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3615                    scalar_outside_cost);
3616       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3617                    vec_outside_cost);
3618       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3619                    peel_iters_prologue);
3620       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3621                    peel_iters_epilogue);
3622     }
3623
3624   /* Calculate number of iterations required to make the vector version
3625      profitable, relative to the loop bodies only.  The following condition
3626      must hold true:
3627      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3628      where
3629      SIC = scalar iteration cost, VIC = vector iteration cost,
3630      VOC = vector outside cost, VF = vectorization factor,
3631      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3632      SOC = scalar outside cost for run time cost model check.  */
3633
3634   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3635     {
3636       if (vec_outside_cost <= 0)
3637         min_profitable_iters = 0;
3638       else
3639         {
               /* VEC_INSIDE_COST is unsigned: convert it to int so that the
                  computation is done in signed arithmetic.  Otherwise a
                  negative numerator would turn into a huge unsigned value
                  before the division and yield a bogus large threshold.  */
3640           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3641                                   - (int) vec_inside_cost * peel_iters_prologue
3642                                   - (int) vec_inside_cost * peel_iters_epilogue)
3643                                  / ((scalar_single_iter_cost * vf)
3644                                     - (int) vec_inside_cost);
3645
               /* The division truncates; bump the result if the scalar
                  version is still not more expensive at that count.  */
3646           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3647               <= (((int) vec_inside_cost * min_profitable_iters)
3648                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3649             min_profitable_iters++;
3650         }
3651     }
3652   /* vector version will never be profitable.  */
3653   else
3654     {
3655       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3656         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3657                     "did not happen for a simd loop");
3658
3659       if (dump_enabled_p ())
3660         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3661                          "cost model: the vector iteration cost = %d "
3662                          "divided by the scalar iteration cost = %d "
3663                          "is greater or equal to the vectorization factor = %d"
3664                          ".\n",
3665                          vec_inside_cost, scalar_single_iter_cost, vf);
3666       *ret_min_profitable_niters = -1;
3667       *ret_min_profitable_estimate = -1;
3668       return;
3669     }
3670
3671   dump_printf (MSG_NOTE,
3672                "  Calculated minimum iters for profitability: %d\n",
3673                min_profitable_iters);
3674
3675   /* We want the vectorized loop to execute at least once.  */
3676   if (min_profitable_iters < (vf + peel_iters_prologue))
3677     min_profitable_iters = vf + peel_iters_prologue;
3678
3679   if (dump_enabled_p ())
3680     dump_printf_loc (MSG_NOTE, vect_location,
3681                      "  Runtime profitability threshold = %d\n",
3682                      min_profitable_iters);
3683
3684   *ret_min_profitable_niters = min_profitable_iters;
3685
3686   /* Calculate number of iterations required to make the vector version
3687      profitable, relative to the loop bodies only.
3688
3689      Non-vectorized variant is SIC * niters and it must win over vector
3690      variant on the expected loop trip count.  The following condition must hold true:
3691      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3692
3693   if (vec_outside_cost <= 0)
3694     min_profitable_estimate = 0;
3695   else
3696     {
           /* Signed arithmetic, as for MIN_PROFITABLE_ITERS above; the
              divisor is positive at this point.  */
3697       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3698                                  - (int) vec_inside_cost * peel_iters_prologue
3699                                  - (int) vec_inside_cost * peel_iters_epilogue)
3700                                  / ((scalar_single_iter_cost * vf)
3701                                    - (int) vec_inside_cost);
3702     }
3703   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3704   if (dump_enabled_p ())
3705     dump_printf_loc (MSG_NOTE, vect_location,
3706                      "  Static estimate profitability threshold = %d\n",
3707                      min_profitable_estimate);
3708
3709   *ret_min_profitable_estimate = min_profitable_estimate;
3710 }
3711
3712 /* Push onto SEL the NELT indices OFFSET, OFFSET + 1, ..., each masked with
3713    2 * NELT - 1: a vec_perm mask equivalent to a vec_shr by OFFSET elements.  */
3714 static void
3715 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3716                               vec_perm_indices *sel)
3717 {
3718   /* Applied to every index before it is pushed.  */
3719   const unsigned int index_mask = 2 * nelt - 1;
3720   for (unsigned int lane = 0; lane < nelt; ++lane)
3721     sel->quick_push ((lane + offset) & index_mask);
3722 }
3723
3724 /* Return true if the target supports whole-vector shifts for vectors of mode
3725    MODE: _either_ the platform handles vec_shr_optab, _or_ it supports
3726    vec_perm_const with masks for all necessary shift amounts.  */
3727 static bool
3728 have_whole_vector_shift (machine_mode mode)
3729 {
3730   /* A handler for vec_shr_optab is sufficient on its own.  */
3731   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3732     return true;
3733
3734   /* Otherwise each shift has to be available as a constant permute.  */
3735   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3736     return false;
3737   const unsigned int nunits = GET_MODE_NUNITS (mode);
3738   auto_vec_perm_indices perm (nunits);
3739   for (unsigned int shift = nunits / 2; shift > 0; shift /= 2)
3740     {
3741       perm.truncate (0);
3742       calc_vec_perm_mask_for_shift (shift, nunits, &perm);
3743       if (!can_vec_perm_p (mode, false, &perm))
3744         return false;
3745     }
3746   return true;
3747 }
3748
3749 /* TODO: The vect_model_*_cost and vectorizable_* functions depend closely
3750    on each other.  Design this better to avoid maintenance issues.  */
3751
3752 /* Function vect_model_reduction_cost.
3753
3754    Account for the cost of vectorizing the reduction STMT_INFO with NCOPIES
3755    vector copies: the reduction statements in the loop body, the initial
3756    definition before the loop and the epilogue code.  A REDUC_FN other than
        IFN_LAST means the vector is reduced in one statement in the epilogue.  */
3757
3758 static void
3759 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3760                            int ncopies)
3761 {
3762   int prologue_cost = 0, epilogue_cost = 0;
3763   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3764   const bool cond_reduc_p
3765     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION;
3766
3767   /* Use the cost data of the loop if STMT_INFO has a loop_vec_info, and
3768      that of its bb_vec_info otherwise.  */
3769   struct loop *loop = loop_vinfo ? LOOP_VINFO_LOOP (loop_vinfo) : NULL;
3770   void *target_cost_data;
3771   if (!loop_vinfo)
3772     target_cost_data
3773       = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3774   else
3775     target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3776
3777   /* Cost of reduction op inside loop.  Condition reductions generate two
3778      reductions in the loop.  */
3779   if (cond_reduc_p)
3780     ncopies *= 2;
3781
3782   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3783                                         stmt_info, 0, vect_body);
3784
3785   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
3786   machine_mode mode = TYPE_MODE (vectype);
3787
3788   /* Use the related statement of STMT_INFO when there is one, and the
3789      statement of STMT_INFO itself otherwise.  */
3790   gimple *orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3791   if (!orig_stmt)
3792     orig_stmt = STMT_VINFO_STMT (stmt_info);
3793   enum tree_code code = gimple_assign_rhs_code (orig_stmt);
3794
3795   /* Add in cost for initial definition.
3796      For cond reduction we have four vectors: initial index, step, initial
3797      result of the data reduction, initial value of the index reduction.  */
3798   const int prologue_stmts = cond_reduc_p ? 4 : 1;
3799   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3800                                   scalar_to_vec, stmt_info, 0,
3801                                   vect_prologue);
3802
3803   /* Determine cost of epilogue code.  No epilogue cost is added when
3804      nested_in_vect_loop_p holds for ORIG_STMT in LOOP.  */
3805   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3806     {
3807       if (reduc_fn != IFN_LAST && cond_reduc_p)
3808         {
3809           /* An EQ stmt and an COND_EXPR stmt.  */
3810           epilogue_cost += add_stmt_cost (target_cost_data, 2,
3811                                           vector_stmt, stmt_info, 0,
3812                                           vect_epilogue);
3813           /* Reduction of the max index and a reduction of the found
3814              values.  */
3815           epilogue_cost += add_stmt_cost (target_cost_data, 2,
3816                                           vec_to_scalar, stmt_info, 0,
3817                                           vect_epilogue);
3818           /* A broadcast of the max value.  */
3819           epilogue_cost += add_stmt_cost (target_cost_data, 1,
3820                                           scalar_to_vec, stmt_info, 0,
3821                                           vect_epilogue);
3822         }
3823       else if (reduc_fn != IFN_LAST)
3824         {
3825           /* We have a reduction operator that will reduce the vector in
3826              one statement.  Also requires scalar extract.  */
3827           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3828                                           stmt_info, 0, vect_epilogue);
3829           epilogue_cost += add_stmt_cost (target_cost_data, 1,
3830                                           vec_to_scalar, stmt_info, 0,
3831                                           vect_epilogue);
3832         }
3833       else if (cond_reduc_p)
3834         {
3835           const unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
3836           /* Extraction of scalar elements.  */
3837           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits,
3838                                           vec_to_scalar, stmt_info, 0,
3839                                           vect_epilogue);
3840           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3841           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits - 3,
3842                                           scalar_stmt, stmt_info, 0,
3843                                           vect_epilogue);
3844         }
3845       else
3846         {
3847           /* Number of vector elements, from the size of the vector type
3848              and the size of the type of ORIG_STMT's lhs.  */
3849           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3850           tree lhs_type = TREE_TYPE (gimple_assign_lhs (orig_stmt));
3851           int element_bitsize = tree_to_uhwi (TYPE_SIZE (lhs_type));
3852           int nelements = vec_size_in_bits / element_bitsize;
3853
3854           /* A COND_EXPR is looked up as MAX_EXPR.  */
3855           optab reduc_optab
3856             = optab_for_tree_code (code == COND_EXPR ? MAX_EXPR : code,
3857                                    vectype, optab_default);
3858
3859           bool shift_reduc_p = (reduc_optab != unknown_optab
3860                                 && VECTOR_MODE_P (mode)
3861                                 && (optab_handler (reduc_optab, mode)
3862                                     != CODE_FOR_nothing)
3863                                 && have_whole_vector_shift (mode));
3864
3865           if (!shift_reduc_p)
3866             /* Use extracts and reduction op for final reduction.  For N
3867                elements, we have N extracts and N-1 reduction ops.  */
3868             epilogue_cost += add_stmt_cost (target_cost_data,
3869                                             nelements + nelements - 1,
3870                                             vector_stmt, stmt_info, 0,
3871                                             vect_epilogue);
3872           else
3873             {
3874               /* We have a whole vector shift available.  Final reduction
3875                  via vector shifts and the reduction operator.  Also
3876                  requires scalar extract.  */
3877               epilogue_cost += add_stmt_cost (target_cost_data,
3878                                               exact_log2 (nelements) * 2,
3879                                               vector_stmt, stmt_info, 0,
3880                                               vect_epilogue);
3881               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3882                                               vec_to_scalar, stmt_info, 0,
3883                                               vect_epilogue);
3884             }
3885         }
3886     }
3887
3888   /* The sums kept in the local variables are only used for the dump.  */
3889   if (!dump_enabled_p ())
3890     return;
3891
3892   dump_printf (MSG_NOTE,
3893                "vect_model_reduction_cost: inside_cost = %d, "
3894                "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3895                prologue_cost, epilogue_cost);
3896 }
3897
3898
3899 /* Function vect_model_induction_cost.
3900
3901    Models cost for induction operations.

        STMT_INFO is the stmt_vec_info of the induction being costed; the
        costs are recorded in the target cost data of its loop_vec_info.
        NCOPIES is the number of vector statements charged to the loop body.

        Nothing is recorded for a pure-SLP statement.  The function returns
        no value: the two costs obtained from add_stmt_cost are only printed
        when dumping is enabled.  */
3902
3903 static void
3904 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3905 {
3906   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3907   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3908   unsigned inside_cost, prologue_cost;
3909
       /* Pure-SLP statements are not costed by this function.  */
3910   if (PURE_SLP_STMT (stmt_info))
3911     return;
3912
3913   /* loop cost for vec_loop: NCOPIES vector statements in the loop
          body.  */
3914   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3915                                stmt_info, 0, vect_body);
3916
3917   /* prologue cost for vec_init and vec_step: two scalar_to_vec
          operations, recorded once in the prologue regardless of NCOPIES.  */
3918   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3919                                  stmt_info, 0, vect_prologue);
3920
3921   if (dump_enabled_p ())
3922     dump_printf_loc (MSG_NOTE, vect_location,
3923                      "vect_model_induction_cost: inside_cost = %d, "
3924                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3925 }
3926
3927
3928
3929 /* Function get_initial_def_for_reduction
3930
3931    Input:
3932    STMT - a stmt that performs a reduction operation in the loop.
3933    INIT_VAL - the initial value of the reduction variable.  Its type must
             be a pointer, integral or scalar floating-point type; the vector
             type of the result is derived from it.
3934
3935    Output:
3936    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3937         of the reduction (used for adjusting the epilog - see below).
             The pointer itself may be NULL, which selects Option2 below.
3938    Return a vector variable, initialized according to the operation that STMT
3939         performs. This vector will be used as the initial value of the
3940         vector of partial results.
3941
        If ADJUSTMENT_DEF is not NULL and STMT is nested in the loop being
        vectorized, neither option below is used: *ADJUSTMENT_DEF is set to
        NULL and the result is either a new vector destination variable for
        INIT_VAL (when INIT_VAL is defined by a double-reduction phi inside
        the loop) or the vector def obtained for INIT_VAL as an operand of
        STMT.

3942    Option1 (adjust in epilog): Initialize the vector as follows:
3943      add/bit or/xor:    [0,0,...,0,0]
3944      mult:              [1,1,...,1,1]
          bit and:           [-1,-1,...,-1,-1]
3945      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3946    and when necessary (e.g. add/mult case) let the caller know
3947    that it needs to adjust the result by init_val.  For
        min/max/cond_expr no adjustment is needed and *ADJUSTMENT_DEF is set
        to NULL.  ("add" also covers minus, widening sum, dot product and
        SAD.)
3948
3949    Option2: Initialize the vector as follows:
3950      add/bit or/xor:    [init_val,0,0,...,0]
3951      mult:              [init_val,1,1,...,1]
          bit and:           [init_val,-1,-1,...,-1]
3952      min/max/cond_expr: [init_val,init_val,...,init_val]
3953    and no adjustments are needed.
3954
3955    For example, for the following code:
3956
3957    s = init_val;
3958    for (i=0;i<n;i++)
3959      s = s + a[i];
3960
3961    STMT is 's = s + a[i]', and the reduction variable is 's'.
3962    For a vector of 4 units, we want to return either [init_val,0,0,0],
3963    or [0,0,0,0] and let the caller know that it needs to adjust
3964    the result at the end by 'init_val'.
3965
3966    FORNOW, we use the 'adjust in epilog' scheme (Option1) if ADJUSTMENT_DEF
3967    is not NULL, because this way the initialization vector is simpler (same
3968    element in all entries), and Option2 otherwise.
3969
3970    A cost model should help decide between these two schemes.

        Statements built here to create the initial vector are inserted on
        the preheader edge of the loop of STMT's loop_vec_info.  */
3971
3972 tree
3973 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3974                                tree *adjustment_def)
3975 {
3976   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3977   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3978   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3979   tree scalar_type = TREE_TYPE (init_val);
3980   tree vectype = get_vectype_for_scalar_type (scalar_type);
3981   int nunits;
3982   enum tree_code code = gimple_assign_rhs_code (stmt);
3983   tree def_for_init;
3984   tree init_def;
3985   int i;
3986   bool nested_in_vect_loop = false;
       /* Default neutral element (zero); overridden below for MULT_EXPR
          and, for the integer value only, BIT_AND_EXPR.  */
3987   REAL_VALUE_TYPE real_init_val = dconst0;
3988   int int_init_val = 0;
3989   gimple *def_stmt = NULL;
       /* Statements created while building the initial vector; emitted on
          the preheader edge at the end.  */
3990   gimple_seq stmts = NULL;
3991
3992   gcc_assert (vectype);
3993   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3994
3995   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3996               || SCALAR_FLOAT_TYPE_P (scalar_type));
3997
       /* STMT is either nested in the loop being vectorized or must belong
          directly to that loop.  */
3998   if (nested_in_vect_loop_p (loop, stmt))
3999     nested_in_vect_loop = true;
4000   else
4001     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4002
4003   /* In case of double reduction we only create a vector variable to be put
4004      in the reduction phi node.  The actual statement creation is done in
4005      vect_create_epilog_for_reduction.  */
4006   if (adjustment_def && nested_in_vect_loop
4007       && TREE_CODE (init_val) == SSA_NAME
4008       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4009       && gimple_code (def_stmt) == GIMPLE_PHI
4010       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4011       && vinfo_for_stmt (def_stmt)
4012       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4013           == vect_double_reduction_def)
4014     {
4015       *adjustment_def = NULL;
4016       return vect_create_destination_var (init_val, vectype);
4017     }
4018
4019   /* In case of a nested reduction do not use an adjustment def as
4020      that case is not supported by the epilogue generation correctly
4021      if ncopies is not one.  */
4022   if (adjustment_def && nested_in_vect_loop)
4023     {
4024       *adjustment_def = NULL;
4025       return vect_get_vec_def_for_operand (init_val, stmt);
4026     }
4027
4028   switch (code)
4029     {
4030     case WIDEN_SUM_EXPR:
4031     case DOT_PROD_EXPR:
4032     case SAD_EXPR:
4033     case PLUS_EXPR:
4034     case MINUS_EXPR:
4035     case BIT_IOR_EXPR:
4036     case BIT_XOR_EXPR:
4037     case MULT_EXPR:
4038     case BIT_AND_EXPR:
4039       {
4040         /* ADJUSTMENT_DEF is NULL when called from
4041            vect_create_epilog_for_reduction to vectorize double reduction.  */
4042         if (adjustment_def)
4043           *adjustment_def = init_val;
4044
             /* Multiplication starts from one rather than zero.  */
4045         if (code == MULT_EXPR)
4046           {
4047             real_init_val = dconst1;
4048             int_init_val = 1;
4049           }
4050
             /* Bitwise AND starts from -1.  */
4051         if (code == BIT_AND_EXPR)
4052           int_init_val = -1;
4053
             /* Build the neutral scalar in the type of INIT_VAL.  */
4054         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4055           def_for_init = build_real (scalar_type, real_init_val);
4056         else
4057           def_for_init = build_int_cst (scalar_type, int_init_val);
4058
4059         if (adjustment_def)
4060           /* Option1: the first element is '0' or '1' as well.  */
4061           init_def = gimple_build_vector_from_val (&stmts, vectype,
4062                                                    def_for_init);
4063         else
4064           {
4065             /* Option2: the first element is INIT_VAL.  */
4066             auto_vec<tree, 32> elts (nunits);
4067             elts.quick_push (init_val);
4068             for (i = 1; i < nunits; ++i)
4069               elts.quick_push (def_for_init);
4070             init_def = gimple_build_vector (&stmts, vectype, elts);
4071           }
4072       }
4073       break;
4074
4075     case MIN_EXPR:
4076     case MAX_EXPR:
4077     case COND_EXPR:
4078       {
4079         if (adjustment_def)
4080           {
                 /* These codes need no epilog adjustment.  Unless this is a
                    COND_REDUCTION, use the vector def obtained for INIT_VAL
                    as an operand of STMT.  */
4081             *adjustment_def = NULL_TREE;
4082             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4083               {
4084                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4085                 break;
4086               }
4087           }
             /* Otherwise (ADJUSTMENT_DEF is NULL, or this is a
                COND_REDUCTION) convert INIT_VAL to the element type of
                VECTYPE and use it for every element.  */
4088         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4089         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4090       }
4091       break;
4092
4093     default:
4094       gcc_unreachable ();
4095     }
4096
       /* Emit any statements built above on the loop preheader edge.  */
4097   if (stmts)
4098     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4099   return init_def;
4100 }
4101
4102 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4103    NUMBER_OF_VECTORS is the number of vector defs to create.

        The scalar stmts of SLP_NODE are used as the arguments of
        PHI_ARG_DEF_FROM_EDGE on the preheader edge of the loop containing
        the first of them; that yields the scalar initial values.

        VEC_OPRNDS receives the created vector defs.  They are added with
        quick_push, so presumably the caller must already have reserved room
        for NUMBER_OF_VECTORS elements - TODO confirm.

        CODE is the reduction operation.  It selects the neutral element used
        for vector elements that must not contribute an initial value: zero
        for WIDEN_SUM_EXPR, DOT_PROD_EXPR, SAD_EXPR, PLUS_EXPR, MINUS_EXPR,
        BIT_IOR_EXPR and BIT_XOR_EXPR, one for MULT_EXPR and all-ones for
        BIT_AND_EXPR.  Other codes have no neutral element; the initial
        values themselves are then used for every copy.

        REDUC_CHAIN is true for a reduction chain.  In that case only the
        first scalar stmt contributes its initial value and every other
        element is the neutral element; for MIN_EXPR and MAX_EXPR the initial
        value of the first stmt is used as the neutral element.  Codes other
        than those listed above are asserted not to occur in a chain.

        Constructor statements are inserted on the preheader edge.  */
4104
4105 static void
4106 get_initial_defs_for_reduction (slp_tree slp_node,
4107                                 vec<tree> *vec_oprnds,
4108                                 unsigned int number_of_vectors,
4109                                 enum tree_code code, bool reduc_chain)
4110 {
4111   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4112   gimple *stmt = stmts[0];
4113   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4114   unsigned nunits;
4115   unsigned j, number_of_places_left_in_vector;
4116   tree vector_type, scalar_type;
4117   tree vop;
4118   int group_size = stmts.length ();
4119   unsigned int vec_num, i;
4120   unsigned number_of_copies = 1;
       /* Vectors are collected here in creation order and copied to
          VEC_OPRNDS in reverse order further down.  */
4121   vec<tree> voprnds;
4122   voprnds.create (number_of_vectors);
4123   tree neutral_op = NULL;
4124   struct loop *loop;
4125
4126   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4127   scalar_type = TREE_TYPE (vector_type);
4128   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4129
4130   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4131
4132   loop = (gimple_bb (stmt))->loop_father;
4133   gcc_assert (loop);
4134   edge pe = loop_preheader_edge (loop);
4135
4136   /* op is the reduction operand of the first stmt already.  */
4137   /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4138      we need either neutral operands or the original operands.  See
4139      get_initial_def_for_reduction() for details.  */
4140   switch (code)
4141     {
4142     case WIDEN_SUM_EXPR:
4143     case DOT_PROD_EXPR:
4144     case SAD_EXPR:
4145     case PLUS_EXPR:
4146     case MINUS_EXPR:
4147     case BIT_IOR_EXPR:
4148     case BIT_XOR_EXPR:
4149       neutral_op = build_zero_cst (scalar_type);
4150       break;
4151
4152     case MULT_EXPR:
4153       neutral_op = build_one_cst (scalar_type);
4154       break;
4155
4156     case BIT_AND_EXPR:
4157       neutral_op = build_all_ones_cst (scalar_type);
4158       break;
4159
4160     /* For MIN/MAX we don't have an easy neutral operand but
4161        the initial values can be used fine here.  Only for
4162        a reduction chain we have to force a neutral element.  */
4163     case MAX_EXPR:
4164     case MIN_EXPR:
4165       if (! reduc_chain)
4166         neutral_op = NULL;
4167       else
             /* STMT is still the first scalar stmt of the group here.  */
4168         neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4169       break;
4170
4171     default:
4172       gcc_assert (! reduc_chain);
4173       neutral_op = NULL;
4174     }
4175
4176   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4177      created vectors. It is greater than 1 if unrolling is performed.
4178
4179      For example, we have two scalar operands, s1 and s2 (e.g., group of
4180      strided accesses of size two), while NUNITS is four (i.e., four scalars
4181      of this type can be packed in a vector).  The output vector will contain
4182      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4183      will be 2).
4184
4185      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4186      containing the operands.
4187
4188      For example, NUNITS is four as before, and the group size is 8
4189      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4190      {s5, s6, s7, s8}.  */
4191
4192   number_of_copies = nunits * number_of_vectors / group_size;
4193
4194   number_of_places_left_in_vector = nunits;
       /* Scratch array of NUNITS elements, refilled for each vector.  */
4195   auto_vec<tree, 32> elts (nunits);
4196   elts.quick_grow (nunits);
4197   for (j = 0; j < number_of_copies; j++)
4198     {
           /* Walk the group from its last stmt to its first and fill the
              current vector from its highest index downwards.  I is
              unsigned, so the loop presumably ends when decrementing past
              zero makes iterate () fail - TODO confirm against vec.h.  */
4199       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4200         {
4201           tree op;
4202           /* Get the def before the loop.  In reduction chain we have only
4203              one initial value.  */
               /* When a neutral element exists, the real initial values are
                  used only in the last copy, and in a reduction chain only
                  for the first stmt of the group.  */
4204           if ((j != (number_of_copies - 1)
4205                || (reduc_chain && i != 0))
4206               && neutral_op)
4207             op = neutral_op;
4208           else
4209             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4210
4211           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4212           number_of_places_left_in_vector--;
4213           elts[number_of_places_left_in_vector] = op;
4214
               /* The vector is full: build it, emit any constructor
                  statements on the preheader edge and start a new one.  */
4215           if (number_of_places_left_in_vector == 0)
4216             {
4217               gimple_seq ctor_seq = NULL;
4218               tree init = gimple_build_vector (&ctor_seq, vector_type, elts);
4219               if (ctor_seq != NULL)
4220                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4221               voprnds.quick_push (init);
4222
4223               number_of_places_left_in_vector = nunits;
4224             }
4225         }
4226     }
4227
4228   /* Since the vectors are created in the reverse order, we should invert
4229      them.  */
4230   vec_num = voprnds.length ();
4231   for (j = vec_num; j != 0; j--)
4232     {
4233       vop = voprnds[j - 1];
4234       vec_oprnds->quick_push (vop);
4235     }
4236
4237   voprnds.release ();
4238
4239   /* In case that VF is greater than the unrolling factor needed for the SLP
4240      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4241      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4242      to replicate the vectors.  */
       /* With a neutral element the missing vectors are all-neutral vectors
          (built once and reused); otherwise the first VEC_NUM vectors are
          pushed again on every round.
          NOTE(review): without a neutral element each round pushes up to
          VEC_NUM vectors at once, so this relies on NUMBER_OF_VECTORS
          fitting that pattern and on VEC_NUM being nonzero - confirm
          against the callers.  */
4243   tree neutral_vec = NULL;
4244   while (number_of_vectors > vec_oprnds->length ())
4245     {
4246       if (neutral_op)
4247         {
4248           if (!neutral_vec)
4249             {
4250               gimple_seq ctor_seq = NULL;
4251               neutral_vec = gimple_build_vector_from_val
4252                 (&ctor_seq, vector_type, neutral_op);
4253               if (ctor_seq != NULL)
4254                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4255             }
4256           vec_oprnds->quick_push (neutral_vec);
4257         }
4258       else
4259         {
4260           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4261             vec_oprnds->quick_push (vop);
4262         }
4263     }
4264 }
4265
4266
4267 /* Function vect_create_epilog_for_reduction
4268
4269    Create code at the loop-epilog to finalize the result of a reduction
4270    computation.
4271
4272    VECT_DEFS is the list of vectors of partial results, i.e., the lhs's of
4273      the vector reduction statements.
4274    STMT is the scalar reduction stmt that is being vectorized.
4275    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4276      number of elements that we can fit in a vectype (nunits).  In this case
4277      we have to generate more than one vector stmt - i.e - we need to "unroll"
4278      the vector stmt by a factor VF/nunits.  For more details see documentation
4279      in vectorizable_operation.
4280    REDUC_FN is the internal function for the epilog reduction.
4281    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4282      computation.
4283    REDUC_INDEX is the index of the operand in the right hand side of the
4284      statement that is defined by REDUCTION_PHI.
4285    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4286    SLP_NODE is an SLP node containing a group of reduction statements. The
4287      first one in this group is STMT.
4288
4289    This function:
4290    1. Creates the reduction def-use cycles: sets the arguments for
4291       REDUCTION_PHIS:
4292       The loop-entry argument is the vectorized initial-value of the reduction.
4293       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4294       sums.
4295    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4296       by calling the function specified by REDUC_FN if available, or by
4297       other means (whole-vector shifts or a scalar loop).
4298       The function also creates a new phi node at the loop exit to preserve
4299       loop-closed form, as illustrated below.
4300
4301      The flow at the entry to this function:
4302
4303         loop:
4304           vec_def = phi <null, null>            # REDUCTION_PHI
4305           VECT_DEF = vector_stmt                # vectorized form of STMT
4306           s_loop = scalar_stmt                  # (scalar) STMT
4307         loop_exit:
4308           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4309           use <s_out0>
4310           use <s_out0>
4311
4312      The above is transformed by this function into:
4313
4314         loop:
4315           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4316           VECT_DEF = vector_stmt                # vectorized form of STMT
4317           s_loop = scalar_stmt                  # (scalar) STMT
4318         loop_exit:
4319           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4320           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4321           v_out2 = reduce <v_out1>
4322           s_out3 = extract_field <v_out2, 0>
4323           s_out4 = adjust_result <s_out3>
4324           use <s_out4>
4325           use <s_out4>
4326 */
4327
4328 static void
4329 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4330                                   gimple *reduc_def_stmt,
4331                                   int ncopies, internal_fn reduc_fn,
4332                                   vec<gimple *> reduction_phis,
4333                                   bool double_reduc,
4334                                   slp_tree slp_node,
4335                                   slp_instance slp_node_instance)
4336 {
4337   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4338   stmt_vec_info prev_phi_info;
4339   tree vectype;
4340   machine_mode mode;
4341   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4342   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4343   basic_block exit_bb;
4344   tree scalar_dest;
4345   tree scalar_type;
4346   gimple *new_phi = NULL, *phi;
4347   gimple_stmt_iterator exit_gsi;
4348   tree vec_dest;
4349   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4350   gimple *epilog_stmt = NULL;
4351   enum tree_code code = gimple_assign_rhs_code (stmt);
4352   gimple *exit_phi;
4353   tree bitsize;
4354   tree adjustment_def = NULL;
4355   tree vec_initial_def = NULL;
4356   tree expr, def, initial_def = NULL;
4357   tree orig_name, scalar_result;
4358   imm_use_iterator imm_iter, phi_imm_iter;
4359   use_operand_p use_p, phi_use_p;
4360   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4361   bool nested_in_vect_loop = false;
4362   auto_vec<gimple *> new_phis;
4363   auto_vec<gimple *> inner_phis;
4364   enum vect_def_type dt = vect_unknown_def_type;
4365   int j, i;
4366   auto_vec<tree> scalar_results;
4367   unsigned int group_size = 1, k, ratio;
4368   auto_vec<tree> vec_initial_defs;
4369   auto_vec<gimple *> phis;
4370   bool slp_reduc = false;
4371   tree new_phi_result;
4372   gimple *inner_phi = NULL;
4373   tree induction_index = NULL_TREE;
4374
4375   if (slp_node)
4376     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4377
4378   if (nested_in_vect_loop_p (loop, stmt))
4379     {
4380       outer_loop = loop;
4381       loop = loop->inner;
4382       nested_in_vect_loop = true;
4383       gcc_assert (!slp_node);
4384     }
4385
4386   vectype = STMT_VINFO_VECTYPE (stmt_info);
4387   gcc_assert (vectype);
4388   mode = TYPE_MODE (vectype);
4389
4390   /* 1. Create the reduction def-use cycle:
4391      Set the arguments of REDUCTION_PHIS, i.e., transform
4392
4393         loop:
4394           vec_def = phi <null, null>            # REDUCTION_PHI
4395           VECT_DEF = vector_stmt                # vectorized form of STMT
4396           ...
4397
4398      into:
4399
4400         loop:
4401           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4402           VECT_DEF = vector_stmt                # vectorized form of STMT
4403           ...
4404
4405      (in case of SLP, do it for all the phis). */
4406
4407   /* Get the loop-entry arguments.  */
4408   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4409   if (slp_node)
4410     {
4411       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4412       vec_initial_defs.reserve (vec_num);
4413       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4414                                       &vec_initial_defs, vec_num, code,
4415                                       GROUP_FIRST_ELEMENT (stmt_info));
4416     }
4417   else
4418     {
4419       /* Get at the scalar def before the loop, that defines the initial value
4420          of the reduction variable.  */
4421       gimple *def_stmt;
4422       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4423                                            loop_preheader_edge (loop));
4424       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4425       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4426                                                        &adjustment_def);
4427       vec_initial_defs.create (1);
4428       vec_initial_defs.quick_push (vec_initial_def);
4429     }
4430
4431   /* Set phi nodes arguments.  */
4432   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4433     {
4434       tree vec_init_def = vec_initial_defs[i];
4435       tree def = vect_defs[i];
4436       for (j = 0; j < ncopies; j++)
4437         {
4438           if (j != 0)
4439             {
4440               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4441               if (nested_in_vect_loop)
4442                 vec_init_def
4443                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4444                                                     vec_init_def);
4445             }
4446
4447           /* Set the loop-entry arg of the reduction-phi.  */
4448
4449           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4450               == INTEGER_INDUC_COND_REDUCTION)
4451             {
4452               /* Initialise the reduction phi to zero.  This prevents initial
4453                  values of non-zero interferring with the reduction op.  */
4454               gcc_assert (ncopies == 1);
4455               gcc_assert (i == 0);
4456
4457               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4458               tree zero_vec = build_zero_cst (vec_init_def_type);
4459
4460               add_phi_arg (as_a <gphi *> (phi), zero_vec,
4461                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4462             }
4463           else
4464             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4465                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4466
4467           /* Set the loop-latch arg for the reduction-phi.  */
4468           if (j > 0)
4469             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4470
4471           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4472                        UNKNOWN_LOCATION);
4473
4474           if (dump_enabled_p ())
4475             {
4476               dump_printf_loc (MSG_NOTE, vect_location,
4477                                "transform reduction: created def-use cycle: ");
4478               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4479               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4480             }
4481         }
4482     }
4483
4484   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4485      which is updated with the current index of the loop for every match of
4486      the original loop's cond_expr (VEC_STMT).  This results in a vector
4487      containing the last time the condition passed for that vector lane.
4488      The first match will be a 1 to allow 0 to be used for non-matching
4489      indexes.  If there are no matches at all then the vector will be all
4490      zeroes.  */
4491   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4492     {
4493       tree indx_before_incr, indx_after_incr;
4494       int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4495       int k;
4496
4497       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4498       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4499
4500       int scalar_precision
4501         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4502       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4503       tree cr_index_vector_type = build_vector_type
4504         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4505
4506       /* First we create a simple vector induction variable which starts
4507          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4508          vector size (STEP).  */
4509
4510       /* Create a {1,2,3,...} vector.  */
4511       tree_vector_builder vtemp (cr_index_vector_type, 1, 3);
4512       for (k = 0; k < 3; ++k)
4513         vtemp.quick_push (build_int_cst (cr_index_scalar_type, k + 1));
4514       tree series_vect = vtemp.build ();
4515
4516       /* Create a vector of the step value.  */
4517       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4518       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4519
4520       /* Create an induction variable.  */
4521       gimple_stmt_iterator incr_gsi;
4522       bool insert_after;
4523       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4524       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4525                  insert_after, &indx_before_incr, &indx_after_incr);
4526
4527       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4528          filled with zeros (VEC_ZERO).  */
4529
4530       /* Create a vector of 0s.  */
4531       tree zero = build_zero_cst (cr_index_scalar_type);
4532       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4533
4534       /* Create a vector phi node.  */
4535       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4536       new_phi = create_phi_node (new_phi_tree, loop->header);
4537       set_vinfo_for_stmt (new_phi,
4538                           new_stmt_vec_info (new_phi, loop_vinfo));
4539       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4540                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4541
4542       /* Now take the condition from the loops original cond_expr
4543          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4544          every match uses values from the induction variable
4545          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4546          (NEW_PHI_TREE).
4547          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4548          the new cond_expr (INDEX_COND_EXPR).  */
4549
4550       /* Duplicate the condition from vec_stmt.  */
4551       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4552
4553       /* Create a conditional, where the condition is taken from vec_stmt
4554          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4555          else is the phi (NEW_PHI_TREE).  */
4556       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4557                                      ccompare, indx_before_incr,
4558                                      new_phi_tree);
4559       induction_index = make_ssa_name (cr_index_vector_type);
4560       gimple *index_condition = gimple_build_assign (induction_index,
4561                                                      index_cond_expr);
4562       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4563       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4564                                                         loop_vinfo);
4565       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4566       set_vinfo_for_stmt (index_condition, index_vec_info);
4567
4568       /* Update the phi with the vec cond.  */
4569       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4570                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4571     }
4572
4573   /* 2. Create epilog code.
4574         The reduction epilog code operates across the elements of the vector
4575         of partial results computed by the vectorized loop.
4576         The reduction epilog code consists of:
4577
4578         step 1: compute the scalar result in a vector (v_out2)
4579         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4580         step 3: adjust the scalar result (s_out3) if needed.
4581
4582         Step 1 can be accomplished using one the following three schemes:
4583           (scheme 1) using reduc_fn, if available.
4584           (scheme 2) using whole-vector shifts, if available.
4585           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4586                      combined.
4587
4588           The overall epilog code looks like this:
4589
4590           s_out0 = phi <s_loop>         # original EXIT_PHI
4591           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4592           v_out2 = reduce <v_out1>              # step 1
4593           s_out3 = extract_field <v_out2, 0>    # step 2
4594           s_out4 = adjust_result <s_out3>       # step 3
4595
4596           (step 3 is optional, and steps 1 and 2 may be combined).
4597           Lastly, the uses of s_out0 are replaced by s_out4.  */
4598
4599
4600   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4601          v_out1 = phi <VECT_DEF>
4602          Store them in NEW_PHIS.  */
4603
4604   exit_bb = single_exit (loop)->dest;
4605   prev_phi_info = NULL;
4606   new_phis.create (vect_defs.length ());
4607   FOR_EACH_VEC_ELT (vect_defs, i, def)
4608     {
4609       for (j = 0; j < ncopies; j++)
4610         {
4611           tree new_def = copy_ssa_name (def);
4612           phi = create_phi_node (new_def, exit_bb);
4613           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4614           if (j == 0)
4615             new_phis.quick_push (phi);
4616           else
4617             {
4618               def = vect_get_vec_def_for_stmt_copy (dt, def);
4619               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4620             }
4621
4622           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4623           prev_phi_info = vinfo_for_stmt (phi);
4624         }
4625     }
4626
4627   /* The epilogue is created for the outer-loop, i.e., for the loop being
4628      vectorized.  Create exit phis for the outer loop.  */
4629   if (double_reduc)
4630     {
4631       loop = outer_loop;
4632       exit_bb = single_exit (loop)->dest;
4633       inner_phis.create (vect_defs.length ());
4634       FOR_EACH_VEC_ELT (new_phis, i, phi)
4635         {
4636           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4637           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4638           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4639                            PHI_RESULT (phi));
4640           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4641                                                             loop_vinfo));
4642           inner_phis.quick_push (phi);
4643           new_phis[i] = outer_phi;
4644           prev_phi_info = vinfo_for_stmt (outer_phi);
4645           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4646             {
4647               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4648               new_result = copy_ssa_name (PHI_RESULT (phi));
4649               outer_phi = create_phi_node (new_result, exit_bb);
4650               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4651                                PHI_RESULT (phi));
4652               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4653                                                                 loop_vinfo));
4654               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4655               prev_phi_info = vinfo_for_stmt (outer_phi);
4656             }
4657         }
4658     }
4659
4660   exit_gsi = gsi_after_labels (exit_bb);
4661
4662   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4663          (i.e. when reduc_fn is not available) and in the final adjustment
4664          code (if needed).  Also get the original scalar reduction variable as
4665          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4666          represents a reduction pattern), the tree-code and scalar-def are
4667          taken from the original stmt that the pattern-stmt (STMT) replaces.
4668          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4669          are taken from STMT.  */
4670
4671   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4672   if (!orig_stmt)
4673     {
4674       /* Regular reduction  */
4675       orig_stmt = stmt;
4676     }
4677   else
4678     {
4679       /* Reduction pattern  */
4680       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4681       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4682       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4683     }
4684
4685   code = gimple_assign_rhs_code (orig_stmt);
4686   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4687      partial results are added and not subtracted.  */
4688   if (code == MINUS_EXPR)
4689     code = PLUS_EXPR;
4690
4691   scalar_dest = gimple_assign_lhs (orig_stmt);
4692   scalar_type = TREE_TYPE (scalar_dest);
4693   scalar_results.create (group_size);
4694   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4695   bitsize = TYPE_SIZE (scalar_type);
4696
4697   /* In case this is a reduction in an inner-loop while vectorizing an outer
4698      loop - we don't need to extract a single scalar result at the end of the
4699      inner-loop (unless it is double reduction, i.e., the use of reduction is
4700      outside the outer-loop).  The final vector of partial results will be used
4701      in the vectorized outer-loop, or reduced to a scalar result at the end of
4702      the outer-loop.  */
4703   if (nested_in_vect_loop && !double_reduc)
4704     goto vect_finalize_reduction;
4705
4706   /* SLP reduction without reduction chain, e.g.,
4707      # a1 = phi <a2, a0>
4708      # b1 = phi <b2, b0>
4709      a2 = operation (a1)
4710      b2 = operation (b1)  */
4711   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4712
4713   /* In case of reduction chain, e.g.,
4714      # a1 = phi <a3, a0>
4715      a2 = operation (a1)
4716      a3 = operation (a2),
4717
4718      we may end up with more than one vector result.  Here we reduce them to
4719      one vector.  */
4720   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4721     {
4722       tree first_vect = PHI_RESULT (new_phis[0]);
4723       gassign *new_vec_stmt = NULL;
4724       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4725       for (k = 1; k < new_phis.length (); k++)
4726         {
4727           gimple *next_phi = new_phis[k];
4728           tree second_vect = PHI_RESULT (next_phi);
4729           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4730           new_vec_stmt = gimple_build_assign (tem, code,
4731                                               first_vect, second_vect);
4732           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4733           first_vect = tem;
4734         }
4735
4736       new_phi_result = first_vect;
4737       if (new_vec_stmt)
4738         {
4739           new_phis.truncate (0);
4740           new_phis.safe_push (new_vec_stmt);
4741         }
4742     }
4743   /* Likewise if we couldn't use a single defuse cycle.  */
4744   else if (ncopies > 1)
4745     {
4746       gcc_assert (new_phis.length () == 1);
4747       tree first_vect = PHI_RESULT (new_phis[0]);
4748       gassign *new_vec_stmt = NULL;
4749       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4750       gimple *next_phi = new_phis[0];
4751       for (int k = 1; k < ncopies; ++k)
4752         {
4753           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4754           tree second_vect = PHI_RESULT (next_phi);
4755           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4756           new_vec_stmt = gimple_build_assign (tem, code,
4757                                               first_vect, second_vect);
4758           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4759           first_vect = tem;
4760         }
4761       new_phi_result = first_vect;
4762       new_phis.truncate (0);
4763       new_phis.safe_push (new_vec_stmt);
4764     }
4765   else
4766     new_phi_result = PHI_RESULT (new_phis[0]);
4767
4768   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4769       && reduc_fn != IFN_LAST)
4770     {
4771       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4772          various data values where the condition matched and another vector
4773          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4774          need to extract the last matching index (which will be the index with
4775          highest value) and use this to index into the data vector.
4776          For the case where there were no matches, the data vector will contain
4777          all default values and the index vector will be all zeros.  */
4778
4779       /* Get various versions of the type of the vector of indexes.  */
4780       tree index_vec_type = TREE_TYPE (induction_index);
4781       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4782       tree index_scalar_type = TREE_TYPE (index_vec_type);
4783       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4784         (index_vec_type);
4785
4786       /* Get an unsigned integer version of the type of the data vector.  */
4787       int scalar_precision
4788         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4789       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4790       tree vectype_unsigned = build_vector_type
4791         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4792
4793       /* First we need to create a vector (ZERO_VEC) of zeros and another
4794          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4795          can create using a MAX reduction and then expanding.
4796          In the case where the loop never made any matches, the max index will
4797          be zero.  */
4798
4799       /* Vector of {0, 0, 0,...}.  */
4800       tree zero_vec = make_ssa_name (vectype);
4801       tree zero_vec_rhs = build_zero_cst (vectype);
4802       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4803       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4804
4805       /* Find maximum value from the vector of found indexes.  */
4806       tree max_index = make_ssa_name (index_scalar_type);
4807       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4808                                                           1, induction_index);
4809       gimple_call_set_lhs (max_index_stmt, max_index);
4810       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4811
4812       /* Vector of {max_index, max_index, max_index,...}.  */
4813       tree max_index_vec = make_ssa_name (index_vec_type);
4814       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4815                                                       max_index);
4816       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4817                                                         max_index_vec_rhs);
4818       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4819
4820       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4821          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4822          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4823          otherwise.  Only one value should match, resulting in a vector
4824          (VEC_COND) with one data value and the rest zeros.
4825          In the case where the loop never made any matches, every index will
4826          match, resulting in a vector with all data values (which will all be
4827          the default value).  */
4828
4829       /* Compare the max index vector to the vector of found indexes to find
4830          the position of the max value.  */
4831       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4832       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4833                                                       induction_index,
4834                                                       max_index_vec);
4835       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4836
4837       /* Use the compare to choose either values from the data vector or
4838          zero.  */
4839       tree vec_cond = make_ssa_name (vectype);
4840       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4841                                                    vec_compare, new_phi_result,
4842                                                    zero_vec);
4843       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4844
4845       /* Finally we need to extract the data value from the vector (VEC_COND)
4846          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4847          reduction, but because this doesn't exist, we can use a MAX reduction
4848          instead.  The data value might be signed or a float so we need to cast
4849          it first.
4850          In the case where the loop never made any matches, the data values are
4851          all identical, and so will reduce down correctly.  */
4852
4853       /* Make the matched data values unsigned.  */
4854       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4855       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4856                                        vec_cond);
4857       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4858                                                         VIEW_CONVERT_EXPR,
4859                                                         vec_cond_cast_rhs);
4860       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4861
4862       /* Reduce down to a scalar value.  */
4863       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4864       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4865                                                            1, vec_cond_cast);
4866       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4867       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4868
4869       /* Convert the reduced value back to the result type and set as the
4870          result.  */
4871       gimple_seq stmts = NULL;
4872       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4873                                data_reduc);
4874       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4875       scalar_results.safe_push (new_temp);
4876     }
4877   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4878            && reduc_fn == IFN_LAST)
4879     {
4880       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
4881          idx = 0;
4882          idx_val = induction_index[0];
4883          val = data_reduc[0];
4884          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4885            if (induction_index[i] > idx_val)
4886              val = data_reduc[i], idx_val = induction_index[i];
4887          return val;  */
4888
4889       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4890       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4891       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4892       unsigned HOST_WIDE_INT v_size
4893         = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4894       tree idx_val = NULL_TREE, val = NULL_TREE;
4895       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4896         {
4897           tree old_idx_val = idx_val;
4898           tree old_val = val;
4899           idx_val = make_ssa_name (idx_eltype);
4900           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4901                                              build3 (BIT_FIELD_REF, idx_eltype,
4902                                                      induction_index,
4903                                                      bitsize_int (el_size),
4904                                                      bitsize_int (off)));
4905           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4906           val = make_ssa_name (data_eltype);
4907           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4908                                              build3 (BIT_FIELD_REF,
4909                                                      data_eltype,
4910                                                      new_phi_result,
4911                                                      bitsize_int (el_size),
4912                                                      bitsize_int (off)));
4913           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4914           if (off != 0)
4915             {
4916               tree new_idx_val = idx_val;
4917               tree new_val = val;
4918               if (off != v_size - el_size)
4919                 {
4920                   new_idx_val = make_ssa_name (idx_eltype);
4921                   epilog_stmt = gimple_build_assign (new_idx_val,
4922                                                      MAX_EXPR, idx_val,
4923                                                      old_idx_val);
4924                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4925                 }
4926               new_val = make_ssa_name (data_eltype);
4927               epilog_stmt = gimple_build_assign (new_val,
4928                                                  COND_EXPR,
4929                                                  build2 (GT_EXPR,
4930                                                          boolean_type_node,
4931                                                          idx_val,
4932                                                          old_idx_val),
4933                                                  val, old_val);
4934               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4935               idx_val = new_idx_val;
4936               val = new_val;
4937             }
4938         }
4939       /* Convert the reduced value back to the result type and set as the
4940          result.  */
4941       gimple_seq stmts = NULL;
4942       val = gimple_convert (&stmts, scalar_type, val);
4943       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4944       scalar_results.safe_push (val);
4945     }
4946
4947   /* 2.3 Create the reduction code, using one of the three schemes described
4948          above. In SLP we simply need to extract all the elements from the
4949          vector (without reducing them), so we use scalar shifts.  */
4950   else if (reduc_fn != IFN_LAST && !slp_reduc)
4951     {
4952       tree tmp;
4953       tree vec_elem_type;
4954
4955       /* Case 1:  Create:
4956          v_out2 = reduc_expr <v_out1>  */
4957
4958       if (dump_enabled_p ())
4959         dump_printf_loc (MSG_NOTE, vect_location,
4960                          "Reduce using direct vector reduction.\n");
4961
4962       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4963       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4964         {
4965           tree tmp_dest
4966             = vect_create_destination_var (scalar_dest, vec_elem_type);
4967           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4968                                                     new_phi_result);
4969           gimple_set_lhs (epilog_stmt, tmp_dest);
4970           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4971           gimple_set_lhs (epilog_stmt, new_temp);
4972           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4973
4974           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4975                                              new_temp);
4976         }
4977       else
4978         {
4979           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4980                                                     new_phi_result);
4981           gimple_set_lhs (epilog_stmt, new_scalar_dest);
4982         }
4983
4984       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4985       gimple_set_lhs (epilog_stmt, new_temp);
4986       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4987
4988       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4989           == INTEGER_INDUC_COND_REDUCTION)
4990         {
4991           /* Earlier we set the initial value to be zero.  Check the result
4992              and if it is zero then replace with the original initial
4993              value.  */
4994           tree zero = build_zero_cst (scalar_type);
4995           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4996
4997           tmp = make_ssa_name (new_scalar_dest);
4998           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4999                                              initial_def, new_temp);
5000           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5001           new_temp = tmp;
5002         }
5003
5004       scalar_results.safe_push (new_temp);
5005     }
5006   else
5007     {
5008       bool reduce_with_shift = have_whole_vector_shift (mode);
5009       int element_bitsize = tree_to_uhwi (bitsize);
5010       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5011       tree vec_temp;
5012
5013       /* COND reductions all do the final reduction with MAX_EXPR.  */
5014       if (code == COND_EXPR)
5015         code = MAX_EXPR;
5016
5017       /* Regardless of whether we have a whole vector shift, if we're
5018          emulating the operation via tree-vect-generic, we don't want
5019          to use it.  Only the first round of the reduction is likely
5020          to still be profitable via emulation.  */
5021       /* ??? It might be better to emit a reduction tree code here, so that
5022          tree-vect-generic can expand the first round via bit tricks.  */
5023       if (!VECTOR_MODE_P (mode))
5024         reduce_with_shift = false;
5025       else
5026         {
5027           optab optab = optab_for_tree_code (code, vectype, optab_default);
5028           if (optab_handler (optab, mode) == CODE_FOR_nothing)
5029             reduce_with_shift = false;
5030         }
5031
5032       if (reduce_with_shift && !slp_reduc)
5033         {
5034           int nelements = vec_size_in_bits / element_bitsize;
5035           auto_vec_perm_indices sel (nelements);
5036
5037           int elt_offset;
5038
5039           tree zero_vec = build_zero_cst (vectype);
5040           /* Case 2: Create:
5041              for (offset = nelements/2; offset >= 1; offset/=2)
5042                 {
5043                   Create:  va' = vec_shift <va, offset>
5044                   Create:  va = vop <va, va'>
5045                 }  */
5046
5047           tree rhs;
5048
5049           if (dump_enabled_p ())
5050             dump_printf_loc (MSG_NOTE, vect_location,
5051                              "Reduce using vector shifts\n");
5052
5053           vec_dest = vect_create_destination_var (scalar_dest, vectype);
5054           new_temp = new_phi_result;
5055           for (elt_offset = nelements / 2;
5056                elt_offset >= 1;
5057                elt_offset /= 2)
5058             {
5059               sel.truncate (0);
5060               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5061               tree mask = vect_gen_perm_mask_any (vectype, sel);
5062               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5063                                                  new_temp, zero_vec, mask);
5064               new_name = make_ssa_name (vec_dest, epilog_stmt);
5065               gimple_assign_set_lhs (epilog_stmt, new_name);
5066               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5067
5068               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5069                                                  new_temp);
5070               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5071               gimple_assign_set_lhs (epilog_stmt, new_temp);
5072               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5073             }
5074
5075           /* 2.4  Extract the final scalar result.  Create:
5076              s_out3 = extract_field <v_out2, bitpos>  */
5077
5078           if (dump_enabled_p ())
5079             dump_printf_loc (MSG_NOTE, vect_location,
5080                              "extract scalar result\n");
5081
5082           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5083                         bitsize, bitsize_zero_node);
5084           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5085           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5086           gimple_assign_set_lhs (epilog_stmt, new_temp);
5087           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5088           scalar_results.safe_push (new_temp);
5089         }
5090       else
5091         {
5092           /* Case 3: Create:
5093              s = extract_field <v_out2, 0>
5094              for (offset = element_size;
5095                   offset < vector_size;
5096                   offset += element_size;)
5097                {
5098                  Create:  s' = extract_field <v_out2, offset>
5099                  Create:  s = op <s, s'>  // For non SLP cases
5100                }  */
5101
5102           if (dump_enabled_p ())
5103             dump_printf_loc (MSG_NOTE, vect_location,
5104                              "Reduce using scalar code.\n");
5105
5106           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5107           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5108             {
5109               int bit_offset;
5110               if (gimple_code (new_phi) == GIMPLE_PHI)
5111                 vec_temp = PHI_RESULT (new_phi);
5112               else
5113                 vec_temp = gimple_assign_lhs (new_phi);
5114               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5115                             bitsize_zero_node);
5116               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5117               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5118               gimple_assign_set_lhs (epilog_stmt, new_temp);
5119               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5120
5121               /* In SLP we don't need to apply reduction operation, so we just
5122                  collect s' values in SCALAR_RESULTS.  */
5123               if (slp_reduc)
5124                 scalar_results.safe_push (new_temp);
5125
5126               for (bit_offset = element_bitsize;
5127                    bit_offset < vec_size_in_bits;
5128                    bit_offset += element_bitsize)
5129                 {
5130                   tree bitpos = bitsize_int (bit_offset);
5131                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5132                                      bitsize, bitpos);
5133
5134                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5135                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5136                   gimple_assign_set_lhs (epilog_stmt, new_name);
5137                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5138
5139                   if (slp_reduc)
5140                     {
5141                       /* In SLP we don't need to apply reduction operation, so
5142                          we just collect s' values in SCALAR_RESULTS.  */
5143                       new_temp = new_name;
5144                       scalar_results.safe_push (new_name);
5145                     }
5146                   else
5147                     {
5148                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5149                                                          new_name, new_temp);
5150                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5151                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5152                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5153                     }
5154                 }
5155             }
5156
5157           /* The only case where we need to reduce scalar results in SLP, is
5158              unrolling.  If the size of SCALAR_RESULTS is greater than
5159              GROUP_SIZE, we reduce them combining elements modulo
5160              GROUP_SIZE.  */
5161           if (slp_reduc)
5162             {
5163               tree res, first_res, new_res;
5164               gimple *new_stmt;
5165
5166               /* Reduce multiple scalar results in case of SLP unrolling.  */
5167               for (j = group_size; scalar_results.iterate (j, &res);
5168                    j++)
5169                 {
5170                   first_res = scalar_results[j % group_size];
5171                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5172                                                   first_res, res);
5173                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5174                   gimple_assign_set_lhs (new_stmt, new_res);
5175                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5176                   scalar_results[j % group_size] = new_res;
5177                 }
5178             }
5179           else
5180             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5181             scalar_results.safe_push (new_temp);
5182         }
5183
5184       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5185           == INTEGER_INDUC_COND_REDUCTION)
5186         {
5187           /* Earlier we set the initial value to be zero.  Check the result
5188              and if it is zero then replace with the original initial
5189              value.  */
5190           tree zero = build_zero_cst (scalar_type);
5191           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
5192
5193           tree tmp = make_ssa_name (new_scalar_dest);
5194           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5195                                              initial_def, new_temp);
5196           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5197           scalar_results[0] = tmp;
5198         }
5199     }
5200
5201 vect_finalize_reduction:
5202
5203   if (double_reduc)
5204     loop = loop->inner;
5205
5206   /* 2.5 Adjust the final result by the initial value of the reduction
5207          variable. (When such adjustment is not needed, then
5208          'adjustment_def' is zero).  For example, if code is PLUS we create:
5209          new_temp = loop_exit_def + adjustment_def  */
5210
5211   if (adjustment_def)
5212     {
5213       gcc_assert (!slp_reduc);
5214       if (nested_in_vect_loop)
5215         {
5216           new_phi = new_phis[0];
5217           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5218           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5219           new_dest = vect_create_destination_var (scalar_dest, vectype);
5220         }
5221       else
5222         {
5223           new_temp = scalar_results[0];
5224           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5225           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5226           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5227         }
5228
5229       epilog_stmt = gimple_build_assign (new_dest, expr);
5230       new_temp = make_ssa_name (new_dest, epilog_stmt);
5231       gimple_assign_set_lhs (epilog_stmt, new_temp);
5232       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5233       if (nested_in_vect_loop)
5234         {
5235           set_vinfo_for_stmt (epilog_stmt,
5236                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5237           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5238                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5239
5240           if (!double_reduc)
5241             scalar_results.quick_push (new_temp);
5242           else
5243             scalar_results[0] = new_temp;
5244         }
5245       else
5246         scalar_results[0] = new_temp;
5247
5248       new_phis[0] = epilog_stmt;
5249     }
5250
5251   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5252           phis with new adjusted scalar results, i.e., replace use <s_out0>
5253           with use <s_out4>.
5254
5255      Transform:
5256         loop_exit:
5257           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5258           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5259           v_out2 = reduce <v_out1>
5260           s_out3 = extract_field <v_out2, 0>
5261           s_out4 = adjust_result <s_out3>
5262           use <s_out0>
5263           use <s_out0>
5264
5265      into:
5266
5267         loop_exit:
5268           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5269           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5270           v_out2 = reduce <v_out1>
5271           s_out3 = extract_field <v_out2, 0>
5272           s_out4 = adjust_result <s_out3>
5273           use <s_out4>
5274           use <s_out4> */
5275
5276
5277   /* In SLP reduction chain we reduce vector results into one vector if
5278      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5279      the last stmt in the reduction chain, since we are looking for the loop
5280      exit phi node.  */
5281   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5282     {
5283       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5284       /* Handle reduction patterns.  */
5285       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5286         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5287
5288       scalar_dest = gimple_assign_lhs (dest_stmt);
5289       group_size = 1;
5290     }
5291
5292   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5293      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5294      need to match SCALAR_RESULTS with corresponding statements.  The first
5295      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5296      the first vector stmt, etc.
5297      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5298   if (group_size > new_phis.length ())
5299     {
5300       ratio = group_size / new_phis.length ();
5301       gcc_assert (!(group_size % new_phis.length ()));
5302     }
5303   else
5304     ratio = 1;
5305
5306   for (k = 0; k < group_size; k++)
5307     {
5308       if (k % ratio == 0)
5309         {
5310           epilog_stmt = new_phis[k / ratio];
5311           reduction_phi = reduction_phis[k / ratio];
5312           if (double_reduc)
5313             inner_phi = inner_phis[k / ratio];
5314         }
5315
5316       if (slp_reduc)
5317         {
5318           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5319
5320           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5321           /* SLP statements can't participate in patterns.  */
5322           gcc_assert (!orig_stmt);
5323           scalar_dest = gimple_assign_lhs (current_stmt);
5324         }
5325
5326       phis.create (3);
5327       /* Find the loop-closed-use at the loop exit of the original scalar
5328          result.  (The reduction result is expected to have two immediate uses -
5329          one at the latch block, and one at the loop exit).  */
5330       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5331         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5332             && !is_gimple_debug (USE_STMT (use_p)))
5333           phis.safe_push (USE_STMT (use_p));
5334
5335       /* While we expect to have found an exit_phi because of loop-closed-ssa
5336          form we can end up without one if the scalar cycle is dead.  */
5337
5338       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5339         {
5340           if (outer_loop)
5341             {
5342               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5343               gphi *vect_phi;
5344
5345               /* FORNOW. Currently not supporting the case that an inner-loop
5346                  reduction is not used in the outer-loop (but only outside the
5347                  outer-loop), unless it is double reduction.  */
5348               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5349                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5350                           || double_reduc);
5351
5352               if (double_reduc)
5353                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5354               else
5355                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5356               if (!double_reduc
5357                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5358                       != vect_double_reduction_def)
5359                 continue;
5360
5361               /* Handle double reduction:
5362
5363                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5364                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5365                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5366                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5367
5368                  At that point the regular reduction (stmt2 and stmt3) is
5369                  already vectorized, as well as the exit phi node, stmt4.
5370                  Here we vectorize the phi node of double reduction, stmt1, and
5371                  update all relevant statements.  */
5372
5373               /* Go through all the uses of s2 to find double reduction phi
5374                  node, i.e., stmt1 above.  */
5375               orig_name = PHI_RESULT (exit_phi);
5376               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5377                 {
5378                   stmt_vec_info use_stmt_vinfo;
5379                   stmt_vec_info new_phi_vinfo;
5380                   tree vect_phi_init, preheader_arg, vect_phi_res;
5381                   basic_block bb = gimple_bb (use_stmt);
5382                   gimple *use;
5383
5384                   /* Check that USE_STMT is really double reduction phi
5385                      node.  */
5386                   if (gimple_code (use_stmt) != GIMPLE_PHI
5387                       || gimple_phi_num_args (use_stmt) != 2
5388                       || bb->loop_father != outer_loop)
5389                     continue;
5390                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5391                   if (!use_stmt_vinfo
5392                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5393                           != vect_double_reduction_def)
5394                     continue;
5395
5396                   /* Create vector phi node for double reduction:
5397                      vs1 = phi <vs0, vs2>
5398                      vs1 was created previously in this function by a call to
5399                        vect_get_vec_def_for_operand and is stored in
5400                        vec_initial_def;
5401                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5402                      vs0 is created here.  */
5403
5404                   /* Create vector phi node.  */
5405                   vect_phi = create_phi_node (vec_initial_def, bb);
5406                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5407                                     loop_vec_info_for_loop (outer_loop));
5408                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5409
5410                   /* Create vs0 - initial def of the double reduction phi.  */
5411                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5412                                              loop_preheader_edge (outer_loop));
5413                   vect_phi_init = get_initial_def_for_reduction
5414                     (stmt, preheader_arg, NULL);
5415
5416                   /* Update phi node arguments with vs0 and vs2.  */
5417                   add_phi_arg (vect_phi, vect_phi_init,
5418                                loop_preheader_edge (outer_loop),
5419                                UNKNOWN_LOCATION);
5420                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5421                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5422                   if (dump_enabled_p ())
5423                     {
5424                       dump_printf_loc (MSG_NOTE, vect_location,
5425                                        "created double reduction phi node: ");
5426                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5427                     }
5428
5429                   vect_phi_res = PHI_RESULT (vect_phi);
5430
5431                   /* Replace the use, i.e., set the correct vs1 in the regular
5432                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5433                      loop is redundant.  */
5434                   use = reduction_phi;
5435                   for (j = 0; j < ncopies; j++)
5436                     {
5437                       edge pr_edge = loop_preheader_edge (loop);
5438                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5439                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5440                     }
5441                 }
5442             }
5443         }
5444
5445       phis.release ();
5446       if (nested_in_vect_loop)
5447         {
5448           if (double_reduc)
5449             loop = outer_loop;
5450           else
5451             continue;
5452         }
5453
5454       phis.create (3);
5455       /* Find the loop-closed-use at the loop exit of the original scalar
5456          result.  (The reduction result is expected to have two immediate uses,
5457          one at the latch block, and one at the loop exit).  For double
5458          reductions we are looking for exit phis of the outer loop.  */
5459       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5460         {
5461           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5462             {
5463               if (!is_gimple_debug (USE_STMT (use_p)))
5464                 phis.safe_push (USE_STMT (use_p));
5465             }
5466           else
5467             {
5468               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5469                 {
5470                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5471
5472                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5473                     {
5474                       if (!flow_bb_inside_loop_p (loop,
5475                                              gimple_bb (USE_STMT (phi_use_p)))
5476                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5477                         phis.safe_push (USE_STMT (phi_use_p));
5478                     }
5479                 }
5480             }
5481         }
5482
5483       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5484         {
5485           /* Replace the uses:  */
5486           orig_name = PHI_RESULT (exit_phi);
5487           scalar_result = scalar_results[k];
5488           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5489             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5490               SET_USE (use_p, scalar_result);
5491         }
5492
5493       phis.release ();
5494     }
5495 }
5496
5497
5498 /* Function is_nonwrapping_integer_induction.
5499
5500    Check if STMT (which is part of loop LOOP) both increments and
5501    does not cause overflow.  */
5502
5503 static bool
5504 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5505 {
5506   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5507   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5508   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5509   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5510   widest_int ni, max_loop_value, lhs_max;
5511   bool overflow = false;
5512
5513   /* Make sure the loop is integer based.  */
5514   if (TREE_CODE (base) != INTEGER_CST
5515       || TREE_CODE (step) != INTEGER_CST)
5516     return false;
5517
5518   /* Check that the induction increments.  */
5519   if (tree_int_cst_sgn (step) == -1)
5520     return false;
5521
5522   /* Check that the max size of the loop will not wrap.  */
5523
5524   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5525     return true;
5526
5527   if (! max_stmt_executions (loop, &ni))
5528     return false;
5529
5530   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5531                             &overflow);
5532   if (overflow)
5533     return false;
5534
5535   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5536                             TYPE_SIGN (lhs_type), &overflow);
5537   if (overflow)
5538     return false;
5539
5540   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5541           <= TYPE_PRECISION (lhs_type));
5542 }
5543
5544 /* Function vectorizable_reduction.
5545
5546    Check if STMT performs a reduction operation that can be vectorized.
5547    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5548    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5549    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5550
5551    This function also handles reduction idioms (patterns) that have been
5552    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5553    of this form:
5554      X = pattern_expr (arg0, arg1, ..., X)
5555    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5556    sequence that had been detected and replaced by the pattern-stmt (STMT).
5557
5558    This function also handles reduction of condition expressions, for example:
5559      for (int i = 0; i < N; i++)
5560        if (a[i] < value)
5561          last = a[i];
5562    This is handled by vectorising the loop and creating an additional vector
5563    containing the loop indexes for which "a[i] < value" was true.  In the
5564    function epilogue this is reduced to a single max value and then used to
5565    index into the vector of results.
5566
5567    In some cases of reduction patterns, the type of the reduction variable X is
5568    different than the type of the other arguments of STMT.
5569    In such cases, the vectype that is used when transforming STMT into a vector
5570    stmt is different than the vectype that is used to determine the
5571    vectorization factor, because it consists of a different number of elements
5572    than the actual number of elements that are being operated upon in parallel.
5573
5574    For example, consider an accumulation of shorts into an int accumulator.
5575    On some targets it's possible to vectorize this pattern operating on 8
5576    shorts at a time (hence, the vectype for purposes of determining the
5577    vectorization factor should be V8HI); on the other hand, the vectype that
5578    is used to create the vector form is actually V4SI (the type of the result).
5579
5580    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5581    indicates what is the actual level of parallelism (V8HI in the example), so
5582    that the right vectorization factor would be derived.  This vectype
5583    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5584    be used to create the vectorized stmt.  The right vectype for the vectorized
5585    stmt is obtained from the type of the result X:
5586         get_vectype_for_scalar_type (TREE_TYPE (X))
5587
5588    This means that, contrary to "regular" reductions (or "regular" stmts in
5589    general), the following equation:
5590       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5591    does *NOT* necessarily hold for reduction patterns.  */
5592
5593 bool
5594 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5595                         gimple **vec_stmt, slp_tree slp_node,
5596                         slp_instance slp_node_instance)
5597 {
5598   tree vec_dest;
5599   tree scalar_dest;
5600   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5601   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5602   tree vectype_in = NULL_TREE;
5603   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5604   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5605   enum tree_code code, orig_code;
5606   internal_fn reduc_fn;
5607   machine_mode vec_mode;
5608   int op_type;
5609   optab optab;
5610   tree new_temp = NULL_TREE;
5611   gimple *def_stmt;
5612   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5613   tree scalar_type;
5614   bool is_simple_use;
5615   gimple *orig_stmt;
5616   stmt_vec_info orig_stmt_info = NULL;
5617   int i;
5618   int ncopies;
5619   int epilog_copies;
5620   stmt_vec_info prev_stmt_info, prev_phi_info;
5621   bool single_defuse_cycle = false;
5622   gimple *new_stmt = NULL;
5623   int j;
5624   tree ops[3];
5625   enum vect_def_type dts[3];
5626   bool nested_cycle = false, found_nested_cycle_def = false;
5627   bool double_reduc = false;
5628   basic_block def_bb;
5629   struct loop * def_stmt_loop, *outer_loop = NULL;
5630   tree def_arg;
5631   gimple *def_arg_stmt;
5632   auto_vec<tree> vec_oprnds0;
5633   auto_vec<tree> vec_oprnds1;
5634   auto_vec<tree> vec_oprnds2;
5635   auto_vec<tree> vect_defs;
5636   auto_vec<gimple *> phis;
5637   int vec_num;
5638   tree def0, tem;
5639   bool first_p = true;
5640   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5641   tree cond_reduc_val = NULL_TREE;
5642
5643   /* Make sure it was already recognized as a reduction computation.  */
5644   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5645       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5646     return false;
5647
5648   if (nested_in_vect_loop_p (loop, stmt))
5649     {
5650       outer_loop = loop;
5651       loop = loop->inner;
5652       nested_cycle = true;
5653     }
5654
5655   /* In case of reduction chain we switch to the first stmt in the chain, but
5656      we don't update STMT_INFO, since only the last stmt is marked as reduction
5657      and has reduction properties.  */
5658   if (GROUP_FIRST_ELEMENT (stmt_info)
5659       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5660     {
5661       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5662       first_p = false;
5663     }
5664
5665   if (gimple_code (stmt) == GIMPLE_PHI)
5666     {
5667       /* Analysis is fully done on the reduction stmt invocation.  */
5668       if (! vec_stmt)
5669         {
5670           if (slp_node)
5671             slp_node_instance->reduc_phis = slp_node;
5672
5673           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5674           return true;
5675         }
5676
5677       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5678       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5679         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5680
5681       gcc_assert (is_gimple_assign (reduc_stmt));
5682       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5683         {
5684           tree op = gimple_op (reduc_stmt, k);
5685           if (op == gimple_phi_result (stmt))
5686             continue;
5687           if (k == 1
5688               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5689             continue;
5690           tem = get_vectype_for_scalar_type (TREE_TYPE (op));
5691           if (! vectype_in
5692               || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
5693             vectype_in = tem;
5694           break;
5695         }
5696       gcc_assert (vectype_in);
5697
5698       if (slp_node)
5699         ncopies = 1;
5700       else
5701         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5702
5703       use_operand_p use_p;
5704       gimple *use_stmt;
5705       if (ncopies > 1
5706           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5707               <= vect_used_only_live)
5708           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5709           && (use_stmt == reduc_stmt
5710               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5711                   == reduc_stmt)))
5712         single_defuse_cycle = true;
5713
5714       /* Create the destination vector  */
5715       scalar_dest = gimple_assign_lhs (reduc_stmt);
5716       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5717
5718       if (slp_node)
5719         /* The size vect_schedule_slp_instance computes is off for us.  */
5720         vec_num = ((LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5721                     * SLP_TREE_SCALAR_STMTS (slp_node).length ())
5722                    / TYPE_VECTOR_SUBPARTS (vectype_in));
5723       else
5724         vec_num = 1;
5725
5726       /* Generate the reduction PHIs upfront.  */
5727       prev_phi_info = NULL;
5728       for (j = 0; j < ncopies; j++)
5729         {
5730           if (j == 0 || !single_defuse_cycle)
5731             {
5732               for (i = 0; i < vec_num; i++)
5733                 {
5734                   /* Create the reduction-phi that defines the reduction
5735                      operand.  */
5736                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
5737                   set_vinfo_for_stmt (new_phi,
5738                                       new_stmt_vec_info (new_phi, loop_vinfo));
5739
5740                   if (slp_node)
5741                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5742                   else
5743                     {
5744                       if (j == 0)
5745                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5746                       else
5747                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5748                       prev_phi_info = vinfo_for_stmt (new_phi);
5749                     }
5750                 }
5751             }
5752         }
5753
5754       return true;
5755     }
5756
5757   /* 1. Is vectorizable reduction?  */
5758   /* Not supportable if the reduction variable is used in the loop, unless
5759      it's a reduction chain.  */
5760   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5761       && !GROUP_FIRST_ELEMENT (stmt_info))
5762     return false;
5763
5764   /* Reductions that are not used even in an enclosing outer-loop,
5765      are expected to be "live" (used out of the loop).  */
5766   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5767       && !STMT_VINFO_LIVE_P (stmt_info))
5768     return false;
5769
5770   /* 2. Has this been recognized as a reduction pattern?
5771
5772      Check if STMT represents a pattern that has been recognized
5773      in earlier analysis stages.  For stmts that represent a pattern,
5774      the STMT_VINFO_RELATED_STMT field records the last stmt in
5775      the original sequence that constitutes the pattern.  */
5776
5777   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5778   if (orig_stmt)
5779     {
5780       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5781       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5782       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5783     }
5784
5785   /* 3. Check the operands of the operation.  The first operands are defined
5786         inside the loop body. The last operand is the reduction variable,
5787         which is defined by the loop-header-phi.  */
5788
5789   gcc_assert (is_gimple_assign (stmt));
5790
5791   /* Flatten RHS.  */
5792   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5793     {
5794     case GIMPLE_BINARY_RHS:
5795       code = gimple_assign_rhs_code (stmt);
5796       op_type = TREE_CODE_LENGTH (code);
5797       gcc_assert (op_type == binary_op);
5798       ops[0] = gimple_assign_rhs1 (stmt);
5799       ops[1] = gimple_assign_rhs2 (stmt);
5800       break;
5801
5802     case GIMPLE_TERNARY_RHS:
5803       code = gimple_assign_rhs_code (stmt);
5804       op_type = TREE_CODE_LENGTH (code);
5805       gcc_assert (op_type == ternary_op);
5806       ops[0] = gimple_assign_rhs1 (stmt);
5807       ops[1] = gimple_assign_rhs2 (stmt);
5808       ops[2] = gimple_assign_rhs3 (stmt);
5809       break;
5810
5811     case GIMPLE_UNARY_RHS:
5812       return false;
5813
5814     default:
5815       gcc_unreachable ();
5816     }
5817
5818   if (code == COND_EXPR && slp_node)
5819     return false;
5820
5821   scalar_dest = gimple_assign_lhs (stmt);
5822   scalar_type = TREE_TYPE (scalar_dest);
5823   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5824       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5825     return false;
5826
5827   /* Do not try to vectorize bit-precision reductions.  */
5828   if (!type_has_mode_precision_p (scalar_type))
5829     return false;
5830
5831   /* All uses but the last are expected to be defined in the loop.
5832      The last use is the reduction variable.  In case of nested cycle this
5833      assumption is not true: we use reduc_index to record the index of the
5834      reduction variable.  */
5835   gimple *reduc_def_stmt = NULL;
5836   int reduc_index = -1;
5837   for (i = 0; i < op_type; i++)
5838     {
5839       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5840       if (i == 0 && code == COND_EXPR)
5841         continue;
5842
5843       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5844                                           &def_stmt, &dts[i], &tem);
5845       dt = dts[i];
5846       gcc_assert (is_simple_use);
5847       if (dt == vect_reduction_def)
5848         {
5849           reduc_def_stmt = def_stmt;
5850           reduc_index = i;
5851           continue;
5852         }
5853       else if (tem)
5854         {
5855           /* To properly compute ncopies we are interested in the widest
5856              input type in case we're looking at a widening accumulation.  */
5857           if (!vectype_in
5858               || TYPE_VECTOR_SUBPARTS (vectype_in) > TYPE_VECTOR_SUBPARTS (tem))
5859             vectype_in = tem;
5860         }
5861
5862       if (dt != vect_internal_def
5863           && dt != vect_external_def
5864           && dt != vect_constant_def
5865           && dt != vect_induction_def
5866           && !(dt == vect_nested_cycle && nested_cycle))
5867         return false;
5868
5869       if (dt == vect_nested_cycle)
5870         {
5871           found_nested_cycle_def = true;
5872           reduc_def_stmt = def_stmt;
5873           reduc_index = i;
5874         }
5875
5876       if (i == 1 && code == COND_EXPR)
5877         {
5878           /* Record how value of COND_EXPR is defined.  */
5879           if (dt == vect_constant_def)
5880             {
5881               cond_reduc_dt = dt;
5882               cond_reduc_val = ops[i];
5883             }
5884           if (dt == vect_induction_def && def_stmt != NULL
5885               && is_nonwrapping_integer_induction (def_stmt, loop))
5886             cond_reduc_dt = dt;
5887         }
5888     }
5889
5890   if (!vectype_in)
5891     vectype_in = vectype_out;
5892
5893   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5894      directly used in stmt.  */
5895   if (reduc_index == -1)
5896     {
5897       if (orig_stmt)
5898         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5899       else
5900         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5901     }
5902
5903   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5904     return false;
5905
5906   if (!(reduc_index == -1
5907         || dts[reduc_index] == vect_reduction_def
5908         || dts[reduc_index] == vect_nested_cycle
5909         || ((dts[reduc_index] == vect_internal_def
5910              || dts[reduc_index] == vect_external_def
5911              || dts[reduc_index] == vect_constant_def
5912              || dts[reduc_index] == vect_induction_def)
5913             && nested_cycle && found_nested_cycle_def)))
5914     {
5915       /* For pattern recognized stmts, orig_stmt might be a reduction,
5916          but some helper statements for the pattern might not, or
5917          might be COND_EXPRs with reduction uses in the condition.  */
5918       gcc_assert (orig_stmt);
5919       return false;
5920     }
5921
5922   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5923   enum vect_reduction_type v_reduc_type
5924     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5925   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5926
5927   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5928   /* If we have a condition reduction, see if we can simplify it further.  */
5929   if (v_reduc_type == COND_REDUCTION)
5930     {
5931       if (cond_reduc_dt == vect_induction_def)
5932         {
5933           if (dump_enabled_p ())
5934             dump_printf_loc (MSG_NOTE, vect_location,
5935                              "condition expression based on "
5936                              "integer induction.\n");
5937           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5938             = INTEGER_INDUC_COND_REDUCTION;
5939         }
5940
5941       /* Loop peeling modifies initial value of reduction PHI, which
5942          makes the reduction stmt to be transformed different to the
5943          original stmt analyzed.  We need to record reduction code for
5944          CONST_COND_REDUCTION type reduction at analyzing stage, thus
5945          it can be used directly at transform stage.  */
5946       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
5947           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
5948         {
5949           /* Also set the reduction type to CONST_COND_REDUCTION.  */
5950           gcc_assert (cond_reduc_dt == vect_constant_def);
5951           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
5952         }
5953       else if (cond_reduc_dt == vect_constant_def)
5954         {
5955           enum vect_def_type cond_initial_dt;
5956           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5957           tree cond_initial_val
5958             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
5959
5960           gcc_assert (cond_reduc_val != NULL_TREE);
5961           vect_is_simple_use (cond_initial_val, loop_vinfo,
5962                               &def_stmt, &cond_initial_dt);
5963           if (cond_initial_dt == vect_constant_def
5964               && types_compatible_p (TREE_TYPE (cond_initial_val),
5965                                      TREE_TYPE (cond_reduc_val)))
5966             {
5967               tree e = fold_binary (LE_EXPR, boolean_type_node,
5968                                     cond_initial_val, cond_reduc_val);
5969               if (e && (integer_onep (e) || integer_zerop (e)))
5970                 {
5971                   if (dump_enabled_p ())
5972                     dump_printf_loc (MSG_NOTE, vect_location,
5973                                      "condition expression based on "
5974                                      "compile time constant.\n");
5975                   /* Record reduction code at analysis stage.  */
5976                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
5977                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
5978                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5979                     = CONST_COND_REDUCTION;
5980                 }
5981             }
5982         }
5983     }
5984
5985   if (orig_stmt)
5986     gcc_assert (tmp == orig_stmt
5987                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5988   else
5989     /* We changed STMT to be the first stmt in reduction chain, hence we
5990        check that in this case the first element in the chain is STMT.  */
5991     gcc_assert (stmt == tmp
5992                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5993
5994   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5995     return false;
5996
5997   if (slp_node)
5998     ncopies = 1;
5999   else
6000     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6001
6002   gcc_assert (ncopies >= 1);
6003
6004   vec_mode = TYPE_MODE (vectype_in);
6005
6006   if (code == COND_EXPR)
6007     {
6008       /* Only call during the analysis stage, otherwise we'll lose
6009          STMT_VINFO_TYPE.  */
6010       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6011                                                 ops[reduc_index], 0, NULL))
6012         {
6013           if (dump_enabled_p ())
6014             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6015                              "unsupported condition in reduction\n");
6016           return false;
6017         }
6018     }
6019   else
6020     {
6021       /* 4. Supportable by target?  */
6022
6023       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6024           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6025         {
6026           /* Shifts and rotates are only supported by vectorizable_shifts,
6027              not vectorizable_reduction.  */
6028           if (dump_enabled_p ())
6029             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6030                              "unsupported shift or rotation.\n");
6031           return false;
6032         }
6033
6034       /* 4.1. check support for the operation in the loop  */
6035       optab = optab_for_tree_code (code, vectype_in, optab_default);
6036       if (!optab)
6037         {
6038           if (dump_enabled_p ())
6039             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6040                              "no optab.\n");
6041
6042           return false;
6043         }
6044
6045       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6046         {
6047           if (dump_enabled_p ())
6048             dump_printf (MSG_NOTE, "op not supported by target.\n");
6049
6050           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6051               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6052             return false;
6053
6054           if (dump_enabled_p ())
6055             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6056         }
6057
6058       /* Worthwhile without SIMD support?  */
6059       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6060           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6061         {
6062           if (dump_enabled_p ())
6063             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6064                              "not worthwhile without SIMD support.\n");
6065
6066           return false;
6067         }
6068     }
6069
6070   /* 4.2. Check support for the epilog operation.
6071
6072           If STMT represents a reduction pattern, then the type of the
6073           reduction variable may be different than the type of the rest
6074           of the arguments.  For example, consider the case of accumulation
6075           of shorts into an int accumulator; The original code:
6076                         S1: int_a = (int) short_a;
6077           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6078
6079           was replaced with:
6080                         STMT: int_acc = widen_sum <short_a, int_acc>
6081
6082           This means that:
6083           1. The tree-code that is used to create the vector operation in the
6084              epilog code (that reduces the partial results) is not the
6085              tree-code of STMT, but is rather the tree-code of the original
6086              stmt from the pattern that STMT is replacing.  I.e, in the example
6087              above we want to use 'widen_sum' in the loop, but 'plus' in the
6088              epilog.
6089           2. The type (mode) we use to check available target support
6090              for the vector operation to be created in the *epilog*, is
6091              determined by the type of the reduction variable (in the example
6092              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6093              However the type (mode) we use to check available target support
6094              for the vector operation to be created *inside the loop*, is
6095              determined by the type of the other arguments to STMT (in the
6096              example we'd check this: optab_handler (widen_sum_optab,
6097              vect_short_mode)).
6098
6099           This is contrary to "regular" reductions, in which the types of all
6100           the arguments are the same as the type of the reduction variable.
6101           For "regular" reductions we can therefore use the same vector type
6102           (and also the same tree-code) when generating the epilog code and
6103           when generating the code inside the loop.  */
6104
6105   if (orig_stmt)
6106     {
6107       /* This is a reduction pattern: get the vectype from the type of the
6108          reduction variable, and get the tree-code from orig_stmt.  */
6109       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6110                   == TREE_CODE_REDUCTION);
6111       orig_code = gimple_assign_rhs_code (orig_stmt);
6112       gcc_assert (vectype_out);
6113       vec_mode = TYPE_MODE (vectype_out);
6114     }
6115   else
6116     {
6117       /* Regular reduction: the same vectype and tree-code as used for
6118          the vector code inside the loop can be used for the epilog code.  */
6119       orig_code = code;
6120
6121       if (code == MINUS_EXPR)
6122         orig_code = PLUS_EXPR;
6123
6124       /* For simple condition reductions, replace with the actual expression
6125          we want to base our reduction around.  */
6126       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6127         {
6128           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6129           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6130         }
6131       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6132                  == INTEGER_INDUC_COND_REDUCTION)
6133         orig_code = MAX_EXPR;
6134     }
6135
6136   if (nested_cycle)
6137     {
6138       def_bb = gimple_bb (reduc_def_stmt);
6139       def_stmt_loop = def_bb->loop_father;
6140       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6141                                        loop_preheader_edge (def_stmt_loop));
6142       if (TREE_CODE (def_arg) == SSA_NAME
6143           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6144           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6145           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6146           && vinfo_for_stmt (def_arg_stmt)
6147           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6148               == vect_double_reduction_def)
6149         double_reduc = true;
6150     }
6151
6152   reduc_fn = IFN_LAST;
6153
6154   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6155     {
6156       if (reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6157         {
6158           if (reduc_fn != IFN_LAST
6159               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6160                                                   OPTIMIZE_FOR_SPEED))
6161             {
6162               if (dump_enabled_p ())
6163                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6164                                  "reduc op not supported by target.\n");
6165
6166               reduc_fn = IFN_LAST;
6167             }
6168         }
6169       else
6170         {
6171           if (!nested_cycle || double_reduc)
6172             {
6173               if (dump_enabled_p ())
6174                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6175                                  "no reduc code for scalar code.\n");
6176
6177               return false;
6178             }
6179         }
6180     }
6181   else
6182     {
6183       int scalar_precision
6184         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6185       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6186       cr_index_vector_type = build_vector_type
6187         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
6188
6189       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6190                                           OPTIMIZE_FOR_SPEED))
6191         reduc_fn = IFN_REDUC_MAX;
6192     }
6193
6194   if ((double_reduc
6195        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6196       && ncopies > 1)
6197     {
6198       if (dump_enabled_p ())
6199         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6200                          "multiple types in double reduction or condition "
6201                          "reduction.\n");
6202       return false;
6203     }
6204
6205   /* In case of widening multiplication by a constant, we update the type
6206      of the constant to be the type of the other operand.  We check that the
6207      constant fits the type in the pattern recognition pass.  */
6208   if (code == DOT_PROD_EXPR
6209       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6210     {
6211       if (TREE_CODE (ops[0]) == INTEGER_CST)
6212         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6213       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6214         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6215       else
6216         {
6217           if (dump_enabled_p ())
6218             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6219                              "invalid types in dot-prod\n");
6220
6221           return false;
6222         }
6223     }
6224
6225   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6226     {
6227       widest_int ni;
6228
6229       if (! max_loop_iterations (loop, &ni))
6230         {
6231           if (dump_enabled_p ())
6232             dump_printf_loc (MSG_NOTE, vect_location,
6233                              "loop count not known, cannot create cond "
6234                              "reduction.\n");
6235           return false;
6236         }
6237       /* Convert backedges to iterations.  */
6238       ni += 1;
6239
6240       /* The additional index will be the same type as the condition.  Check
6241          that the loop can fit into this less one (because we'll use up the
6242          zero slot for when there are no matches).  */
6243       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6244       if (wi::geu_p (ni, wi::to_widest (max_index)))
6245         {
6246           if (dump_enabled_p ())
6247             dump_printf_loc (MSG_NOTE, vect_location,
6248                              "loop size is greater than data size.\n");
6249           return false;
6250         }
6251     }
6252
6253   /* In case the vectorization factor (VF) is bigger than the number
6254      of elements that we can fit in a vectype (nunits), we have to generate
6255      more than one vector stmt - i.e - we need to "unroll" the
6256      vector stmt by a factor VF/nunits.  For more details see documentation
6257      in vectorizable_operation.  */
6258
6259   /* If the reduction is used in an outer loop we need to generate
6260      VF intermediate results, like so (e.g. for ncopies=2):
6261         r0 = phi (init, r0)
6262         r1 = phi (init, r1)
6263         r0 = x0 + r0;
6264         r1 = x1 + r1;
6265     (i.e. we generate VF results in 2 registers).
6266     In this case we have a separate def-use cycle for each copy, and therefore
6267     for each copy we get the vector def for the reduction variable from the
6268     respective phi node created for this copy.
6269
6270     Otherwise (the reduction is unused in the loop nest), we can combine
6271     together intermediate results, like so (e.g. for ncopies=2):
6272         r = phi (init, r)
6273         r = x0 + r;
6274         r = x1 + r;
6275    (i.e. we generate VF/2 results in a single register).
6276    In this case for each copy we get the vector def for the reduction variable
6277    from the vectorized reduction operation generated in the previous iteration.
6278
6279    This only works when we see both the reduction PHI and its only consumer
6280    in vectorizable_reduction and there are no intermediate stmts
6281    participating.  */
6282   use_operand_p use_p;
6283   gimple *use_stmt;
6284   if (ncopies > 1
6285       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6286       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6287       && (use_stmt == stmt
6288           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6289     {
6290       single_defuse_cycle = true;
6291       epilog_copies = 1;
6292     }
6293   else
6294     epilog_copies = ncopies;
6295
6296   /* If the reduction stmt is one of the patterns that have lane
6297      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6298   if ((ncopies > 1
6299        && ! single_defuse_cycle)
6300       && (code == DOT_PROD_EXPR
6301           || code == WIDEN_SUM_EXPR
6302           || code == SAD_EXPR))
6303     {
6304       if (dump_enabled_p ())
6305         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6306                          "multi def-use cycle not possible for lane-reducing "
6307                          "reduction operation\n");
6308       return false;
6309     }
6310
6311   if (!vec_stmt) /* transformation not required.  */
6312     {
6313       if (first_p)
6314         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
6315       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6316       return true;
6317     }
6318
6319   /* Transform.  */
6320
6321   if (dump_enabled_p ())
6322     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6323
6324   /* FORNOW: Multiple types are not supported for condition.  */
6325   if (code == COND_EXPR)
6326     gcc_assert (ncopies == 1);
6327
6328   /* Create the destination vector  */
6329   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6330
6331   prev_stmt_info = NULL;
6332   prev_phi_info = NULL;
6333   if (slp_node)
6334     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6335   else
6336     {
6337       vec_num = 1;
6338       vec_oprnds0.create (1);
6339       vec_oprnds1.create (1);
6340       if (op_type == ternary_op)
6341         vec_oprnds2.create (1);
6342     }
6343
6344   phis.create (vec_num);
6345   vect_defs.create (vec_num);
6346   if (!slp_node)
6347     vect_defs.quick_push (NULL_TREE);
6348
6349   if (slp_node)
6350     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6351   else
6352     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6353
6354   for (j = 0; j < ncopies; j++)
6355     {
6356       if (code == COND_EXPR)
6357         {
6358           gcc_assert (!slp_node);
6359           vectorizable_condition (stmt, gsi, vec_stmt,
6360                                   PHI_RESULT (phis[0]),
6361                                   reduc_index, NULL);
6362           /* Multiple types are not supported for condition.  */
6363           break;
6364         }
6365
6366       /* Handle uses.  */
6367       if (j == 0)
6368         {
6369           if (slp_node)
6370             {
6371               /* Get vec defs for all the operands except the reduction index,
6372                  ensuring the ordering of the ops in the vector is kept.  */
6373               auto_vec<tree, 3> slp_ops;
6374               auto_vec<vec<tree>, 3> vec_defs;
6375
6376               slp_ops.quick_push (ops[0]);
6377               slp_ops.quick_push (ops[1]);
6378               if (op_type == ternary_op)
6379                 slp_ops.quick_push (ops[2]);
6380
6381               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6382
6383               vec_oprnds0.safe_splice (vec_defs[0]);
6384               vec_defs[0].release ();
6385               vec_oprnds1.safe_splice (vec_defs[1]);
6386               vec_defs[1].release ();
6387               if (op_type == ternary_op)
6388                 {
6389                   vec_oprnds2.safe_splice (vec_defs[2]);
6390                   vec_defs[2].release ();
6391                 }
6392             }
6393           else
6394             {
6395               vec_oprnds0.quick_push
6396                 (vect_get_vec_def_for_operand (ops[0], stmt));
6397               vec_oprnds1.quick_push
6398                 (vect_get_vec_def_for_operand (ops[1], stmt));
6399               if (op_type == ternary_op)
6400                 vec_oprnds2.quick_push
6401                   (vect_get_vec_def_for_operand (ops[2], stmt));
6402             }
6403         }
6404       else
6405         {
6406           if (!slp_node)
6407             {
6408               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6409
6410               if (single_defuse_cycle && reduc_index == 0)
6411                 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6412               else
6413                 vec_oprnds0[0]
6414                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6415               if (single_defuse_cycle && reduc_index == 1)
6416                 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6417               else
6418                 vec_oprnds1[0]
6419                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6420               if (op_type == ternary_op)
6421                 {
6422                   if (single_defuse_cycle && reduc_index == 2)
6423                     vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6424                   else
6425                     vec_oprnds2[0]
6426                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6427                 }
6428             }
6429         }
6430
6431       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6432         {
6433           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6434           if (op_type == ternary_op)
6435             vop[2] = vec_oprnds2[i];
6436
6437           new_temp = make_ssa_name (vec_dest, new_stmt);
6438           new_stmt = gimple_build_assign (new_temp, code,
6439                                           vop[0], vop[1], vop[2]);
6440           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6441
6442           if (slp_node)
6443             {
6444               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6445               vect_defs.quick_push (new_temp);
6446             }
6447           else
6448             vect_defs[0] = new_temp;
6449         }
6450
6451       if (slp_node)
6452         continue;
6453
6454       if (j == 0)
6455         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6456       else
6457         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6458
6459       prev_stmt_info = vinfo_for_stmt (new_stmt);
6460     }
6461
6462   /* Finalize the reduction-phi (set its arguments) and create the
6463      epilog reduction code.  */
6464   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6465     vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6466
6467   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6468                                     epilog_copies, reduc_fn, phis,
6469                                     double_reduc, slp_node, slp_node_instance);
6470
6471   return true;
6472 }
6473
6474 /* Function vect_min_worthwhile_factor.
6475
6476    Return the smallest vectorization factor for which it is worthwhile
6477    to use generic vectors for the operation indicated by CODE in a loop
6478    where that operation could be vectorized.  Operations that are not
6479    listed below return INT_MAX.  */
6480 int
6481 vect_min_worthwhile_factor (enum tree_code code)
6482 {
6483   /* Addition, subtraction and negation.  */
6484   if (code == PLUS_EXPR
6485       || code == MINUS_EXPR
6486       || code == NEGATE_EXPR)
6487     return 4;
6488
6489   /* Bitwise AND, IOR, XOR and NOT.  */
6490   if (code == BIT_AND_EXPR
6491       || code == BIT_IOR_EXPR
6492       || code == BIT_XOR_EXPR
6493       || code == BIT_NOT_EXPR)
6494     return 2;
6495
6496   /* Anything else.  */
6497   return INT_MAX;
6498 }
6499
6500 /* Return true only when VINFO describes a loop vectorization whose
6501    vectorization factor is at least the minimum factor for which
6502    decomposing CODE operations into scalar operations is worthwhile.  */
6503
6504 bool
6505 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6506 {
6507   if (loop_vec_info loop_info = dyn_cast <loop_vec_info> (vinfo))
6508     return (LOOP_VINFO_VECT_FACTOR (loop_info)
6509             >= vect_min_worthwhile_factor (code));
6510   return false;
6511 }
6512
6513 /* Function vectorizable_induction
6514
6515    Check if PHI performs an induction computation that can be vectorized.
6516    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6517    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6518    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6519
6520 bool
6521 vectorizable_induction (gimple *phi,
6522                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6523                         gimple **vec_stmt, slp_tree slp_node)
6524 {
6525   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6526   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6527   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6528   unsigned ncopies;
6529   bool nested_in_vect_loop = false;
6530   struct loop *iv_loop;
6531   tree vec_def;
6532   edge pe = loop_preheader_edge (loop);
6533   basic_block new_bb;
6534   tree new_vec, vec_init, vec_step, t;
6535   tree new_name;
6536   gimple *new_stmt;
6537   gphi *induction_phi;
6538   tree induc_def, vec_dest;
6539   tree init_expr, step_expr;
6540   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6541   unsigned i;
6542   tree expr;
6543   gimple_seq stmts;
6544   imm_use_iterator imm_iter;
6545   use_operand_p use_p;
6546   gimple *exit_phi;
6547   edge latch_e;
6548   tree loop_arg;
6549   gimple_stmt_iterator si;
6550   basic_block bb = gimple_bb (phi);
6551
6552   if (gimple_code (phi) != GIMPLE_PHI)
6553     return false;
6554
6555   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6556     return false;
6557
6558   /* Make sure it was recognized as induction computation.  */
6559   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6560     return false;
6561
6562   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6563   unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6564
6565   if (slp_node)
6566     ncopies = 1;
6567   else
6568     ncopies = vect_get_num_copies (loop_vinfo, vectype);
6569   gcc_assert (ncopies >= 1);
6570
6571   /* FORNOW. These restrictions should be relaxed.  */
6572   if (nested_in_vect_loop_p (loop, phi))
6573     {
6574       imm_use_iterator imm_iter;
6575       use_operand_p use_p;
6576       gimple *exit_phi;
6577       edge latch_e;
6578       tree loop_arg;
6579
6580       if (ncopies > 1)
6581         {
6582           if (dump_enabled_p ())
6583             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6584                              "multiple types in nested loop.\n");
6585           return false;
6586         }
6587
6588       /* FORNOW: outer loop induction with SLP not supported.  */
6589       if (STMT_SLP_TYPE (stmt_info))
6590         return false;
6591
6592       exit_phi = NULL;
6593       latch_e = loop_latch_edge (loop->inner);
6594       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6595       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6596         {
6597           gimple *use_stmt = USE_STMT (use_p);
6598           if (is_gimple_debug (use_stmt))
6599             continue;
6600
6601           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6602             {
6603               exit_phi = use_stmt;
6604               break;
6605             }
6606         }
6607       if (exit_phi)
6608         {
6609           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6610           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6611                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6612             {
6613               if (dump_enabled_p ())
6614                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6615                                  "inner-loop induction only used outside "
6616                                  "of the outer vectorized loop.\n");
6617               return false;
6618             }
6619         }
6620
6621       nested_in_vect_loop = true;
6622       iv_loop = loop->inner;
6623     }
6624   else
6625     iv_loop = loop;
6626   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6627
6628   if (!vec_stmt) /* transformation not required.  */
6629     {
6630       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6631       if (dump_enabled_p ())
6632         dump_printf_loc (MSG_NOTE, vect_location,
6633                          "=== vectorizable_induction ===\n");
6634       vect_model_induction_cost (stmt_info, ncopies);
6635       return true;
6636     }
6637
6638   /* Transform.  */
6639
6640   /* Compute a vector variable, initialized with the first VF values of
6641      the induction variable.  E.g., for an iv with IV_PHI='X' and
6642      evolution S, for a vector of 4 units, we want to compute:
6643      [X, X + S, X + 2*S, X + 3*S].  */
6644
6645   if (dump_enabled_p ())
6646     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6647
6648   latch_e = loop_latch_edge (iv_loop);
6649   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6650
6651   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6652   gcc_assert (step_expr != NULL_TREE);
6653
6654   pe = loop_preheader_edge (iv_loop);
6655   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6656                                      loop_preheader_edge (iv_loop));
6657
6658   /* Convert the step to the desired type.  */
6659   stmts = NULL;
6660   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6661   if (stmts)
6662     {
6663       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6664       gcc_assert (!new_bb);
6665     }
6666
6667   /* Find the first insertion point in the BB.  */
6668   si = gsi_after_labels (bb);
6669
6670   /* For SLP induction we have to generate several IVs as for example
6671      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6672      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
6673      [VF*S, VF*S, VF*S, VF*S] for all.  */
6674   if (slp_node)
6675     {
6676       /* Convert the init to the desired type.  */
6677       stmts = NULL;
6678       init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6679       if (stmts)
6680         {
6681           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6682           gcc_assert (!new_bb);
6683         }
6684
6685       /* Generate [VF*S, VF*S, ... ].  */
6686       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6687         {
6688           expr = build_int_cst (integer_type_node, vf);
6689           expr = fold_convert (TREE_TYPE (step_expr), expr);
6690         }
6691       else
6692         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6693       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6694                               expr, step_expr);
6695       if (! CONSTANT_CLASS_P (new_name))
6696         new_name = vect_init_vector (phi, new_name,
6697                                      TREE_TYPE (step_expr), NULL);
6698       new_vec = build_vector_from_val (vectype, new_name);
6699       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6700
6701       /* Now generate the IVs.  */
6702       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6703       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6704       unsigned elts = nunits * nvects;
6705       unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6706       gcc_assert (elts % group_size == 0);
6707       tree elt = init_expr;
6708       unsigned ivn;
6709       for (ivn = 0; ivn < nivs; ++ivn)
6710         {
6711           auto_vec<tree, 32> elts (nunits);
6712           stmts = NULL;
6713           for (unsigned eltn = 0; eltn < nunits; ++eltn)
6714             {
6715               if (ivn*nunits + eltn >= group_size
6716                   && (ivn*nunits + eltn) % group_size == 0)
6717                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6718                                     elt, step_expr);
6719               elts.quick_push (elt);
6720             }
6721           vec_init = gimple_build_vector (&stmts, vectype, elts);
6722           if (stmts)
6723             {
6724               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6725               gcc_assert (!new_bb);
6726             }
6727
6728           /* Create the induction-phi that defines the induction-operand.  */
6729           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6730           induction_phi = create_phi_node (vec_dest, iv_loop->header);
6731           set_vinfo_for_stmt (induction_phi,
6732                               new_stmt_vec_info (induction_phi, loop_vinfo));
6733           induc_def = PHI_RESULT (induction_phi);
6734
6735           /* Create the iv update inside the loop  */
6736           vec_def = make_ssa_name (vec_dest);
6737           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6738           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6739           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6740
6741           /* Set the arguments of the phi node:  */
6742           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6743           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6744                        UNKNOWN_LOCATION);
6745
6746           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6747         }
6748
6749       /* Re-use IVs when we can.  */
6750       if (ivn < nvects)
6751         {
6752           unsigned vfp
6753             = least_common_multiple (group_size, nunits) / group_size;
6754           /* Generate [VF'*S, VF'*S, ... ].  */
6755           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6756             {
6757               expr = build_int_cst (integer_type_node, vfp);
6758               expr = fold_convert (TREE_TYPE (step_expr), expr);
6759             }
6760           else
6761             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6762           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6763                                   expr, step_expr);
6764           if (! CONSTANT_CLASS_P (new_name))
6765             new_name = vect_init_vector (phi, new_name,
6766                                          TREE_TYPE (step_expr), NULL);
6767           new_vec = build_vector_from_val (vectype, new_name);
6768           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6769           for (; ivn < nvects; ++ivn)
6770             {
6771               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6772               tree def;
6773               if (gimple_code (iv) == GIMPLE_PHI)
6774                 def = gimple_phi_result (iv);
6775               else
6776                 def = gimple_assign_lhs (iv);
6777               new_stmt = gimple_build_assign (make_ssa_name (vectype),
6778                                               PLUS_EXPR,
6779                                               def, vec_step);
6780               if (gimple_code (iv) == GIMPLE_PHI)
6781                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6782               else
6783                 {
6784                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6785                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6786                 }
6787               set_vinfo_for_stmt (new_stmt,
6788                                   new_stmt_vec_info (new_stmt, loop_vinfo));
6789               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6790             }
6791         }
6792
6793       return true;
6794     }
6795
6796   /* Create the vector that holds the initial_value of the induction.  */
6797   if (nested_in_vect_loop)
6798     {
6799       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
6800          been created during vectorization of previous stmts.  We obtain it
6801          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
6802       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6803       /* If the initial value is not of proper type, convert it.  */
6804       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6805         {
6806           new_stmt
6807             = gimple_build_assign (vect_get_new_ssa_name (vectype,
6808                                                           vect_simple_var,
6809                                                           "vec_iv_"),
6810                                    VIEW_CONVERT_EXPR,
6811                                    build1 (VIEW_CONVERT_EXPR, vectype,
6812                                            vec_init));
6813           vec_init = gimple_assign_lhs (new_stmt);
6814           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6815                                                  new_stmt);
6816           gcc_assert (!new_bb);
6817           set_vinfo_for_stmt (new_stmt,
6818                               new_stmt_vec_info (new_stmt, loop_vinfo));
6819         }
6820     }
6821   else
6822     {
6823       /* iv_loop is the loop to be vectorized. Create:
6824          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
6825       stmts = NULL;
6826       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6827
6828       auto_vec<tree, 32> elts (nunits);
6829       elts.quick_push (new_name);
6830       for (i = 1; i < nunits; i++)
6831         {
6832           /* Create: new_name_i = new_name + step_expr  */
6833           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6834                                    new_name, step_expr);
6835           elts.quick_push (new_name);
6836         }
6837       /* Create a vector from [new_name_0, new_name_1, ...,
6838          new_name_nunits-1]  */
6839       vec_init = gimple_build_vector (&stmts, vectype, elts);
6840       if (stmts)
6841         {
6842           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6843           gcc_assert (!new_bb);
6844         }
6845     }
6846
6847
6848   /* Create the vector that holds the step of the induction.  */
6849   if (nested_in_vect_loop)
6850     /* iv_loop is nested in the loop to be vectorized. Generate:
6851        vec_step = [S, S, S, S]  */
6852     new_name = step_expr;
6853   else
6854     {
6855       /* iv_loop is the loop to be vectorized. Generate:
6856           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
6857       gimple_seq seq = NULL;
6858       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6859         {
6860           expr = build_int_cst (integer_type_node, vf);
6861           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6862         }
6863       else
6864         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6865       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6866                                expr, step_expr);
6867       if (seq)
6868         {
6869           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6870           gcc_assert (!new_bb);
6871         }
6872     }
6873
6874   t = unshare_expr (new_name);
6875   gcc_assert (CONSTANT_CLASS_P (new_name)
6876               || TREE_CODE (new_name) == SSA_NAME);
6877   new_vec = build_vector_from_val (vectype, t);
6878   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6879
6880
6881   /* Create the following def-use cycle:
6882      loop prolog:
6883          vec_init = ...
6884          vec_step = ...
6885      loop:
6886          vec_iv = PHI <vec_init, vec_loop>
6887          ...
6888          STMT
6889          ...
6890          vec_loop = vec_iv + vec_step;  */
6891
6892   /* Create the induction-phi that defines the induction-operand.  */
6893   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6894   induction_phi = create_phi_node (vec_dest, iv_loop->header);
6895   set_vinfo_for_stmt (induction_phi,
6896                       new_stmt_vec_info (induction_phi, loop_vinfo));
6897   induc_def = PHI_RESULT (induction_phi);
6898
6899   /* Create the iv update inside the loop  */
6900   vec_def = make_ssa_name (vec_dest);
6901   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6902   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6903   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6904
6905   /* Set the arguments of the phi node:  */
6906   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6907   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6908                UNKNOWN_LOCATION);
6909
6910   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
6911
6912   /* In case that vectorization factor (VF) is bigger than the number
6913      of elements that we can fit in a vectype (nunits), we have to generate
6914      more than one vector stmt - i.e - we need to "unroll" the
6915      vector stmt by a factor VF/nunits.  For more details see documentation
6916      in vectorizable_operation.  */
6917
6918   if (ncopies > 1)
6919     {
6920       gimple_seq seq = NULL;
6921       stmt_vec_info prev_stmt_vinfo;
6922       /* FORNOW. This restriction should be relaxed.  */
6923       gcc_assert (!nested_in_vect_loop);
6924
6925       /* Create the vector that holds the step of the induction.  */
6926       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6927         {
6928           expr = build_int_cst (integer_type_node, nunits);
6929           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6930         }
6931       else
6932         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
6933       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6934                                expr, step_expr);
6935       if (seq)
6936         {
6937           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6938           gcc_assert (!new_bb);
6939         }
6940
6941       t = unshare_expr (new_name);
6942       gcc_assert (CONSTANT_CLASS_P (new_name)
6943                   || TREE_CODE (new_name) == SSA_NAME);
6944       new_vec = build_vector_from_val (vectype, t);
6945       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6946
6947       vec_def = induc_def;
6948       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
6949       for (i = 1; i < ncopies; i++)
6950         {
6951           /* vec_i = vec_prev + vec_step  */
6952           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
6953                                           vec_def, vec_step);
6954           vec_def = make_ssa_name (vec_dest, new_stmt);
6955           gimple_assign_set_lhs (new_stmt, vec_def);
6956
6957           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6958           set_vinfo_for_stmt (new_stmt,
6959                               new_stmt_vec_info (new_stmt, loop_vinfo));
6960           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
6961           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
6962         }
6963     }
6964
6965   if (nested_in_vect_loop)
6966     {
6967       /* Find the loop-closed exit-phi of the induction, and record
6968          the final vector of induction results:  */
6969       exit_phi = NULL;
6970       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6971         {
6972           gimple *use_stmt = USE_STMT (use_p);
6973           if (is_gimple_debug (use_stmt))
6974             continue;
6975
6976           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
6977             {
6978               exit_phi = use_stmt;
6979               break;
6980             }
6981         }
6982       if (exit_phi)
6983         {
6984           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
6985           /* FORNOW. Currently not supporting the case that an inner-loop induction
6986              is not used in the outer-loop (i.e. only outside the outer-loop).  */
6987           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
6988                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
6989
6990           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
6991           if (dump_enabled_p ())
6992             {
6993               dump_printf_loc (MSG_NOTE, vect_location,
6994                                "vector of inductions after inner-loop:");
6995               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
6996             }
6997         }
6998     }
6999
7000
7001   if (dump_enabled_p ())
7002     {
7003       dump_printf_loc (MSG_NOTE, vect_location,
7004                        "transform induction: created def-use cycle: ");
7005       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7006       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7007                         SSA_NAME_DEF_STMT (vec_def), 0);
7008     }
7009
7010   return true;
7011 }
7012
7013 /* Function vectorizable_live_operation.
7014
7015    STMT computes a value that is used outside the loop.  Check if
7016    it can be supported.

        STMT must be marked live (asserted).  Returns false if STMT is a
        reduction def or if nested_in_vect_loop_p holds for it, and true
        otherwise.

        VEC_STMT is only tested against NULL here: when it is NULL nothing
        is changed and the function just reports whether STMT is supported.
        When it is non-NULL and STMT is relevant, a BIT_FIELD_REF extracting
        one lane of the vectorized def of STMT is inserted on the single
        exit edge of the loop, and the non-debug uses of STMT's lhs outside
        the loop are redirected to the extracted value.

        SLP_NODE, if non-NULL, is the SLP node holding STMT; SLP_INDEX
        (asserted to be >= 0 in that case) selects the scalar stmt within
        the node.  GSI is unused.  */
7017
7018 bool
7019 vectorizable_live_operation (gimple *stmt,
7020                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7021                              slp_tree slp_node, int slp_index,
7022                              gimple **vec_stmt)
7023 {
7024   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7025   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7026   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7027   imm_use_iterator imm_iter;
7028   tree lhs, lhs_type, bitsize, vec_bitsize;
7029   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7030   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
7031   int ncopies;
7032   gimple *use_stmt;
7033   auto_vec<tree> vec_oprnds;
7034
7035   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7036
       /* Live reduction results are rejected here.  */
7037   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7038     return false;
7039
7040   /* FORNOW.  CHECKME.  */
       /* Live stmts for which nested_in_vect_loop_p holds are not
          supported.  */
7041   if (nested_in_vect_loop_p (loop, stmt))
7042     return false;
7043
7044   /* If STMT is not relevant and it is a simple assignment and its inputs are
7045      invariant then it can remain in place, unvectorized.  The original last
7046      scalar value that it computes will be used.  */
7047   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7048     {
7049       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7050       if (dump_enabled_p ())
7051         dump_printf_loc (MSG_NOTE, vect_location,
7052                          "statement is simple and uses invariant.  Leaving in "
7053                          "place.\n");
7054       return true;
7055     }
7056
       /* With SLP a single copy is assumed here; otherwise use the copy
          count reported by vect_get_num_copies for VECTYPE.  */
7057   if (slp_node)
7058     ncopies = 1;
7059   else
7060     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7061
7062   if (!vec_stmt)
7063     /* No transformation required.  */
7064     return true;
7065
7066   /* If stmt has a related stmt, then use that for getting the lhs.  */
7067   if (is_pattern_stmt_p (stmt_info))
7068     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7069
7070   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7071         : gimple_get_lhs (stmt);
7072   lhs_type = TREE_TYPE (lhs);
7073
       /* BITSIZE is the width in bits of the lane to extract: the element
          precision for boolean vectors, the element's TYPE_SIZE otherwise.
          VEC_BITSIZE is the TYPE_SIZE of the whole vector.  */
7074   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7075              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7076              : TYPE_SIZE (TREE_TYPE (vectype)));
7077   vec_bitsize = TYPE_SIZE (vectype);
7078
7079   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7080   tree vec_lhs, bitstart;
7081   if (slp_node)
7082     {
7083       gcc_assert (slp_index >= 0);
7084
7085       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7086       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7087
7088       /* Get the last occurrence of the scalar index from the concatenation of
7089          all the slp vectors. Calculate which slp vector it is and the index
7090          within.  */
           /* E.g. num_vec = 2, nunits = 4, num_scalar = 3, slp_index = 1
              gives pos = 8 - 3 + 1 = 6, i.e. lane 2 of vector stmt 1.  */
7091       int pos = (num_vec * nunits) - num_scalar + slp_index;
7092       int vec_entry = pos / nunits;
7093       int vec_index = pos % nunits;
7094
7095       /* Get the correct slp vectorized stmt.  */
7096       vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7097
7098       /* Get entry to use.  */
           /* The bit offset of the lane is vec_index * bitsize.  */
7099       bitstart = bitsize_int (vec_index);
7100       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7101     }
7102   else
7103     {
7104       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7105       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7106
7107       /* For multiple copies, get the last copy.  */
7108       for (int i = 1; i < ncopies; ++i)
7109         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7110                                                   vec_lhs);
7111
7112       /* Get the last lane in the vector.  */
           /* Its bit offset is vec_bitsize - bitsize.  */
7113       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7114     }
7115
7116   /* Create a new vectorized stmt for the uses of STMT and insert outside the
7117      loop.  */
       /* The lane is read as the vector element type, except for boolean
          vectors where an integer type of BITSIZE bits is built instead
          (NOTE(review): the final argument 1 presumably requests an
          unsigned type - confirm).  The value read is then converted to
          the type of the scalar lhs and gimplified into STMTS.  */
7118   gimple_seq stmts = NULL;
7119   tree bftype = TREE_TYPE (vectype);
7120   if (VECTOR_BOOLEAN_TYPE_P (vectype))
7121     bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7122   tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7123   new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7124                                    true, NULL_TREE);
       /* Any statements produced are placed on the loop's single exit
          edge.  */
7125   if (stmts)
7126     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7127
7128   /* Replace use of lhs with newly computed result.  If the use stmt is a
7129      single arg PHI, just replace all uses of PHI result.  It's necessary
7130      because lcssa PHI defining lhs may be before newly inserted stmt.  */
       /* Only non-debug uses located outside LOOP are rewritten; uses
          inside the loop keep referring to the scalar lhs.  */
7131   use_operand_p use_p;
7132   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7133     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7134         && !is_gimple_debug (use_stmt))
7135     {
7136       if (gimple_code (use_stmt) == GIMPLE_PHI
7137           && gimple_phi_num_args (use_stmt) == 1)
7138         {
7139           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7140         }
7141       else
7142         {
7143           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7144             SET_USE (use_p, new_tree);
7145         }
7146       update_stmt (use_stmt);
7147     }
7148
7149   return true;
7150 }
7151
7152 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
7153
7154 static void
7155 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7156 {
7157   ssa_op_iter op_iter;
7158   imm_use_iterator imm_iter;
7159   def_operand_p def_p;
7160   gimple *ustmt;
7161
7162   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7163     {
7164       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7165         {
7166           basic_block bb;
7167
7168           if (!is_gimple_debug (ustmt))
7169             continue;
7170
7171           bb = gimple_bb (ustmt);
7172
7173           if (!flow_bb_inside_loop_p (loop, bb))
7174             {
7175               if (gimple_debug_bind_p (ustmt))
7176                 {
7177                   if (dump_enabled_p ())
7178                     dump_printf_loc (MSG_NOTE, vect_location,
7179                                      "killing debug use\n");
7180
7181                   gimple_debug_bind_reset_value (ustmt);
7182                   update_stmt (ustmt);
7183                 }
7184               else
7185                 gcc_unreachable ();
7186             }
7187         }
7188     }
7189 }
7190
7191 /* Given loop represented by LOOP_VINFO, return true if computation of
7192    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7193    otherwise.  */
7194
7195 static bool
7196 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7197 {
7198   /* Constant case.  */
7199   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7200     {
7201       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7202       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7203
7204       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7205       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7206       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7207         return true;
7208     }
7209
7210   widest_int max;
7211   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7212   /* Check the upper bound of loop niters.  */
7213   if (get_max_loop_iterations (loop, &max))
7214     {
7215       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7216       signop sgn = TYPE_SIGN (type);
7217       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7218       if (max < type_max)
7219         return true;
7220     }
7221   return false;
7222 }
7223
7224 /* Scale profiling counters by estimation for LOOP which is vectorized
7225    by factor VF.  */
7226
7227 static void
7228 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7229 {
7230   edge preheader = loop_preheader_edge (loop);
7231   /* Reduce loop iterations by the vectorization factor.  */
7232   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7233   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7234
7235   if (freq_h.nonzero_p ())
7236     {
7237       profile_probability p;
7238
7239       /* Avoid dropping loop body profile counter to 0 because of zero count
7240          in loop's preheader.  */
7241       if (!(freq_e == profile_count::zero ()))
7242         freq_e = freq_e.force_nonzero ();
7243       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7244       scale_loop_frequencies (loop, p);
7245     }
7246
7247   edge exit_e = single_exit (loop);
7248   exit_e->probability = profile_probability::always ()
7249                                  .apply_scale (1, new_est_niter + 1);
7250
7251   edge exit_l = single_pred_edge (loop->latch);
7252   profile_probability prob = exit_l->probability;
7253   exit_l->probability = exit_e->probability.invert ();
7254   if (prob.initialized_p () && exit_l->probability.initialized_p ())
7255     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7256 }
7257
7258 /* Function vect_transform_loop.
7259
7260    The analysis phase has determined that the loop is vectorizable.
7261    Vectorize the loop - created vectorized stmts to replace the scalar
7262    stmts in the loop, and update the loop exit condition.
7263    Returns scalar epilogue loop if any.  */
7264
7265 struct loop *
7266 vect_transform_loop (loop_vec_info loop_vinfo)
7267 {
7268   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7269   struct loop *epilogue = NULL;
7270   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7271   int nbbs = loop->num_nodes;
7272   int i;
7273   tree niters_vector = NULL;
7274   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7275   bool grouped_store;
7276   bool slp_scheduled = false;
7277   gimple *stmt, *pattern_stmt;
7278   gimple_seq pattern_def_seq = NULL;
7279   gimple_stmt_iterator pattern_def_si = gsi_none ();
7280   bool transform_pattern_stmt = false;
7281   bool check_profitability = false;
7282   int th;
7283
7284   if (dump_enabled_p ())
7285     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
7286
7287   /* Use the more conservative vectorization threshold.  If the number
7288      of iterations is constant assume the cost check has been performed
7289      by our caller.  If the threshold makes all loops profitable that
7290      run at least the vectorization factor number of times checking
7291      is pointless, too.  */
7292   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7293   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7294       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7295     {
7296       if (dump_enabled_p ())
7297         dump_printf_loc (MSG_NOTE, vect_location,
7298                          "Profitability threshold is %d loop iterations.\n",
7299                          th);
7300       check_profitability = true;
7301     }
7302
7303   /* Make sure there exists a single-predecessor exit bb.  Do this before
7304      versioning.   */
7305   edge e = single_exit (loop);
7306   if (! single_pred_p (e->dest))
7307     {
7308       split_loop_exit_edge (e);
7309       if (dump_enabled_p ())
7310         dump_printf (MSG_NOTE, "split exit edge\n");
7311     }
7312
7313   /* Version the loop first, if required, so the profitability check
7314      comes first.  */
7315
7316   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7317     {
7318       vect_loop_versioning (loop_vinfo, th, check_profitability);
7319       check_profitability = false;
7320     }
7321
7322   /* Make sure there exists a single-predecessor exit bb also on the
7323      scalar loop copy.  Do this after versioning but before peeling
7324      so CFG structure is fine for both scalar and if-converted loop
7325      to make slpeel_duplicate_current_defs_from_edges face matched
7326      loop closed PHI nodes on the exit.  */
7327   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7328     {
7329       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7330       if (! single_pred_p (e->dest))
7331         {
7332           split_loop_exit_edge (e);
7333           if (dump_enabled_p ())
7334             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7335         }
7336     }
7337
7338   tree niters = vect_build_loop_niters (loop_vinfo);
7339   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7340   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7341   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7342   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
7343                               check_profitability, niters_no_overflow);
7344   if (niters_vector == NULL_TREE)
7345     {
7346       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7347         niters_vector
7348           = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7349                            LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
7350       else
7351         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7352                                      niters_no_overflow);
7353     }
7354
7355   /* 1) Make sure the loop header has exactly two entries
7356      2) Make sure we have a preheader basic block.  */
7357
7358   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7359
7360   split_edge (loop_preheader_edge (loop));
7361
7362   /* FORNOW: the vectorizer supports only loops which body consist
7363      of one basic block (header + empty latch). When the vectorizer will
7364      support more involved loop forms, the order by which the BBs are
7365      traversed need to be reconsidered.  */
7366
7367   for (i = 0; i < nbbs; i++)
7368     {
7369       basic_block bb = bbs[i];
7370       stmt_vec_info stmt_info;
7371
7372       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7373            gsi_next (&si))
7374         {
7375           gphi *phi = si.phi ();
7376           if (dump_enabled_p ())
7377             {
7378               dump_printf_loc (MSG_NOTE, vect_location,
7379                                "------>vectorizing phi: ");
7380               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7381             }
7382           stmt_info = vinfo_for_stmt (phi);
7383           if (!stmt_info)
7384             continue;
7385
7386           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7387             vect_loop_kill_debug_uses (loop, phi);
7388
7389           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7390               && !STMT_VINFO_LIVE_P (stmt_info))
7391             continue;
7392
7393           if (STMT_VINFO_VECTYPE (stmt_info)
7394               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7395                   != (unsigned HOST_WIDE_INT) vf)
7396               && dump_enabled_p ())
7397             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7398
7399           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7400                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7401                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7402               && ! PURE_SLP_STMT (stmt_info))
7403             {
7404               if (dump_enabled_p ())
7405                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7406               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7407             }
7408         }
7409
7410       pattern_stmt = NULL;
7411       for (gimple_stmt_iterator si = gsi_start_bb (bb);
7412            !gsi_end_p (si) || transform_pattern_stmt;)
7413         {
7414           bool is_store;
7415
7416           if (transform_pattern_stmt)
7417             stmt = pattern_stmt;
7418           else
7419             {
7420               stmt = gsi_stmt (si);
7421               /* During vectorization remove existing clobber stmts.  */
7422               if (gimple_clobber_p (stmt))
7423                 {
7424                   unlink_stmt_vdef (stmt);
7425                   gsi_remove (&si, true);
7426                   release_defs (stmt);
7427                   continue;
7428                 }
7429             }
7430
7431           if (dump_enabled_p ())
7432             {
7433               dump_printf_loc (MSG_NOTE, vect_location,
7434                                "------>vectorizing statement: ");
7435               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7436             }
7437
7438           stmt_info = vinfo_for_stmt (stmt);
7439
7440           /* vector stmts created in the outer-loop during vectorization of
7441              stmts in an inner-loop may not have a stmt_info, and do not
7442              need to be vectorized.  */
7443           if (!stmt_info)
7444             {
7445               gsi_next (&si);
7446               continue;
7447             }
7448
7449           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7450             vect_loop_kill_debug_uses (loop, stmt);
7451
7452           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7453               && !STMT_VINFO_LIVE_P (stmt_info))
7454             {
7455               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7456                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7457                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7458                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7459                 {
7460                   stmt = pattern_stmt;
7461                   stmt_info = vinfo_for_stmt (stmt);
7462                 }
7463               else
7464                 {
7465                   gsi_next (&si);
7466                   continue;
7467                 }
7468             }
7469           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7470                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7471                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7472                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7473             transform_pattern_stmt = true;
7474
7475           /* If pattern statement has def stmts, vectorize them too.  */
7476           if (is_pattern_stmt_p (stmt_info))
7477             {
7478               if (pattern_def_seq == NULL)
7479                 {
7480                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7481                   pattern_def_si = gsi_start (pattern_def_seq);
7482                 }
7483               else if (!gsi_end_p (pattern_def_si))
7484                 gsi_next (&pattern_def_si);
7485               if (pattern_def_seq != NULL)
7486                 {
7487                   gimple *pattern_def_stmt = NULL;
7488                   stmt_vec_info pattern_def_stmt_info = NULL;
7489
7490                   while (!gsi_end_p (pattern_def_si))
7491                     {
7492                       pattern_def_stmt = gsi_stmt (pattern_def_si);
7493                       pattern_def_stmt_info
7494                         = vinfo_for_stmt (pattern_def_stmt);
7495                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7496                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7497                         break;
7498                       gsi_next (&pattern_def_si);
7499                     }
7500
7501                   if (!gsi_end_p (pattern_def_si))
7502                     {
7503                       if (dump_enabled_p ())
7504                         {
7505                           dump_printf_loc (MSG_NOTE, vect_location,
7506                                            "==> vectorizing pattern def "
7507                                            "stmt: ");
7508                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7509                                             pattern_def_stmt, 0);
7510                         }
7511
7512                       stmt = pattern_def_stmt;
7513                       stmt_info = pattern_def_stmt_info;
7514                     }
7515                   else
7516                     {
7517                       pattern_def_si = gsi_none ();
7518                       transform_pattern_stmt = false;
7519                     }
7520                 }
7521               else
7522                 transform_pattern_stmt = false;
7523             }
7524
7525           if (STMT_VINFO_VECTYPE (stmt_info))
7526             {
7527               unsigned int nunits
7528                 = (unsigned int)
7529                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7530               if (!STMT_SLP_TYPE (stmt_info)
7531                   && nunits != (unsigned int) vf
7532                   && dump_enabled_p ())
7533                   /* For SLP VF is set according to unrolling factor, and not
7534                      to vector size, hence for SLP this print is not valid.  */
7535                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7536             }
7537
7538           /* SLP. Schedule all the SLP instances when the first SLP stmt is
7539              reached.  */
7540           if (STMT_SLP_TYPE (stmt_info))
7541             {
7542               if (!slp_scheduled)
7543                 {
7544                   slp_scheduled = true;
7545
7546                   if (dump_enabled_p ())
7547                     dump_printf_loc (MSG_NOTE, vect_location,
7548                                      "=== scheduling SLP instances ===\n");
7549
7550                   vect_schedule_slp (loop_vinfo);
7551                 }
7552
7553               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
7554               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7555                 {
7556                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7557                     {
7558                       pattern_def_seq = NULL;
7559                       gsi_next (&si);
7560                     }
7561                   continue;
7562                 }
7563             }
7564
7565           /* -------- vectorize statement ------------ */
7566           if (dump_enabled_p ())
7567             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7568
7569           grouped_store = false;
7570           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7571           if (is_store)
7572             {
7573               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7574                 {
7575                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7576                      interleaving chain was completed - free all the stores in
7577                      the chain.  */
7578                   gsi_next (&si);
7579                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7580                 }
7581               else
7582                 {
7583                   /* Free the attached stmt_vec_info and remove the stmt.  */
7584                   gimple *store = gsi_stmt (si);
7585                   free_stmt_vec_info (store);
7586                   unlink_stmt_vdef (store);
7587                   gsi_remove (&si, true);
7588                   release_defs (store);
7589                 }
7590
7591               /* Stores can only appear at the end of pattern statements.  */
7592               gcc_assert (!transform_pattern_stmt);
7593               pattern_def_seq = NULL;
7594             }
7595           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7596             {
7597               pattern_def_seq = NULL;
7598               gsi_next (&si);
7599             }
7600         }                       /* stmts in BB */
7601     }                           /* BBs in loop */
7602
7603   slpeel_make_loop_iterate_ntimes (loop, niters_vector);
7604
7605   scale_profile_for_vect_loop (loop, vf);
7606
7607   /* The minimum number of iterations performed by the epilogue.  This
7608      is 1 when peeling for gaps because we always need a final scalar
7609      iteration.  */
7610   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7611   /* +1 to convert latch counts to loop iteration counts,
7612      -min_epilogue_iters to remove iterations that cannot be performed
7613        by the vector code.  */
7614   int bias = 1 - min_epilogue_iters;
7615   /* In these calculations the "- 1" converts loop iteration counts
7616      back to latch counts.  */
7617   if (loop->any_upper_bound)
7618     loop->nb_iterations_upper_bound
7619       = wi::udiv_floor (loop->nb_iterations_upper_bound + bias, vf) - 1;
7620   if (loop->any_likely_upper_bound)
7621     loop->nb_iterations_likely_upper_bound
7622       = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, vf) - 1;
7623   if (loop->any_estimate)
7624     loop->nb_iterations_estimate
7625       = wi::udiv_floor (loop->nb_iterations_estimate + bias, vf) - 1;
7626
7627   if (dump_enabled_p ())
7628     {
7629       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7630         {
7631           dump_printf_loc (MSG_NOTE, vect_location,
7632                            "LOOP VECTORIZED\n");
7633           if (loop->inner)
7634             dump_printf_loc (MSG_NOTE, vect_location,
7635                              "OUTER LOOP VECTORIZED\n");
7636           dump_printf (MSG_NOTE, "\n");
7637         }
7638       else
7639         dump_printf_loc (MSG_NOTE, vect_location,
7640                          "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
7641                          current_vector_size);
7642     }
7643
7644   /* Free SLP instances here because otherwise stmt reference counting
7645      won't work.  */
7646   slp_instance instance;
7647   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7648     vect_free_slp_instance (instance);
7649   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7650   /* Clear-up safelen field since its value is invalid after vectorization
7651      since vectorized loop can have loop-carried dependencies.  */
7652   loop->safelen = 0;
7653
7654   /* Don't vectorize epilogue for epilogue.  */
7655   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7656     epilogue = NULL;
7657
7658   if (epilogue)
7659     {
7660         unsigned int vector_sizes
7661           = targetm.vectorize.autovectorize_vector_sizes ();
7662         vector_sizes &= current_vector_size - 1;
7663
7664         if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7665           epilogue = NULL;
7666         else if (!vector_sizes)
7667           epilogue = NULL;
7668         else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7669                  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
7670           {
7671             int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
7672             int ratio = current_vector_size / smallest_vec_size;
7673             int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
7674               - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
7675             eiters = eiters % vf;
7676
7677             epilogue->nb_iterations_upper_bound = eiters - 1;
7678
7679             if (eiters < vf / ratio)
7680               epilogue = NULL;
7681             }
7682     }
7683
7684   if (epilogue)
7685     {
7686       epilogue->force_vectorize = loop->force_vectorize;
7687       epilogue->safelen = loop->safelen;
7688       epilogue->dont_vectorize = false;
7689
7690       /* We may need to if-convert epilogue to vectorize it.  */
7691       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7692         tree_if_conversion (epilogue);
7693     }
7694
7695   return epilogue;
7696 }
7697
7698 /* Revert if-conversion for the masked stores in LOOP: sink each
7699    IFN_MASK_STORE, and where possible the statements producing its stored
7700    value, into a new block that is skipped when the mask is all zeros.
7701    For example,
7702      for (i=0; i<n; i++)
7703        if (c[i])
7704         {
7705           p1[i] += 1;
7706           p2[i] = p3[i] +2;
7707         }
7708    this transformation will produce the following semi-hammock:
7709
7710    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7711      {
7712        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7713        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7714        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7715        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7716        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7717        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7718      }
7719 */
7720
7721 void
7722 optimize_mask_stores (struct loop *loop)
7723 {
7724   basic_block *bbs = get_loop_body (loop);
7725   unsigned nbbs = loop->num_nodes;
7726   unsigned i;
7727   basic_block bb;
7728   struct loop *bb_loop;
7729   gimple_stmt_iterator gsi;
7730   gimple *stmt;
7731   auto_vec<gimple *> worklist;
7732
7733   vect_location = find_loop_location (loop);
7734   /* Collect every IFN_MASK_STORE call in LOOP's blocks into WORKLIST.  */
7735   for (i = 0; i < nbbs; i++)
7736     {
7737       bb = bbs[i];
7738       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7739            gsi_next (&gsi))
7740         {
7741           stmt = gsi_stmt (gsi);
7742           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7743             worklist.safe_push (stmt);
7744         }
7745     }
7746
7747   free (bbs);
7748   if (worklist.is_empty ())
7749     return;
7750
7751   /* Loop has masked stores.  Each iteration sinks one run of them.  */
7752   while (!worklist.is_empty ())
7753     {
7754       gimple *last, *last_store;
7755       edge e, efalse;
7756       tree mask;
7757       basic_block store_bb, join_bb;
7758       gimple_stmt_iterator gsi_to;
7759       tree vdef, new_vdef;
7760       gphi *phi;
7761       tree vectype;
7762       tree zero;
7763
7764       last = worklist.pop ();
7765       mask = gimple_call_arg (last, 2);
7766       bb = gimple_bb (last);
7767       /* Create STORE_BB and the if-then structure in the CFG.  STORE_BB
7768          belongs to the same loop as BB, which can differ from LOOP when a
7769          two-level loop nest is vectorized and the masked store belongs to
7770          the inner one.  */
7771       e = split_block (bb, last);
7772       bb_loop = bb->loop_father;
7773       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7774       join_bb = e->dest;
7775       store_bb = create_empty_bb (bb);
7776       add_bb_to_loop (store_bb, bb_loop);
7777       e->flags = EDGE_TRUE_VALUE;
7778       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7779       /* Mark edge to STORE_BB unlikely.  NOTE(review): confirm intent.  */
7780       efalse->probability = profile_probability::unlikely ();
7781       store_bb->count = efalse->count ();
7782       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7783       if (dom_info_available_p (CDI_DOMINATORS))
7784         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7785       if (dump_enabled_p ())
7786         dump_printf_loc (MSG_NOTE, vect_location,
7787                          "Create new block %d to sink mask stores.",
7788                          store_bb->index);
7789       /* End BB with "if (MASK == 0)"; true edge E skips STORE_BB.  */
7790       vectype = TREE_TYPE (mask);
7791       zero = build_zero_cst (vectype);
7792       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7793       gsi = gsi_last_bb (bb);
7794       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7795       /* Create new PHI node for vdef of the last masked store:
7796          .MEM_2 = VDEF <.MEM_1>
7797          will be converted to
7798          .MEM_3 = VDEF <.MEM_1>
7799          and new PHI node will be created in join bb
7800          .MEM_2 = PHI <.MEM_1, .MEM_3>
7801       */
7802       vdef = gimple_vdef (last);
7803       new_vdef = make_ssa_name (gimple_vop (cfun), last);
7804       gimple_set_vdef (last, new_vdef);
7805       phi = create_phi_node (vdef, join_bb);
7806       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7807
7808       /* Put all masked stores with the same mask to STORE_BB if possible.  */
7809       while (true)
7810         {
7811           gimple_stmt_iterator gsi_from;
7812           gimple *stmt1 = NULL;
7813
7814           /* Move masked store LAST to the start of STORE_BB.  */
7815           last_store = last;
7816           gsi = gsi_for_stmt (last);
7817           gsi_from = gsi;
7818           /* Shift GSI to the previous stmt for further traversal.  */
7819           gsi_prev (&gsi);
7820           gsi_to = gsi_start_bb (store_bb);
7821           gsi_move_before (&gsi_from, &gsi_to);
7822           /* Set up GSI_TO to the start of the now non-empty block.  */
7823           gsi_to = gsi_start_bb (store_bb);
7824           if (dump_enabled_p ())
7825             {
7826               dump_printf_loc (MSG_NOTE, vect_location,
7827                                "Move stmt to created bb\n");
7828               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7829             }
7830           /* Walk backwards from the store, moving stored value producers.  */
7831           while (!gsi_end_p (gsi))
7832             {
7833               tree lhs;
7834               imm_use_iterator imm_iter;
7835               use_operand_p use_p;
7836               bool res;
7837
7838               /* Skip debug statements.  */
7839               if (is_gimple_debug (gsi_stmt (gsi)))
7840                 {
7841                   gsi_prev (&gsi);
7842                   continue;
7843                 }
7844               stmt1 = gsi_stmt (gsi);
7845               /* Do not consider statements writing to memory or having
7846                  volatile operand.  */
7847               if (gimple_vdef (stmt1)
7848                   || gimple_has_volatile_ops (stmt1))
7849                 break;
7850               gsi_from = gsi;
7851               gsi_prev (&gsi);
7852               lhs = gimple_get_lhs (stmt1);
7853               if (!lhs)
7854                 break;
7855
7856               /* LHS of vectorized stmt must be SSA_NAME.  */
7857               if (TREE_CODE (lhs) != SSA_NAME)
7858                 break;
7859
7860               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7861                 {
7862                   /* Remove dead scalar statement.  */
7863                   if (has_zero_uses (lhs))
7864                     {
7865                       gsi_remove (&gsi_from, true);
7866                       continue;
7867                     }
7868                 }
7869
7870               /* Check that LHS does not have uses outside of STORE_BB.  */
7871               res = true;
7872               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7873                 {
7874                   gimple *use_stmt;
7875                   use_stmt = USE_STMT (use_p);
7876                   if (is_gimple_debug (use_stmt))
7877                     continue;
7878                   if (gimple_bb (use_stmt) != store_bb)
7879                     {
7880                       res = false;
7881                       break;
7882                     }
7883                 }
7884               if (!res)
7885                 break;
7886
7887               if (gimple_vuse (stmt1)
7888                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
7889                 break;  /* VUSE differs from LAST_STORE's.  */
7890
7891               /* Can move STMT1 to STORE_BB.  */
7892               if (dump_enabled_p ())
7893                 {
7894                   dump_printf_loc (MSG_NOTE, vect_location,
7895                                    "Move stmt to created bb\n");
7896                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7897                 }
7898               gsi_move_before (&gsi_from, &gsi_to);
7899               /* Shift GSI_TO for further insertion.  */
7900               gsi_prev (&gsi_to);
7901             }
7902           /* Go on only if STMT1 is the next same-mask store in WORKLIST.  */
7903           if (worklist.is_empty ()
7904               || gimple_call_arg (worklist.last (), 2) != mask
7905               || worklist.last () != stmt1)
7906             break;
7907           last = worklist.pop ();
7908         }
7909       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
7910     }
7911 }