PR middle-end/83164
gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
55 /* Loop Vectorization Pass.
57 This pass tries to vectorize loops.
59 For example, the vectorizer transforms the following simple loop:
61 short a[N]; short b[N]; short c[N]; int i;
63 for (i=0; i<N; i++){
64 a[i] = b[i] + c[i];
67 as if it had been manually vectorized by rewriting the source code into:
69 typedef int __attribute__((mode(V8HI))) v8hi;
70 short a[N]; short b[N]; short c[N]; int i;
71 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
72 v8hi va, vb, vc;
74 for (i=0; i<N/8; i++){
75 vb = pb[i];
76 vc = pc[i];
77 va = vb + vc;
78 pa[i] = va;
81 The main entry to this pass is vectorize_loops(), in which
82 the vectorizer applies a set of analyses on a given set of loops,
83 followed by the actual vectorization transformation for the loops that
84 had successfully passed the analysis phase.
85 Throughout this pass we make a distinction between two types of
86 data: scalars (which are represented by SSA_NAMES), and memory references
87 ("data-refs"). These two types of data require different handling both
88 during analysis and transformation. The types of data-refs that the
89 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
90 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
91 accesses are required to have a simple (consecutive) access pattern.
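For instance (an editorial illustration, not part of the original comment;
the array names and the bound N are placeholders), the first loop below
uses only consecutive accesses, while the second uses a strided load that
is not a simple consecutive pattern:

	for (i = 0; i < N; i++)
	  b[i] = a[i];
	for (i = 0; i < N; i++)
	  b[i] = a[2*i];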
93 Analysis phase:
94 ===============
95 The driver for the analysis phase is vect_analyze_loop().
96 It applies a set of analyses, some of which rely on the scalar evolution
97 analyzer (scev) developed by Sebastian Pop.
99 During the analysis phase the vectorizer records some information
100 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
101 loop, as well as general information about the loop as a whole, which is
102 recorded in a "loop_vec_info" struct attached to each loop.
104 Transformation phase:
105 =====================
106 The loop transformation phase scans all the stmts in the loop, and
107 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
108 the loop that needs to be vectorized. It inserts the vector code sequence
109 just before the scalar stmt S, and records a pointer to the vector code
110 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
111 attached to S). This pointer will be used for the vectorization of following
112 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
113 otherwise, we rely on dead code elimination for removing it.
115 For example, say stmt S1 was vectorized into stmt VS1:
117 VS1: vb = px[i];
118 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
119 S2: a = b;
121 To vectorize stmt S2, the vectorizer first finds the stmt that defines
122 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
123 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
124 resulting sequence would be:
126 VS1: vb = px[i];
127 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
128 VS2: va = vb;
129 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
131 Operands that are not SSA_NAMEs are data-refs that appear in
132 load/store operations (like 'x[i]' in S1), and are handled differently.
134 Target modeling:
135 =================
136 Currently the only target specific information that is used is the
137 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
138 Targets that can support different sizes of vectors will, for now, need
139 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
140 flexibility will be added in the future.
142 Since we only vectorize operations whose vector form can be
143 expressed using existing tree codes, to verify that an operation is
144 supported, the vectorizer checks the relevant optab at the relevant
145 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
146 the value found is CODE_FOR_nothing, then there's no target support, and
147 we can't vectorize the stmt.
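As an editorial sketch only, reusing the add_optab/V8HImode example from
above, such a support check has the shape:

	if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
	  return false;

i.e. a CODE_FOR_nothing handler means the stmt cannot be vectorized.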
149 For additional information on this project see:
150 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
153 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
155 /* Function vect_determine_vectorization_factor
157 Determine the vectorization factor (VF). VF is the number of data elements
158 that are operated upon in parallel in a single iteration of the vectorized
159 loop. For example, when vectorizing a loop that operates on 4-byte elements,
160 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
161 elements can fit in a single vector register.
163 We currently support vectorization of loops in which all types operated upon
164 are of the same size. Therefore this function currently sets VF according to
165 the size of the types operated upon, and fails if there are multiple sizes
166 in the loop.
168 VF is also the factor by which the loop iterations are strip-mined, e.g.:
169 original loop:
170 for (i=0; i<N; i++){
171 a[i] = b[i] + c[i];
174 vectorized loop:
175 for (i=0; i<N; i+=VF){
176 a[i:VF] = b[i:VF] + c[i:VF];
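As an editorial illustration only, for VF == 4 the strip-mined loop
corresponds to the following scalar-equivalent C, with a remainder loop
for the iterations left over when N is not a multiple of VF:

	for (i = 0; i + 4 <= N; i += 4)
	  {
	    a[i]   = b[i]   + c[i];
	    a[i+1] = b[i+1] + c[i+1];
	    a[i+2] = b[i+2] + c[i+2];
	    a[i+3] = b[i+3] + c[i+3];
	  }
	for (; i < N; i++)
	  a[i] = b[i] + c[i];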
180 static bool
181 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
183 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
184 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
185 unsigned nbbs = loop->num_nodes;
186 unsigned int vectorization_factor = 0;
187 tree scalar_type = NULL_TREE;
188 gphi *phi;
189 tree vectype;
190 unsigned int nunits;
191 stmt_vec_info stmt_info;
192 unsigned i;
193 HOST_WIDE_INT dummy;
194 gimple *stmt, *pattern_stmt = NULL;
195 gimple_seq pattern_def_seq = NULL;
196 gimple_stmt_iterator pattern_def_si = gsi_none ();
197 bool analyze_pattern_stmt = false;
198 bool bool_result;
199 auto_vec<stmt_vec_info> mask_producers;
201 if (dump_enabled_p ())
202 dump_printf_loc (MSG_NOTE, vect_location,
203 "=== vect_determine_vectorization_factor ===\n");
205 for (i = 0; i < nbbs; i++)
207 basic_block bb = bbs[i];
209 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
210 gsi_next (&si))
212 phi = si.phi ();
213 stmt_info = vinfo_for_stmt (phi);
214 if (dump_enabled_p ())
216 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
217 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
220 gcc_assert (stmt_info);
222 if (STMT_VINFO_RELEVANT_P (stmt_info)
223 || STMT_VINFO_LIVE_P (stmt_info))
225 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
226 scalar_type = TREE_TYPE (PHI_RESULT (phi));
228 if (dump_enabled_p ())
230 dump_printf_loc (MSG_NOTE, vect_location,
231 "get vectype for scalar type: ");
232 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
233 dump_printf (MSG_NOTE, "\n");
236 vectype = get_vectype_for_scalar_type (scalar_type);
237 if (!vectype)
239 if (dump_enabled_p ())
241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
242 "not vectorized: unsupported "
243 "data-type ");
244 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
245 scalar_type);
246 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
248 return false;
250 STMT_VINFO_VECTYPE (stmt_info) = vectype;
252 if (dump_enabled_p ())
254 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
255 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
256 dump_printf (MSG_NOTE, "\n");
259 nunits = TYPE_VECTOR_SUBPARTS (vectype);
260 if (dump_enabled_p ())
261 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
262 nunits);
264 if (!vectorization_factor
265 || (nunits > vectorization_factor))
266 vectorization_factor = nunits;
270 for (gimple_stmt_iterator si = gsi_start_bb (bb);
271 !gsi_end_p (si) || analyze_pattern_stmt;)
273 tree vf_vectype;
275 if (analyze_pattern_stmt)
276 stmt = pattern_stmt;
277 else
278 stmt = gsi_stmt (si);
280 stmt_info = vinfo_for_stmt (stmt);
282 if (dump_enabled_p ())
284 dump_printf_loc (MSG_NOTE, vect_location,
285 "==> examining statement: ");
286 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
289 gcc_assert (stmt_info);
291 /* Skip stmts which do not need to be vectorized. */
292 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
293 && !STMT_VINFO_LIVE_P (stmt_info))
294 || gimple_clobber_p (stmt))
296 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
297 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
298 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
299 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
301 stmt = pattern_stmt;
302 stmt_info = vinfo_for_stmt (pattern_stmt);
303 if (dump_enabled_p ())
305 dump_printf_loc (MSG_NOTE, vect_location,
306 "==> examining pattern statement: ");
307 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
310 else
312 if (dump_enabled_p ())
313 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
314 gsi_next (&si);
315 continue;
318 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
319 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
320 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
321 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
322 analyze_pattern_stmt = true;
324 /* If a pattern statement has def stmts, analyze them too. */
325 if (is_pattern_stmt_p (stmt_info))
327 if (pattern_def_seq == NULL)
329 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
330 pattern_def_si = gsi_start (pattern_def_seq);
332 else if (!gsi_end_p (pattern_def_si))
333 gsi_next (&pattern_def_si);
334 if (pattern_def_seq != NULL)
336 gimple *pattern_def_stmt = NULL;
337 stmt_vec_info pattern_def_stmt_info = NULL;
339 while (!gsi_end_p (pattern_def_si))
341 pattern_def_stmt = gsi_stmt (pattern_def_si);
342 pattern_def_stmt_info
343 = vinfo_for_stmt (pattern_def_stmt);
344 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
345 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
346 break;
347 gsi_next (&pattern_def_si);
350 if (!gsi_end_p (pattern_def_si))
352 if (dump_enabled_p ())
354 dump_printf_loc (MSG_NOTE, vect_location,
355 "==> examining pattern def stmt: ");
356 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
357 pattern_def_stmt, 0);
360 stmt = pattern_def_stmt;
361 stmt_info = pattern_def_stmt_info;
363 else
365 pattern_def_si = gsi_none ();
366 analyze_pattern_stmt = false;
369 else
370 analyze_pattern_stmt = false;
373 if (gimple_get_lhs (stmt) == NULL_TREE
374 /* MASK_STORE has no lhs, but is ok. */
375 && (!is_gimple_call (stmt)
376 || !gimple_call_internal_p (stmt)
377 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
379 if (is_gimple_call (stmt))
381 /* Ignore calls with no lhs. These must be calls to
382 #pragma omp simd functions, and what vectorization factor
383 it really needs can't be determined until
384 vectorizable_simd_clone_call. */
385 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
387 pattern_def_seq = NULL;
388 gsi_next (&si);
390 continue;
392 if (dump_enabled_p ())
394 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
395 "not vectorized: irregular stmt.");
396 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
399 return false;
402 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
404 if (dump_enabled_p ())
406 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
407 "not vectorized: vector stmt in loop:");
408 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
410 return false;
413 bool_result = false;
415 if (STMT_VINFO_VECTYPE (stmt_info))
417 /* The only case when a vectype has already been set is for stmts
418 that contain a dataref, or for "pattern-stmts" (stmts
419 generated by the vectorizer to represent/replace a certain
420 idiom). */
421 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
422 || is_pattern_stmt_p (stmt_info)
423 || !gsi_end_p (pattern_def_si));
424 vectype = STMT_VINFO_VECTYPE (stmt_info);
426 else
428 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
429 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
430 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
431 else
432 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
434 /* Bool ops don't participate in vectorization factor
435 computation. For comparisons, use the compared types to
436 compute a factor. */
437 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
438 && is_gimple_assign (stmt)
439 && gimple_assign_rhs_code (stmt) != COND_EXPR)
441 if (STMT_VINFO_RELEVANT_P (stmt_info)
442 || STMT_VINFO_LIVE_P (stmt_info))
443 mask_producers.safe_push (stmt_info);
444 bool_result = true;
446 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
447 == tcc_comparison
448 && !VECT_SCALAR_BOOLEAN_TYPE_P
449 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
450 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
451 else
453 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
455 pattern_def_seq = NULL;
456 gsi_next (&si);
458 continue;
462 if (dump_enabled_p ())
464 dump_printf_loc (MSG_NOTE, vect_location,
465 "get vectype for scalar type: ");
466 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
467 dump_printf (MSG_NOTE, "\n");
469 vectype = get_vectype_for_scalar_type (scalar_type);
470 if (!vectype)
472 if (dump_enabled_p ())
474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
475 "not vectorized: unsupported "
476 "data-type ");
477 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
478 scalar_type);
479 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
481 return false;
484 if (!bool_result)
485 STMT_VINFO_VECTYPE (stmt_info) = vectype;
487 if (dump_enabled_p ())
489 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
490 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
491 dump_printf (MSG_NOTE, "\n");
495 /* Don't try to compute the VF from scalar types if the stmt
496 produces a boolean vector. Use the result vectype instead. */
497 if (VECTOR_BOOLEAN_TYPE_P (vectype))
498 vf_vectype = vectype;
499 else
501 /* The vectorization factor is according to the smallest
502 scalar type (or the largest vector size, but we only
503 support one vector size per loop). */
504 if (!bool_result)
505 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
506 &dummy);
507 if (dump_enabled_p ())
509 dump_printf_loc (MSG_NOTE, vect_location,
510 "get vectype for scalar type: ");
511 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
512 dump_printf (MSG_NOTE, "\n");
514 vf_vectype = get_vectype_for_scalar_type (scalar_type);
516 if (!vf_vectype)
518 if (dump_enabled_p ())
520 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
521 "not vectorized: unsupported data-type ");
522 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
523 scalar_type);
524 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
526 return false;
529 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
530 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
532 if (dump_enabled_p ())
534 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
535 "not vectorized: different sized vector "
536 "types in statement, ");
537 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
538 vectype);
539 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
540 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
541 vf_vectype);
542 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
544 return false;
547 if (dump_enabled_p ())
549 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
550 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
551 dump_printf (MSG_NOTE, "\n");
554 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
555 if (dump_enabled_p ())
556 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
557 if (!vectorization_factor
558 || (nunits > vectorization_factor))
559 vectorization_factor = nunits;
561 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
563 pattern_def_seq = NULL;
564 gsi_next (&si);
569 /* TODO: Analyze cost. Decide if worth while to vectorize. */
570 if (dump_enabled_p ())
571 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
572 vectorization_factor);
573 if (vectorization_factor <= 1)
575 if (dump_enabled_p ())
576 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
577 "not vectorized: unsupported data-type\n");
578 return false;
580 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
582 for (i = 0; i < mask_producers.length (); i++)
584 tree mask_type = NULL;
586 stmt = STMT_VINFO_STMT (mask_producers[i]);
588 if (is_gimple_assign (stmt)
589 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
590 && !VECT_SCALAR_BOOLEAN_TYPE_P
591 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
593 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
594 mask_type = get_mask_type_for_scalar_type (scalar_type);
596 if (!mask_type)
598 if (dump_enabled_p ())
599 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
600 "not vectorized: unsupported mask\n");
601 return false;
604 else
606 tree rhs;
607 ssa_op_iter iter;
608 gimple *def_stmt;
609 enum vect_def_type dt;
611 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
613 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
614 &def_stmt, &dt, &vectype))
616 if (dump_enabled_p ())
618 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
619 "not vectorized: can't compute mask type "
620 "for statement, ");
621 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
624 return false;
627 /* No vectype probably means external definition.
628 Allow it in case there is another operand which
629 allows us to determine the mask type. */
630 if (!vectype)
631 continue;
633 if (!mask_type)
634 mask_type = vectype;
635 else if (TYPE_VECTOR_SUBPARTS (mask_type)
636 != TYPE_VECTOR_SUBPARTS (vectype))
638 if (dump_enabled_p ())
640 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
641 "not vectorized: different sized masks "
642 "types in statement, ");
643 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
644 mask_type);
645 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
646 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
647 vectype);
648 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
650 return false;
652 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
653 != VECTOR_BOOLEAN_TYPE_P (vectype))
655 if (dump_enabled_p ())
657 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
658 "not vectorized: mixed mask and "
659 "nonmask vector types in statement, ");
660 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
661 mask_type);
662 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
663 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
664 vectype);
665 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
667 return false;
671 /* We may compare a boolean value loaded as a vector of integers.
672 Fix mask_type in such a case. */
673 if (mask_type
674 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
675 && gimple_code (stmt) == GIMPLE_ASSIGN
676 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
677 mask_type = build_same_sized_truth_vector_type (mask_type);
680 /* No mask_type should mean loop invariant predicate.
681 This is probably a subject for optimization in
682 if-conversion. */
683 if (!mask_type)
685 if (dump_enabled_p ())
687 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
688 "not vectorized: can't compute mask type "
689 "for statement, ");
690 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
693 return false;
696 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
699 return true;
703 /* Function vect_is_simple_iv_evolution.
705 FORNOW: A simple evolution of an induction variable in the loop is
706 considered a polynomial evolution. */
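/* Editorial illustration, not part of the original sources: for

	for (i = 0; i < n; i++)
	  a[i] = 0;

   the scalar evolution of 'i' is printed as {0, +, 1}_1 - start at 0,
   add 1 on every iteration of loop number 1 (the loop number is only an
   assumed example).  Such a constant-step evolution is "simple" in the
   sense checked below, whereas a step that itself varies inside the
   loop (a nested chrec) is rejected.  */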
708 static bool
709 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
710 tree * step)
712 tree init_expr;
713 tree step_expr;
714 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
715 basic_block bb;
717 /* When there is no evolution in this loop, the evolution function
718 is not "simple". */
719 if (evolution_part == NULL_TREE)
720 return false;
722 /* When the evolution is a polynomial of degree >= 2
723 the evolution function is not "simple". */
724 if (tree_is_chrec (evolution_part))
725 return false;
727 step_expr = evolution_part;
728 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
730 if (dump_enabled_p ())
732 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
733 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
734 dump_printf (MSG_NOTE, ", init: ");
735 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
736 dump_printf (MSG_NOTE, "\n");
739 *init = init_expr;
740 *step = step_expr;
742 if (TREE_CODE (step_expr) != INTEGER_CST
743 && (TREE_CODE (step_expr) != SSA_NAME
744 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
745 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
746 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
747 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
748 || !flag_associative_math)))
749 && (TREE_CODE (step_expr) != REAL_CST
750 || !flag_associative_math))
752 if (dump_enabled_p ())
753 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
754 "step unknown.\n");
755 return false;
758 return true;
761 /* Function vect_analyze_scalar_cycles_1.
763 Examine the cross iteration def-use cycles of scalar variables
764 in LOOP. LOOP_VINFO represents the loop that is now being
765 considered for vectorization (can be LOOP, or an outer-loop
766 enclosing LOOP). */
768 static void
769 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
771 basic_block bb = loop->header;
772 tree init, step;
773 auto_vec<gimple *, 64> worklist;
774 gphi_iterator gsi;
775 bool double_reduc;
777 if (dump_enabled_p ())
778 dump_printf_loc (MSG_NOTE, vect_location,
779 "=== vect_analyze_scalar_cycles ===\n");
781 /* First - identify all inductions. Reduction detection assumes that all the
782 inductions have been identified, therefore, this order must not be
783 changed. */
784 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
786 gphi *phi = gsi.phi ();
787 tree access_fn = NULL;
788 tree def = PHI_RESULT (phi);
789 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
791 if (dump_enabled_p ())
793 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
794 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
797 /* Skip virtual phi's. The data dependences that are associated with
798 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
799 if (virtual_operand_p (def))
800 continue;
802 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
804 /* Analyze the evolution function. */
805 access_fn = analyze_scalar_evolution (loop, def);
806 if (access_fn)
808 STRIP_NOPS (access_fn);
809 if (dump_enabled_p ())
811 dump_printf_loc (MSG_NOTE, vect_location,
812 "Access function of PHI: ");
813 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
814 dump_printf (MSG_NOTE, "\n");
816 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
817 = initial_condition_in_loop_num (access_fn, loop->num);
818 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
819 = evolution_part_in_loop_num (access_fn, loop->num);
822 if (!access_fn
823 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
824 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
825 && TREE_CODE (step) != INTEGER_CST))
827 worklist.safe_push (phi);
828 continue;
831 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
832 != NULL_TREE);
833 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
835 if (dump_enabled_p ())
836 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
837 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
841 /* Second - identify all reductions and nested cycles. */
842 while (worklist.length () > 0)
844 gimple *phi = worklist.pop ();
845 tree def = PHI_RESULT (phi);
846 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
847 gimple *reduc_stmt;
849 if (dump_enabled_p ())
851 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
852 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
855 gcc_assert (!virtual_operand_p (def)
856 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
858 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
859 &double_reduc, false);
860 if (reduc_stmt)
862 if (double_reduc)
864 if (dump_enabled_p ())
865 dump_printf_loc (MSG_NOTE, vect_location,
866 "Detected double reduction.\n");
868 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
869 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
870 vect_double_reduction_def;
872 else
874 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
876 if (dump_enabled_p ())
877 dump_printf_loc (MSG_NOTE, vect_location,
878 "Detected vectorizable nested cycle.\n");
880 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
881 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
882 vect_nested_cycle;
884 else
886 if (dump_enabled_p ())
887 dump_printf_loc (MSG_NOTE, vect_location,
888 "Detected reduction.\n");
890 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
891 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
892 vect_reduction_def;
893 /* Store the reduction cycles for possible vectorization in
894 loop-aware SLP if it was not detected as reduction
895 chain. */
896 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
897 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
901 else
902 if (dump_enabled_p ())
903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
904 "Unknown def-use cycle pattern.\n");
909 /* Function vect_analyze_scalar_cycles.
911 Examine the cross iteration def-use cycles of scalar variables, by
912 analyzing the loop-header PHIs of scalar variables. Classify each
913 cycle as one of the following: invariant, induction, reduction, unknown.
914 We do that for the loop represented by LOOP_VINFO, and also for its
915 inner-loop, if it exists.
916 Examples for scalar cycles:
918 Example1: reduction:
920 loop1:
921 for (i=0; i<N; i++)
922 sum += a[i];
924 Example2: induction:
926 loop2:
927 for (i=0; i<N; i++)
928 a[i] = i; */
930 static void
931 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
933 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
935 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
937 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
938 Reductions in such inner-loop therefore have different properties than
939 the reductions in the nest that gets vectorized:
940 1. When vectorized, they are executed in the same order as in the original
941 scalar loop, so we can't change the order of computation when
942 vectorizing them.
943 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
944 current checks are too strict. */
946 if (loop->inner)
947 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
950 /* Transfer group and reduction information from STMT to its pattern stmt. */
952 static void
953 vect_fixup_reduc_chain (gimple *stmt)
955 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
956 gimple *stmtp;
957 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
958 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
959 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
962 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
963 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
964 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
965 if (stmt)
966 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
967 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
969 while (stmt);
970 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
973 /* Fixup scalar cycles that now have their stmts detected as patterns. */
975 static void
976 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
978 gimple *first;
979 unsigned i;
981 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
982 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
984 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
985 while (next)
987 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
988 break;
989 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
991 /* If not all stmts in the chain are patterns, try to handle
992 the chain without patterns. */
993 if (! next)
995 vect_fixup_reduc_chain (first);
996 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
997 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1002 /* Function vect_get_loop_niters.
1004 Determine how many iterations the loop executes and place the count
1005 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1006 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1007 niter information holds in ASSUMPTIONS.
1009 Return the loop exit condition. */
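/* Editorial illustration, not part of the original sources: for a loop
   such as

	for (i = 0; i < n; i++)
	  ...

   with n > 0, the latch is executed n - 1 times, so NUMBER_OF_ITERATIONSM1
   is n - 1 while NUMBER_OF_ITERATIONS (the number of header executions)
   is n; the "+ 1" adjustment happens near the end of this function.  */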
1012 static gcond *
1013 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1014 tree *number_of_iterations, tree *number_of_iterationsm1)
1016 edge exit = single_exit (loop);
1017 struct tree_niter_desc niter_desc;
1018 tree niter_assumptions, niter, may_be_zero;
1019 gcond *cond = get_loop_exit_condition (loop);
1021 *assumptions = boolean_true_node;
1022 *number_of_iterationsm1 = chrec_dont_know;
1023 *number_of_iterations = chrec_dont_know;
1024 if (dump_enabled_p ())
1025 dump_printf_loc (MSG_NOTE, vect_location,
1026 "=== get_loop_niters ===\n");
1028 if (!exit)
1029 return cond;
1031 niter = chrec_dont_know;
1032 may_be_zero = NULL_TREE;
1033 niter_assumptions = boolean_true_node;
1034 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1035 || chrec_contains_undetermined (niter_desc.niter))
1036 return cond;
1038 niter_assumptions = niter_desc.assumptions;
1039 may_be_zero = niter_desc.may_be_zero;
1040 niter = niter_desc.niter;
1042 if (may_be_zero && integer_zerop (may_be_zero))
1043 may_be_zero = NULL_TREE;
1045 if (may_be_zero)
1047 if (COMPARISON_CLASS_P (may_be_zero))
1049 /* Try to combine may_be_zero with assumptions, this can simplify
1050 computation of niter expression. */
1051 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1052 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1053 niter_assumptions,
1054 fold_build1 (TRUTH_NOT_EXPR,
1055 boolean_type_node,
1056 may_be_zero));
1057 else
1058 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1059 build_int_cst (TREE_TYPE (niter), 0), niter);
1061 may_be_zero = NULL_TREE;
1063 else if (integer_nonzerop (may_be_zero))
1065 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1066 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1067 return cond;
1069 else
1070 return cond;
1073 *assumptions = niter_assumptions;
1074 *number_of_iterationsm1 = niter;
1076 /* We want the number of loop header executions which is the number
1077 of latch executions plus one.
1078 ??? For UINT_MAX latch executions this number overflows to zero
1079 for loops like do { n++; } while (n != 0); */
1080 if (niter && !chrec_contains_undetermined (niter))
1081 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1082 build_int_cst (TREE_TYPE (niter), 1));
1083 *number_of_iterations = niter;
1085 return cond;
1088 /* Function bb_in_loop_p
1090 Used as predicate for dfs order traversal of the loop bbs. */
1092 static bool
1093 bb_in_loop_p (const_basic_block bb, const void *data)
1095 const struct loop *const loop = (const struct loop *)data;
1096 if (flow_bb_inside_loop_p (loop, bb))
1097 return true;
1098 return false;
1102 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1103 stmt_vec_info structs for all the stmts in LOOP_IN. */
1105 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1106 : vec_info (vec_info::loop, init_cost (loop_in)),
1107 loop (loop_in),
1108 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1109 num_itersm1 (NULL_TREE),
1110 num_iters (NULL_TREE),
1111 num_iters_unchanged (NULL_TREE),
1112 num_iters_assumptions (NULL_TREE),
1113 th (0),
1114 vectorization_factor (0),
1115 max_vectorization_factor (0),
1116 unaligned_dr (NULL),
1117 peeling_for_alignment (0),
1118 ptr_mask (0),
1119 slp_unrolling_factor (1),
1120 single_scalar_iteration_cost (0),
1121 vectorizable (false),
1122 peeling_for_gaps (false),
1123 peeling_for_niter (false),
1124 operands_swapped (false),
1125 no_data_dependencies (false),
1126 has_mask_store (false),
1127 scalar_loop (NULL),
1128 orig_loop_info (NULL)
1130 /* Create/Update stmt_info for all stmts in the loop. */
1131 basic_block *body = get_loop_body (loop);
1132 for (unsigned int i = 0; i < loop->num_nodes; i++)
1134 basic_block bb = body[i];
1135 gimple_stmt_iterator si;
1137 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1139 gimple *phi = gsi_stmt (si);
1140 gimple_set_uid (phi, 0);
1141 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1144 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1146 gimple *stmt = gsi_stmt (si);
1147 gimple_set_uid (stmt, 0);
1148 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1151 free (body);
1153 /* CHECKME: We want to visit all BBs before their successors (except for
1154 latch blocks, for which this assertion wouldn't hold). In the simple
1155 case of the loop forms we allow, a dfs order of the BBs would be the same
1156 as reversed postorder traversal, so we are safe. */
1158 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1159 bbs, loop->num_nodes, loop);
1160 gcc_assert (nbbs == loop->num_nodes);
1164 /* Free all memory used by the _loop_vec_info, as well as all the
1165 stmt_vec_info structs of all the stmts in the loop. */
1167 _loop_vec_info::~_loop_vec_info ()
1169 int nbbs;
1170 gimple_stmt_iterator si;
1171 int j;
1173 nbbs = loop->num_nodes;
1174 for (j = 0; j < nbbs; j++)
1176 basic_block bb = bbs[j];
1177 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1178 free_stmt_vec_info (gsi_stmt (si));
1180 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1182 gimple *stmt = gsi_stmt (si);
1184 /* We may have broken canonical form by moving a constant
1185 into RHS1 of a commutative op. Fix such occurrences. */
1186 if (operands_swapped && is_gimple_assign (stmt))
1188 enum tree_code code = gimple_assign_rhs_code (stmt);
1190 if ((code == PLUS_EXPR
1191 || code == POINTER_PLUS_EXPR
1192 || code == MULT_EXPR)
1193 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1194 swap_ssa_operands (stmt,
1195 gimple_assign_rhs1_ptr (stmt),
1196 gimple_assign_rhs2_ptr (stmt));
1197 else if (code == COND_EXPR
1198 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1200 tree cond_expr = gimple_assign_rhs1 (stmt);
1201 enum tree_code cond_code = TREE_CODE (cond_expr);
1203 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1205 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1206 0));
1207 cond_code = invert_tree_comparison (cond_code,
1208 honor_nans);
1209 if (cond_code != ERROR_MARK)
1211 TREE_SET_CODE (cond_expr, cond_code);
1212 swap_ssa_operands (stmt,
1213 gimple_assign_rhs2_ptr (stmt),
1214 gimple_assign_rhs3_ptr (stmt));
1220 /* Free stmt_vec_info. */
1221 free_stmt_vec_info (stmt);
1222 gsi_next (&si);
1226 free (bbs);
1228 loop->aux = NULL;
1232 /* Calculate the cost of one scalar iteration of the loop. */
1233 static void
1234 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1236 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1237 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1238 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1239 int innerloop_iters, i;
1241 /* Count statements in the scalar loop. Use this as the scalar cost of a
1242 single iteration for now.
1244 TODO: Add outer loop support.
1246 TODO: Consider assigning different costs to different scalar
1247 statements. */
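/* Editorial illustration, with assumed weights: a body containing one
   load, one arithmetic statement and one store contributes one
   scalar_load, one scalar_stmt and one scalar_store per iteration, each
   scaled by FACTOR (50 for statements inside the inner loop, 1
   otherwise, per the FIXME below).  */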
1249 /* FORNOW. */
1250 innerloop_iters = 1;
1251 if (loop->inner)
1252 innerloop_iters = 50; /* FIXME */
1254 for (i = 0; i < nbbs; i++)
1256 gimple_stmt_iterator si;
1257 basic_block bb = bbs[i];
1259 if (bb->loop_father == loop->inner)
1260 factor = innerloop_iters;
1261 else
1262 factor = 1;
1264 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1266 gimple *stmt = gsi_stmt (si);
1267 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1269 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1270 continue;
1272 /* Skip stmts that are not vectorized inside the loop. */
1273 if (stmt_info
1274 && !STMT_VINFO_RELEVANT_P (stmt_info)
1275 && (!STMT_VINFO_LIVE_P (stmt_info)
1276 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1277 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1278 continue;
1280 vect_cost_for_stmt kind;
1281 if (STMT_VINFO_DATA_REF (stmt_info))
1283 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1284 kind = scalar_load;
1285 else
1286 kind = scalar_store;
1288 else
1289 kind = scalar_stmt;
1291 scalar_single_iter_cost
1292 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1293 factor, kind, stmt_info, 0, vect_prologue);
1296 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1297 = scalar_single_iter_cost;
1301 /* Function vect_analyze_loop_form_1.
1303 Verify that certain CFG restrictions hold, including:
1304 - the loop has a pre-header
1305 - the loop has a single entry and exit
1306 - the loop exit condition is simple enough
1307 - the number of iterations can be analyzed, i.e., a countable loop. The
1308 niter may be analyzable only under some assumptions. */
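/* Editorial illustration, not part of the original sources: a loop like

	for (i = 0; i < n; i++)
	  a[i] = b[i];

   is countable - its iteration count can be expressed in terms of n -
   whereas a pointer-chasing loop like

	while (p)
	  p = p->next;

   has no computable iteration count and fails the niter analysis used
   below.  */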
1310 bool
1311 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1312 tree *assumptions, tree *number_of_iterationsm1,
1313 tree *number_of_iterations, gcond **inner_loop_cond)
1315 if (dump_enabled_p ())
1316 dump_printf_loc (MSG_NOTE, vect_location,
1317 "=== vect_analyze_loop_form ===\n");
1319 /* Different restrictions apply when we are considering an inner-most loop,
1320 vs. an outer (nested) loop.
1321 (FORNOW. May want to relax some of these restrictions in the future). */
1323 if (!loop->inner)
1325 /* Inner-most loop. We currently require that the number of BBs is
1326 exactly 2 (the header and latch). Vectorizable inner-most loops
1327 look like this:
1329 (pre-header)
1331 header <--------+
1332 | | |
1333 | +--> latch --+
1335 (exit-bb) */
1337 if (loop->num_nodes != 2)
1339 if (dump_enabled_p ())
1340 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1341 "not vectorized: control flow in loop.\n");
1342 return false;
1345 if (empty_block_p (loop->header))
1347 if (dump_enabled_p ())
1348 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1349 "not vectorized: empty loop.\n");
1350 return false;
1353 else
1355 struct loop *innerloop = loop->inner;
1356 edge entryedge;
1358 /* Nested loop. We currently require that the loop is doubly-nested,
1359 contains a single inner loop, and the number of BBs is exactly 5.
1360 Vectorizable outer-loops look like this:
1362 (pre-header)
1364 header <---+
1366 inner-loop |
1368 tail ------+
1370 (exit-bb)
1372 The inner-loop has the properties expected of inner-most loops
1373 as described above. */
1375 if ((loop->inner)->inner || (loop->inner)->next)
1377 if (dump_enabled_p ())
1378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1379 "not vectorized: multiple nested loops.\n");
1380 return false;
1383 if (loop->num_nodes != 5)
1385 if (dump_enabled_p ())
1386 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1387 "not vectorized: control flow in loop.\n");
1388 return false;
1391 entryedge = loop_preheader_edge (innerloop);
1392 if (entryedge->src != loop->header
1393 || !single_exit (innerloop)
1394 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1396 if (dump_enabled_p ())
1397 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1398 "not vectorized: unsupported outerloop form.\n");
1399 return false;
1402 /* Analyze the inner-loop. */
1403 tree inner_niterm1, inner_niter, inner_assumptions;
1404 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1405 &inner_assumptions, &inner_niterm1,
1406 &inner_niter, NULL)
1407 /* Don't support analyzing niter under assumptions for inner
1408 loop. */
1409 || !integer_onep (inner_assumptions))
1411 if (dump_enabled_p ())
1412 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1413 "not vectorized: Bad inner loop.\n");
1414 return false;
1417 if (!expr_invariant_in_loop_p (loop, inner_niter))
1419 if (dump_enabled_p ())
1420 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1421 "not vectorized: inner-loop count not"
1422 " invariant.\n");
1423 return false;
1426 if (dump_enabled_p ())
1427 dump_printf_loc (MSG_NOTE, vect_location,
1428 "Considering outer-loop vectorization.\n");
1431 if (!single_exit (loop)
1432 || EDGE_COUNT (loop->header->preds) != 2)
1434 if (dump_enabled_p ())
1436 if (!single_exit (loop))
1437 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1438 "not vectorized: multiple exits.\n");
1439 else if (EDGE_COUNT (loop->header->preds) != 2)
1440 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1441 "not vectorized: too many incoming edges.\n");
1443 return false;
1446 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1447 that the loop is represented as a do-while (with a proper if-guard
1448 before the loop if needed), where the loop header contains all the
1449 executable statements, and the latch is empty. */
1450 if (!empty_block_p (loop->latch)
1451 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1453 if (dump_enabled_p ())
1454 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1455 "not vectorized: latch block not empty.\n");
1456 return false;
1459 /* Make sure the exit is not abnormal. */
1460 edge e = single_exit (loop);
1461 if (e->flags & EDGE_ABNORMAL)
1463 if (dump_enabled_p ())
1464 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1465 "not vectorized: abnormal loop exit edge.\n");
1466 return false;
1469 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1470 number_of_iterationsm1);
1471 if (!*loop_cond)
1473 if (dump_enabled_p ())
1474 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1475 "not vectorized: complicated exit condition.\n");
1476 return false;
1479 if (integer_zerop (*assumptions)
1480 || !*number_of_iterations
1481 || chrec_contains_undetermined (*number_of_iterations))
1483 if (dump_enabled_p ())
1484 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1485 "not vectorized: number of iterations cannot be "
1486 "computed.\n");
1487 return false;
1490 if (integer_zerop (*number_of_iterations))
1492 if (dump_enabled_p ())
1493 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1494 "not vectorized: number of iterations = 0.\n");
1495 return false;
1498 return true;
1501 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1503 loop_vec_info
1504 vect_analyze_loop_form (struct loop *loop)
1506 tree assumptions, number_of_iterations, number_of_iterationsm1;
1507 gcond *loop_cond, *inner_loop_cond = NULL;
1509 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1510 &assumptions, &number_of_iterationsm1,
1511 &number_of_iterations, &inner_loop_cond))
1512 return NULL;
1514 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1515 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1516 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1517 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1518 if (!integer_onep (assumptions))
1520 /* We consider vectorizing this loop by versioning it under
1521 some assumptions. In order to do this, we need to clear
1522 existing information computed by scev and niter analyzer. */
1523 scev_reset_htab ();
1524 free_numbers_of_iterations_estimates (loop);
1525 /* Also set flag for this loop so that following scev and niter
1526 analysis are done under the assumptions. */
1527 loop_constraint_set (loop, LOOP_C_FINITE);
1528 /* Also record the assumptions for versioning. */
1529 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1532 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1534 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_NOTE, vect_location,
1537 "Symbolic number of iterations is ");
1538 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1539 dump_printf (MSG_NOTE, "\n");
1543 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1544 if (inner_loop_cond)
1545 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1546 = loop_exit_ctrl_vec_info_type;
1548 gcc_assert (!loop->aux);
1549 loop->aux = loop_vinfo;
1550 return loop_vinfo;
1555 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
1556 statements, update the vectorization factor. */
1558 static void
1559 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1561 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1562 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1563 int nbbs = loop->num_nodes;
1564 unsigned int vectorization_factor;
1565 int i;
1567 if (dump_enabled_p ())
1568 dump_printf_loc (MSG_NOTE, vect_location,
1569 "=== vect_update_vf_for_slp ===\n");
1571 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1572 gcc_assert (vectorization_factor != 0);
1574 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1575 vectorization factor of the loop is the unrolling factor required by
1576 the SLP instances. If that unrolling factor is 1, we say that we
1577 perform pure SLP on the loop - cross-iteration parallelism is not
1578 exploited. */
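/* Editorial worked example, with assumed numbers: if the vectorization
   factor computed earlier is 4 and the SLP instances require an
   unrolling factor of 2, the mixed SLP/non-SLP case below keeps
   VF = lcm (4, 2) = 4; an SLP unrolling factor of 8 would instead give
   VF = lcm (4, 8) = 8.  */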
1579 bool only_slp_in_loop = true;
1580 for (i = 0; i < nbbs; i++)
1582 basic_block bb = bbs[i];
1583 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1584 gsi_next (&si))
1586 gimple *stmt = gsi_stmt (si);
1587 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1588 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1589 && STMT_VINFO_RELATED_STMT (stmt_info))
1591 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1592 stmt_info = vinfo_for_stmt (stmt);
1594 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1595 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1596 && !PURE_SLP_STMT (stmt_info))
1597 /* STMT needs both SLP and loop-based vectorization. */
1598 only_slp_in_loop = false;
1602 if (only_slp_in_loop)
1604 dump_printf_loc (MSG_NOTE, vect_location,
1605 "Loop contains only SLP stmts\n");
1606 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1608 else
1610 dump_printf_loc (MSG_NOTE, vect_location,
1611 "Loop contains SLP and non-SLP stmts\n");
1612 vectorization_factor
1613 = least_common_multiple (vectorization_factor,
1614 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1617 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1618 if (dump_enabled_p ())
1619 dump_printf_loc (MSG_NOTE, vect_location,
1620 "Updating vectorization factor to %d\n",
1621 vectorization_factor);
1624 /* Function vect_analyze_loop_operations.
1626 Scan the loop stmts and make sure they are all vectorizable. */
1628 static bool
1629 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1631 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1632 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1633 int nbbs = loop->num_nodes;
1634 int i;
1635 stmt_vec_info stmt_info;
1636 bool need_to_vectorize = false;
1637 bool ok;
1639 if (dump_enabled_p ())
1640 dump_printf_loc (MSG_NOTE, vect_location,
1641 "=== vect_analyze_loop_operations ===\n");
1643 for (i = 0; i < nbbs; i++)
1645 basic_block bb = bbs[i];
1647 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1648 gsi_next (&si))
1650 gphi *phi = si.phi ();
1651 ok = true;
1653 stmt_info = vinfo_for_stmt (phi);
1654 if (dump_enabled_p ())
1656 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1657 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1659 if (virtual_operand_p (gimple_phi_result (phi)))
1660 continue;
1662 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1663 (i.e., a phi in the tail of the outer-loop). */
1664 if (! is_loop_header_bb_p (bb))
1666 /* FORNOW: we currently don't support the case that these phis
1667 are not used in the outerloop (unless it is double reduction,
1668 i.e., this phi is vect_reduction_def), because this case
1669 requires us to actually do something here. */
1670 if (STMT_VINFO_LIVE_P (stmt_info)
1671 && STMT_VINFO_DEF_TYPE (stmt_info)
1672 != vect_double_reduction_def)
1674 if (dump_enabled_p ())
1675 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1676 "Unsupported loop-closed phi in "
1677 "outer-loop.\n");
1678 return false;
1681 /* If PHI is used in the outer loop, we check that its operand
1682 is defined in the inner loop. */
1683 if (STMT_VINFO_RELEVANT_P (stmt_info))
1685 tree phi_op;
1686 gimple *op_def_stmt;
1688 if (gimple_phi_num_args (phi) != 1)
1689 return false;
1691 phi_op = PHI_ARG_DEF (phi, 0);
1692 if (TREE_CODE (phi_op) != SSA_NAME)
1693 return false;
1695 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1696 if (gimple_nop_p (op_def_stmt)
1697 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1698 || !vinfo_for_stmt (op_def_stmt))
1699 return false;
1701 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1702 != vect_used_in_outer
1703 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1704 != vect_used_in_outer_by_reduction)
1705 return false;
1708 continue;
1711 gcc_assert (stmt_info);
1713 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1714 || STMT_VINFO_LIVE_P (stmt_info))
1715 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1717 /* A scalar-dependence cycle that we don't support. */
1718 if (dump_enabled_p ())
1719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1720 "not vectorized: scalar dependence cycle.\n");
1721 return false;
1724 if (STMT_VINFO_RELEVANT_P (stmt_info))
1726 need_to_vectorize = true;
1727 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1728 && ! PURE_SLP_STMT (stmt_info))
1729 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1730 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1731 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1732 && ! PURE_SLP_STMT (stmt_info))
1733 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1736 if (ok && STMT_VINFO_LIVE_P (stmt_info))
1737 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1739 if (!ok)
1741 if (dump_enabled_p ())
1743 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1744 "not vectorized: relevant phi not "
1745 "supported: ");
1746 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1748 return false;
1752 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1753 gsi_next (&si))
1755 gimple *stmt = gsi_stmt (si);
1756 if (!gimple_clobber_p (stmt)
1757 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1758 return false;
1760 } /* bbs */
1762 /* All operations in the loop are either irrelevant (deal with loop
1763 control, or dead), or only used outside the loop and can be moved
1764 out of the loop (e.g. invariants, inductions). The loop can be
1765 optimized away by scalar optimizations. We're better off not
1766 touching this loop. */
1767 if (!need_to_vectorize)
1769 if (dump_enabled_p ())
1770 dump_printf_loc (MSG_NOTE, vect_location,
1771 "All the computation can be taken out of the loop.\n");
1772 if (dump_enabled_p ())
1773 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1774 "not vectorized: redundant loop. no profit to "
1775 "vectorize.\n");
1776 return false;
1779 return true;
1783 /* Function vect_analyze_loop_2.
1785 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1786 for it. The different analyses will record information in the
1787 loop_vec_info struct. */
1788 static bool
1789 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1791 bool ok;
1792 int max_vf = MAX_VECTORIZATION_FACTOR;
1793 int min_vf = 2;
1794 unsigned int n_stmts = 0;
1796 /* The first group of checks is independent of the vector size. */
1797 fatal = true;
1799 /* Find all data references in the loop (which correspond to vdefs/vuses)
1800 and analyze their evolution in the loop. */
1802 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1804 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1805 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1807 if (dump_enabled_p ())
1808 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1809 "not vectorized: loop nest containing two "
1810 "or more consecutive inner loops cannot be "
1811 "vectorized\n");
1812 return false;
1815 for (unsigned i = 0; i < loop->num_nodes; i++)
1816 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1817 !gsi_end_p (gsi); gsi_next (&gsi))
1819 gimple *stmt = gsi_stmt (gsi);
1820 if (is_gimple_debug (stmt))
1821 continue;
1822 ++n_stmts;
1823 if (!find_data_references_in_stmt (loop, stmt,
1824 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1826 if (is_gimple_call (stmt) && loop->safelen)
1828 tree fndecl = gimple_call_fndecl (stmt), op;
1829 if (fndecl != NULL_TREE)
1831 cgraph_node *node = cgraph_node::get (fndecl);
1832 if (node != NULL && node->simd_clones != NULL)
1834 unsigned int j, n = gimple_call_num_args (stmt);
1835 for (j = 0; j < n; j++)
1837 op = gimple_call_arg (stmt, j);
1838 if (DECL_P (op)
1839 || (REFERENCE_CLASS_P (op)
1840 && get_base_address (op)))
1841 break;
1843 op = gimple_call_lhs (stmt);
1844 /* Ignore #pragma omp declare simd functions
1845 if they don't have data references in the
1846 call stmt itself. */
1847 if (j == n
1848 && !(op
1849 && (DECL_P (op)
1850 || (REFERENCE_CLASS_P (op)
1851 && get_base_address (op)))))
1852 continue;
1856 if (dump_enabled_p ())
1857 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1858 "not vectorized: loop contains function "
1859 "calls or data references that cannot "
1860 "be analyzed\n");
1861 return false;
1865 /* Analyze the data references and also adjust the minimal
1866 vectorization factor according to the loads and stores. */
1868 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1869 if (!ok)
1871 if (dump_enabled_p ())
1872 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1873 "bad data references.\n");
1874 return false;
1877 /* Classify all cross-iteration scalar data-flow cycles.
1878 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1879 vect_analyze_scalar_cycles (loop_vinfo);
1881 vect_pattern_recog (loop_vinfo);
1883 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1885 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1886 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1888 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1889 if (!ok)
1891 if (dump_enabled_p ())
1892 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1893 "bad data access.\n");
1894 return false;
1897 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1899 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1900 if (!ok)
1902 if (dump_enabled_p ())
1903 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1904 "unexpected pattern.\n");
1905 return false;
1908 /* While the rest of the analysis below depends on it in some way. */
1909 fatal = false;
1911 /* Analyze data dependences between the data-refs in the loop
1912 and adjust the maximum vectorization factor according to
1913 the dependences.
1914 FORNOW: fail at the first data dependence that we encounter. */
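/* Editorial illustration, not part of the original sources: in

	for (i = 0; i < n; i++)
	  a[i + 3] = a[i] * 2;

   the value stored in iteration i is read again in iteration i + 3, so
   at most 3 consecutive iterations can safely execute in lock-step and
   the maximum vectorization factor is capped accordingly.  */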
1916 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1917 if (!ok
1918 || max_vf < min_vf)
1920 if (dump_enabled_p ())
1921 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1922 "bad data dependence.\n");
1923 return false;
1925 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1927 ok = vect_determine_vectorization_factor (loop_vinfo);
1928 if (!ok)
1930 if (dump_enabled_p ())
1931 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1932 "can't determine vectorization factor.\n");
1933 return false;
1935 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1937 if (dump_enabled_p ())
1938 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1939 "bad data dependence.\n");
1940 return false;
1943 /* Compute the scalar iteration cost. */
1944 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1946 int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1947 HOST_WIDE_INT estimated_niter;
1948 unsigned th;
1949 int min_scalar_loop_bound;
1951 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1952 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1953 if (!ok)
1954 return false;
1956 /* If there are any SLP instances mark them as pure_slp. */
1957 bool slp = vect_make_slp_decision (loop_vinfo);
1958 if (slp)
1960 /* Find stmts that need to be both vectorized and SLPed. */
1961 vect_detect_hybrid_slp (loop_vinfo);
1963 /* Update the vectorization factor based on the SLP decision. */
1964 vect_update_vf_for_slp (loop_vinfo);
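/* The SLP instances chosen above imply their own unrolling factor, which
may differ from the vectorization factor computed earlier, so the factor
is re-derived here before the iteration-count and cost checks below. */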
1967 /* This is the point where we can re-start analysis with SLP forced off. */
1968 start_over:
1970 /* Now the vectorization factor is final. */
1971 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1972 gcc_assert (vectorization_factor != 0);
1974 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1975 dump_printf_loc (MSG_NOTE, vect_location,
1976 "vectorization_factor = %d, niters = "
1977 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1978 LOOP_VINFO_INT_NITERS (loop_vinfo));
1980 HOST_WIDE_INT max_niter
1981 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1982 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1983 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1984 || (max_niter != -1
1985 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1987 if (dump_enabled_p ())
1988 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1989 "not vectorized: iteration count smaller than "
1990 "vectorization factor.\n");
1991 return false;
1994 /* Analyze the alignment of the data-refs in the loop.
1995 Fail if a data reference is found that cannot be vectorized. */
1997 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1998 if (!ok)
2000 if (dump_enabled_p ())
2001 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2002 "bad data alignment.\n");
2003 return false;
2006 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2007 It is important to call pruning after vect_analyze_data_ref_accesses,
2008 since we use grouping information gathered by interleaving analysis. */
2009 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2010 if (!ok)
2011 return false;
2013 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2014 vectorization. */
2015 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2017 /* This pass will decide on using loop versioning and/or loop peeling in
2018 order to enhance the alignment of data references in the loop. */
2019 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2020 if (!ok)
2022 if (dump_enabled_p ())
2023 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2024 "bad data alignment.\n");
2025 return false;
2029 if (slp)
2031 /* Analyze operations in the SLP instances. Note this may
2032 remove unsupported SLP instances which makes the above
2033 SLP kind detection invalid. */
2034 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2035 vect_slp_analyze_operations (loop_vinfo);
2036 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2037 goto again;
2040 /* Scan all the remaining operations in the loop that are not subject
2041 to SLP and make sure they are vectorizable. */
2042 ok = vect_analyze_loop_operations (loop_vinfo);
2043 if (!ok)
2045 if (dump_enabled_p ())
2046 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2047 "bad operation or unsupported loop bound.\n");
2048 return false;
2051 /* If epilog loop is required because of data accesses with gaps,
2052 one additional iteration needs to be peeled. Check if there are
2053 enough iterations for vectorization. */
2054 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2055 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2057 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2058 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2060 if (wi::to_widest (scalar_niters) < vf)
2062 if (dump_enabled_p ())
2063 dump_printf_loc (MSG_NOTE, vect_location,
2064 "loop has no enough iterations to support"
2065 " peeling for gaps.\n");
2066 return false;
2070 /* Analyze cost. Decide if it is worthwhile to vectorize. */
2071 int min_profitable_estimate, min_profitable_iters;
2072 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2073 &min_profitable_estimate);
2075 if (min_profitable_iters < 0)
2077 if (dump_enabled_p ())
2078 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2079 "not vectorized: vectorization not profitable.\n");
2080 if (dump_enabled_p ())
2081 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2082 "not vectorized: vector version will never be "
2083 "profitable.\n");
2084 goto again;
2087 min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2088 * vectorization_factor);
2090 /* Use the cost model only if it is more conservative than the user-specified
2091 threshold. */
2092 th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2094 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
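/* Illustrative example (arbitrary numbers): with --param
min-vect-loop-bound=2 and a vectorization factor of 4,
min_scalar_loop_bound is 8; if the cost model computed
min_profitable_iters = 11, the recorded threshold TH becomes 11. */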
2096 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2097 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2099 if (dump_enabled_p ())
2100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2101 "not vectorized: vectorization not profitable.\n");
2102 if (dump_enabled_p ())
2103 dump_printf_loc (MSG_NOTE, vect_location,
2104 "not vectorized: iteration count smaller than user "
2105 "specified loop bound parameter or minimum profitable "
2106 "iterations (whichever is more conservative).\n");
2107 goto again;
2110 estimated_niter
2111 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2112 if (estimated_niter == -1)
2113 estimated_niter = max_niter;
2114 if (estimated_niter != -1
2115 && ((unsigned HOST_WIDE_INT) estimated_niter
2116 < MAX (th, (unsigned) min_profitable_estimate)))
2118 if (dump_enabled_p ())
2119 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2120 "not vectorized: estimated iteration count too "
2121 "small.\n");
2122 if (dump_enabled_p ())
2123 dump_printf_loc (MSG_NOTE, vect_location,
2124 "not vectorized: estimated iteration count smaller "
2125 "than specified loop bound parameter or minimum "
2126 "profitable iterations (whichever is more "
2127 "conservative).\n");
2128 goto again;
2131 /* Decide whether we need to create an epilogue loop to handle
2132 remaining scalar iterations. */
2133 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo)
2134 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2135 * LOOP_VINFO_VECT_FACTOR (loop_vinfo));
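/* This rounds the cost-model threshold down to a multiple of the
vectorization factor, e.g. a threshold of 11 with a factor of 4 gives
TH = 8 (illustrative numbers). */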
2137 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2138 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2140 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2141 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2142 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2143 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2145 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2146 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2147 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2148 /* In case of versioning, check if the maximum number of
2149 iterations is greater than th. If they are identical,
2150 the epilogue is unnecessary. */
2151 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2152 || (unsigned HOST_WIDE_INT) max_niter > th)))
2153 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2155 /* If an epilogue loop is required make sure we can create one. */
2156 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2157 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2159 if (dump_enabled_p ())
2160 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2161 if (!vect_can_advance_ivs_p (loop_vinfo)
2162 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2163 single_exit (LOOP_VINFO_LOOP
2164 (loop_vinfo))))
2166 if (dump_enabled_p ())
2167 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2168 "not vectorized: can't create required "
2169 "epilog loop\n");
2170 goto again;
2174 /* During peeling, we need to check if the number of loop iterations is
2175 enough for both the peeled prolog loop and the vector loop. This check
2176 can be merged along with threshold check of loop versioning, so
2177 increase threshold for this case if necessary. */
2178 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
2179 && (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2180 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2182 unsigned niters_th;
2184 /* Niters for peeled prolog loop. */
2185 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2187 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2188 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2190 niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2192 else
2193 niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2195 /* Niters for at least one iteration of the vectorized loop. */
2196 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2197 /* One additional iteration because of peeling for gaps. */
2198 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2199 niters_th++;
2200 if (LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) < niters_th)
2201 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = niters_th;
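/* Illustrative example: with a vectorization factor of 4, unknown peeling
for alignment on a 4-element vector type (up to 3 prologue iterations)
and peeling for gaps, niters_th = 3 + 4 + 1 = 8, and the cost-model
threshold is raised to 8 if it was lower. */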
2204 gcc_assert (vectorization_factor
2205 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2207 /* Ok to vectorize! */
2208 return true;
2210 again:
2211 /* Try again with SLP forced off, but if we didn't do any SLP there is
2212 no point in re-trying. */
2213 if (!slp)
2214 return false;
2216 /* If there are reduction chains re-trying will fail anyway. */
2217 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2218 return false;
2220 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2221 via interleaving or lane instructions. */
2222 slp_instance instance;
2223 slp_tree node;
2224 unsigned i, j;
2225 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2227 stmt_vec_info vinfo;
2228 vinfo = vinfo_for_stmt
2229 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2230 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2231 continue;
2232 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2233 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2234 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2235 if (! vect_store_lanes_supported (vectype, size)
2236 && ! vect_grouped_store_supported (vectype, size))
2237 return false;
2238 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2240 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2241 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2242 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2243 size = STMT_VINFO_GROUP_SIZE (vinfo);
2244 vectype = STMT_VINFO_VECTYPE (vinfo);
2245 if (! vect_load_lanes_supported (vectype, size)
2246 && ! vect_grouped_load_supported (vectype, single_element_p,
2247 size))
2248 return false;
2252 if (dump_enabled_p ())
2253 dump_printf_loc (MSG_NOTE, vect_location,
2254 "re-trying with SLP disabled\n");
2256 /* Roll back state appropriately. No SLP this time. */
2257 slp = false;
2258 /* Restore the vectorization factor as it was without SLP. */
2259 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2260 /* Free the SLP instances. */
2261 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2262 vect_free_slp_instance (instance);
2263 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2264 /* Reset SLP type to loop_vect on all stmts. */
2265 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2267 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2268 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2269 !gsi_end_p (si); gsi_next (&si))
2271 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2272 STMT_SLP_TYPE (stmt_info) = loop_vect;
2274 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2275 !gsi_end_p (si); gsi_next (&si))
2277 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2278 STMT_SLP_TYPE (stmt_info) = loop_vect;
2279 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2281 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2282 STMT_SLP_TYPE (stmt_info) = loop_vect;
2283 for (gimple_stmt_iterator pi
2284 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2285 !gsi_end_p (pi); gsi_next (&pi))
2287 gimple *pstmt = gsi_stmt (pi);
2288 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2293 /* Free optimized alias test DDRS. */
2294 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2295 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2296 /* Reset target cost data. */
2297 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2298 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2299 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2300 /* Reset assorted flags. */
2301 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2302 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2303 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2305 goto start_over;
2308 /* Function vect_analyze_loop.
2310 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2311 for it. The different analyses will record information in the
2312 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, LOOP is the
2313 epilogue of an already-vectorized loop and must itself be vectorized. */
2314 loop_vec_info
2315 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2317 loop_vec_info loop_vinfo;
2318 unsigned int vector_sizes;
2320 /* Autodetect first vector size we try. */
2321 current_vector_size = 0;
2322 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2324 if (dump_enabled_p ())
2325 dump_printf_loc (MSG_NOTE, vect_location,
2326 "===== analyze_loop_nest =====\n");
2328 if (loop_outer (loop)
2329 && loop_vec_info_for_loop (loop_outer (loop))
2330 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_NOTE, vect_location,
2334 "outer-loop already vectorized.\n");
2335 return NULL;
2338 while (1)
2340 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2341 loop_vinfo = vect_analyze_loop_form (loop);
2342 if (!loop_vinfo)
2344 if (dump_enabled_p ())
2345 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2346 "bad loop form.\n");
2347 return NULL;
2350 bool fatal = false;
2352 if (orig_loop_vinfo)
2353 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2355 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2357 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2359 return loop_vinfo;
2362 delete loop_vinfo;
2364 vector_sizes &= ~current_vector_size;
2365 if (fatal
2366 || vector_sizes == 0
2367 || current_vector_size == 0)
2368 return NULL;
2370 /* Try the next biggest vector size. */
2371 current_vector_size = 1 << floor_log2 (vector_sizes);
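/* For example (illustrative), if the target advertises vector sizes of
16, 8 and 4 bytes (mask 0x1c) and the analysis with 16 bytes just
failed, the mask becomes 0xc and the next attempt uses
current_vector_size = 8. */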
2372 if (dump_enabled_p ())
2373 dump_printf_loc (MSG_NOTE, vect_location,
2374 "***** Re-trying analysis with "
2375 "vector size %d\n", current_vector_size);
2380 /* Function reduction_fn_for_scalar_code
2382 Input:
2383 CODE - tree_code of a reduction operation.
2385 Output:
2386 REDUC_FN - the corresponding internal function to be used to reduce the
2387 vector of partial results into a single scalar result, or IFN_LAST
2388 if the operation is a supported reduction operation, but does not have
2389 such an internal function.
2391 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2393 static bool
2394 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2396 switch (code)
2398 case MAX_EXPR:
2399 *reduc_fn = IFN_REDUC_MAX;
2400 return true;
2402 case MIN_EXPR:
2403 *reduc_fn = IFN_REDUC_MIN;
2404 return true;
2406 case PLUS_EXPR:
2407 *reduc_fn = IFN_REDUC_PLUS;
2408 return true;
2410 case MULT_EXPR:
2411 case MINUS_EXPR:
2412 case BIT_IOR_EXPR:
2413 case BIT_XOR_EXPR:
2414 case BIT_AND_EXPR:
2415 *reduc_fn = IFN_LAST;
2416 return true;
2418 default:
2419 return false;
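/* For example, reduction_fn_for_scalar_code (MAX_EXPR, &fn) sets fn to
IFN_REDUC_MAX and returns true, while a code such as RDIV_EXPR falls
through to the default case and is rejected. */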
2424 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2425 STMT is printed with a message MSG. */
2427 static void
2428 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2430 dump_printf_loc (msg_type, vect_location, "%s", msg);
2431 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2435 /* Detect SLP reduction of the form:
2437 #a1 = phi <a5, a0>
2438 a2 = operation (a1)
2439 a3 = operation (a2)
2440 a4 = operation (a3)
2441 a5 = operation (a4)
2443 #a = phi <a5>
2445 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2446 FIRST_STMT is the first reduction stmt in the chain
2447 (a2 = operation (a1)).
2449 Return TRUE if a reduction chain was detected. */
2451 static bool
2452 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2453 gimple *first_stmt)
2455 struct loop *loop = (gimple_bb (phi))->loop_father;
2456 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2457 enum tree_code code;
2458 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2459 stmt_vec_info use_stmt_info, current_stmt_info;
2460 tree lhs;
2461 imm_use_iterator imm_iter;
2462 use_operand_p use_p;
2463 int nloop_uses, size = 0, n_out_of_loop_uses;
2464 bool found = false;
2466 if (loop != vect_loop)
2467 return false;
2469 lhs = PHI_RESULT (phi);
2470 code = gimple_assign_rhs_code (first_stmt);
2471 while (1)
2473 nloop_uses = 0;
2474 n_out_of_loop_uses = 0;
2475 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2477 gimple *use_stmt = USE_STMT (use_p);
2478 if (is_gimple_debug (use_stmt))
2479 continue;
2481 /* Check if we got back to the reduction phi. */
2482 if (use_stmt == phi)
2484 loop_use_stmt = use_stmt;
2485 found = true;
2486 break;
2489 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2491 loop_use_stmt = use_stmt;
2492 nloop_uses++;
2494 else
2495 n_out_of_loop_uses++;
2497 /* There can be either a single use in the loop or two uses in
2498 phi nodes. */
2499 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2500 return false;
2503 if (found)
2504 break;
2506 /* We reached a statement with no loop uses. */
2507 if (nloop_uses == 0)
2508 return false;
2510 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2511 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2512 return false;
2514 if (!is_gimple_assign (loop_use_stmt)
2515 || code != gimple_assign_rhs_code (loop_use_stmt)
2516 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2517 return false;
2519 /* Insert USE_STMT into reduction chain. */
2520 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2521 if (current_stmt)
2523 current_stmt_info = vinfo_for_stmt (current_stmt);
2524 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2525 GROUP_FIRST_ELEMENT (use_stmt_info)
2526 = GROUP_FIRST_ELEMENT (current_stmt_info);
2528 else
2529 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2531 lhs = gimple_assign_lhs (loop_use_stmt);
2532 current_stmt = loop_use_stmt;
2533 size++;
2536 if (!found || loop_use_stmt != phi || size < 2)
2537 return false;
2539 /* Swap the operands, if needed, to make the reduction operand be the second
2540 operand. */
2541 lhs = PHI_RESULT (phi);
2542 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2543 while (next_stmt)
2545 if (gimple_assign_rhs2 (next_stmt) == lhs)
2547 tree op = gimple_assign_rhs1 (next_stmt);
2548 gimple *def_stmt = NULL;
2550 if (TREE_CODE (op) == SSA_NAME)
2551 def_stmt = SSA_NAME_DEF_STMT (op);
2553 /* Check that the other def is either defined in the loop
2554 ("vect_internal_def"), or it's an induction (defined by a
2555 loop-header phi-node). */
2556 if (def_stmt
2557 && gimple_bb (def_stmt)
2558 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2559 && (is_gimple_assign (def_stmt)
2560 || is_gimple_call (def_stmt)
2561 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2562 == vect_induction_def
2563 || (gimple_code (def_stmt) == GIMPLE_PHI
2564 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2565 == vect_internal_def
2566 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2568 lhs = gimple_assign_lhs (next_stmt);
2569 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2570 continue;
2573 return false;
2575 else
2577 tree op = gimple_assign_rhs2 (next_stmt);
2578 gimple *def_stmt = NULL;
2580 if (TREE_CODE (op) == SSA_NAME)
2581 def_stmt = SSA_NAME_DEF_STMT (op);
2583 /* Check that the other def is either defined in the loop
2584 ("vect_internal_def"), or it's an induction (defined by a
2585 loop-header phi-node). */
2586 if (def_stmt
2587 && gimple_bb (def_stmt)
2588 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2589 && (is_gimple_assign (def_stmt)
2590 || is_gimple_call (def_stmt)
2591 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2592 == vect_induction_def
2593 || (gimple_code (def_stmt) == GIMPLE_PHI
2594 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2595 == vect_internal_def
2596 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2598 if (dump_enabled_p ())
2600 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2601 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2604 swap_ssa_operands (next_stmt,
2605 gimple_assign_rhs1_ptr (next_stmt),
2606 gimple_assign_rhs2_ptr (next_stmt));
2607 update_stmt (next_stmt);
2609 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2610 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2612 else
2613 return false;
2616 lhs = gimple_assign_lhs (next_stmt);
2617 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2620 /* Save the chain for further analysis in SLP detection. */
2621 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2622 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2623 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2625 return true;
2629 /* Function vect_is_simple_reduction
2631 (1) Detect a cross-iteration def-use cycle that represents a simple
2632 reduction computation. We look for the following pattern:
2634 loop_header:
2635 a1 = phi < a0, a2 >
2636 a3 = ...
2637 a2 = operation (a3, a1)
2639 or
2641 a3 = ...
2642 loop_header:
2643 a1 = phi < a0, a2 >
2644 a2 = operation (a3, a1)
2646 such that:
2647 1. operation is commutative and associative and it is safe to
2648 change the order of the computation
2649 2. no uses for a2 in the loop (a2 is used out of the loop)
2650 3. no uses of a1 in the loop besides the reduction operation
2651 4. no uses of a1 outside the loop.
2653 Conditions 1,4 are tested here.
2654 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2656 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2657 nested cycles.
2659 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2660 reductions:
2662 a1 = phi < a0, a2 >
2663 inner loop (def of a3)
2664 a2 = phi < a3 >
2666 (4) Detect condition expressions, i.e.:
2667 for (int i = 0; i < N; i++)
2668 if (a[i] < val)
2669 ret_val = a[i];
2673 static gimple *
2674 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2675 bool *double_reduc,
2676 bool need_wrapping_integral_overflow,
2677 enum vect_reduction_type *v_reduc_type)
2679 struct loop *loop = (gimple_bb (phi))->loop_father;
2680 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2681 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2682 enum tree_code orig_code, code;
2683 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2684 tree type;
2685 int nloop_uses;
2686 tree name;
2687 imm_use_iterator imm_iter;
2688 use_operand_p use_p;
2689 bool phi_def;
2691 *double_reduc = false;
2692 *v_reduc_type = TREE_CODE_REDUCTION;
2694 tree phi_name = PHI_RESULT (phi);
2695 /* ??? If there are no uses of the PHI result the inner loop reduction
2696 won't be detected as possibly double-reduction by vectorizable_reduction
2697 because that tries to walk the PHI arg from the preheader edge which
2698 can be constant. See PR60382. */
2699 if (has_zero_uses (phi_name))
2700 return NULL;
2701 nloop_uses = 0;
2702 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2704 gimple *use_stmt = USE_STMT (use_p);
2705 if (is_gimple_debug (use_stmt))
2706 continue;
2708 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2710 if (dump_enabled_p ())
2711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2712 "intermediate value used outside loop.\n");
2714 return NULL;
2717 nloop_uses++;
2718 if (nloop_uses > 1)
2720 if (dump_enabled_p ())
2721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2722 "reduction value used in loop.\n");
2723 return NULL;
2726 phi_use_stmt = use_stmt;
2729 edge latch_e = loop_latch_edge (loop);
2730 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2731 if (TREE_CODE (loop_arg) != SSA_NAME)
2733 if (dump_enabled_p ())
2735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2736 "reduction: not ssa_name: ");
2737 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2738 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2740 return NULL;
2743 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2744 if (is_gimple_assign (def_stmt))
2746 name = gimple_assign_lhs (def_stmt);
2747 phi_def = false;
2749 else if (gimple_code (def_stmt) == GIMPLE_PHI)
2751 name = PHI_RESULT (def_stmt);
2752 phi_def = true;
2754 else
2756 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2759 "reduction: unhandled reduction operation: ");
2760 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2762 return NULL;
2765 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2766 return NULL;
2768 nloop_uses = 0;
2769 auto_vec<gphi *, 3> lcphis;
2770 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2772 gimple *use_stmt = USE_STMT (use_p);
2773 if (is_gimple_debug (use_stmt))
2774 continue;
2775 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2776 nloop_uses++;
2777 else
2778 /* We can have more than one loop-closed PHI. */
2779 lcphis.safe_push (as_a <gphi *> (use_stmt));
2780 if (nloop_uses > 1)
2782 if (dump_enabled_p ())
2783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2784 "reduction used in loop.\n");
2785 return NULL;
2789 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2790 defined in the inner loop. */
2791 if (phi_def)
2793 op1 = PHI_ARG_DEF (def_stmt, 0);
2795 if (gimple_phi_num_args (def_stmt) != 1
2796 || TREE_CODE (op1) != SSA_NAME)
2798 if (dump_enabled_p ())
2799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2800 "unsupported phi node definition.\n");
2802 return NULL;
2805 def1 = SSA_NAME_DEF_STMT (op1);
2806 if (gimple_bb (def1)
2807 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2808 && loop->inner
2809 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2810 && is_gimple_assign (def1)
2811 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2813 if (dump_enabled_p ())
2814 report_vect_op (MSG_NOTE, def_stmt,
2815 "detected double reduction: ");
2817 *double_reduc = true;
2818 return def_stmt;
2821 return NULL;
2824 /* If we are vectorizing an inner reduction, we execute it in the
2825 original order only when we are not dealing with a double
2826 reduction. */
2827 bool check_reduction = true;
2828 if (flow_loop_nested_p (vect_loop, loop))
2830 gphi *lcphi;
2831 unsigned i;
2832 check_reduction = false;
2833 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2834 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2836 gimple *use_stmt = USE_STMT (use_p);
2837 if (is_gimple_debug (use_stmt))
2838 continue;
2839 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2840 check_reduction = true;
2844 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2845 code = orig_code = gimple_assign_rhs_code (def_stmt);
2847 /* We can handle "res -= x[i]", which is non-associative, by simply
2848 rewriting this into "res += -x[i]". Avoid changing the gimple
2849 instruction for the first simple tests and only do this
2850 if we're allowed to change code at all. */
2851 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2852 code = PLUS_EXPR;
2854 if (code == COND_EXPR)
2856 if (! nested_in_vect_loop)
2857 *v_reduc_type = COND_REDUCTION;
2859 op3 = gimple_assign_rhs1 (def_stmt);
2860 if (COMPARISON_CLASS_P (op3))
2862 op4 = TREE_OPERAND (op3, 1);
2863 op3 = TREE_OPERAND (op3, 0);
2865 if (op3 == phi_name || op4 == phi_name)
2867 if (dump_enabled_p ())
2868 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2869 "reduction: condition depends on previous"
2870 " iteration: ");
2871 return NULL;
2874 op1 = gimple_assign_rhs2 (def_stmt);
2875 op2 = gimple_assign_rhs3 (def_stmt);
2877 else if (!commutative_tree_code (code) || !associative_tree_code (code))
2879 if (dump_enabled_p ())
2880 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2881 "reduction: not commutative/associative: ");
2882 return NULL;
2884 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2886 op1 = gimple_assign_rhs1 (def_stmt);
2887 op2 = gimple_assign_rhs2 (def_stmt);
2889 else
2891 if (dump_enabled_p ())
2892 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2893 "reduction: not handled operation: ");
2894 return NULL;
2897 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2899 if (dump_enabled_p ())
2900 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2901 "reduction: both uses not ssa_names: ");
2903 return NULL;
2906 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2907 if ((TREE_CODE (op1) == SSA_NAME
2908 && !types_compatible_p (type,TREE_TYPE (op1)))
2909 || (TREE_CODE (op2) == SSA_NAME
2910 && !types_compatible_p (type, TREE_TYPE (op2)))
2911 || (op3 && TREE_CODE (op3) == SSA_NAME
2912 && !types_compatible_p (type, TREE_TYPE (op3)))
2913 || (op4 && TREE_CODE (op4) == SSA_NAME
2914 && !types_compatible_p (type, TREE_TYPE (op4))))
2916 if (dump_enabled_p ())
2918 dump_printf_loc (MSG_NOTE, vect_location,
2919 "reduction: multiple types: operation type: ");
2920 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2921 dump_printf (MSG_NOTE, ", operands types: ");
2922 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2923 TREE_TYPE (op1));
2924 dump_printf (MSG_NOTE, ",");
2925 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2926 TREE_TYPE (op2));
2927 if (op3)
2929 dump_printf (MSG_NOTE, ",");
2930 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2931 TREE_TYPE (op3));
2934 if (op4)
2936 dump_printf (MSG_NOTE, ",");
2937 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2938 TREE_TYPE (op4));
2940 dump_printf (MSG_NOTE, "\n");
2943 return NULL;
2946 /* Check that it's ok to change the order of the computation.
2947 Generally, when vectorizing a reduction we change the order of the
2948 computation. This may change the behavior of the program in some
2949 cases, so we need to check that this is ok. One exception is when
2950 vectorizing an outer-loop: the inner-loop is executed sequentially,
2951 and therefore vectorizing reductions in the inner-loop during
2952 outer-loop vectorization is safe. */
2954 if (*v_reduc_type != COND_REDUCTION
2955 && check_reduction)
2957 /* CHECKME: check for !flag_finite_math_only too? */
2958 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2960 /* Changing the order of operations changes the semantics. */
2961 if (dump_enabled_p ())
2962 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2963 "reduction: unsafe fp math optimization: ");
2964 return NULL;
2966 else if (INTEGRAL_TYPE_P (type))
2968 if (!operation_no_trapping_overflow (type, code))
2970 /* Changing the order of operations changes the semantics. */
2971 if (dump_enabled_p ())
2972 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2973 "reduction: unsafe int math optimization"
2974 " (overflow traps): ");
2975 return NULL;
2977 if (need_wrapping_integral_overflow
2978 && !TYPE_OVERFLOW_WRAPS (type)
2979 && operation_can_overflow (code))
2981 /* Changing the order of operations changes the semantics. */
2982 if (dump_enabled_p ())
2983 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2984 "reduction: unsafe int math optimization"
2985 " (overflow doesn't wrap): ");
2986 return NULL;
2989 else if (SAT_FIXED_POINT_TYPE_P (type))
2991 /* Changing the order of operations changes the semantics. */
2992 if (dump_enabled_p ())
2993 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2994 "reduction: unsafe fixed-point math optimization: ");
2995 return NULL;
2999 /* Reduction is safe. We're dealing with one of the following:
3000 1) integer arithmetic and no trapv
3001 2) floating point arithmetic, and special flags permit this optimization
3002 3) nested cycle (i.e., outer loop vectorization). */
3003 if (TREE_CODE (op1) == SSA_NAME)
3004 def1 = SSA_NAME_DEF_STMT (op1);
3006 if (TREE_CODE (op2) == SSA_NAME)
3007 def2 = SSA_NAME_DEF_STMT (op2);
3009 if (code != COND_EXPR
3010 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3012 if (dump_enabled_p ())
3013 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3014 return NULL;
3017 /* Check that one def is the reduction def, defined by PHI,
3018 the other def is either defined in the loop ("vect_internal_def"),
3019 or it's an induction (defined by a loop-header phi-node). */
3021 if (def2 && def2 == phi
3022 && (code == COND_EXPR
3023 || !def1 || gimple_nop_p (def1)
3024 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3025 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3026 && (is_gimple_assign (def1)
3027 || is_gimple_call (def1)
3028 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3029 == vect_induction_def
3030 || (gimple_code (def1) == GIMPLE_PHI
3031 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3032 == vect_internal_def
3033 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3035 if (dump_enabled_p ())
3036 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3037 return def_stmt;
3040 if (def1 && def1 == phi
3041 && (code == COND_EXPR
3042 || !def2 || gimple_nop_p (def2)
3043 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3044 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3045 && (is_gimple_assign (def2)
3046 || is_gimple_call (def2)
3047 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3048 == vect_induction_def
3049 || (gimple_code (def2) == GIMPLE_PHI
3050 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3051 == vect_internal_def
3052 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3054 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3056 /* Check if we can swap operands (just for simplicity - so that
3057 the rest of the code can assume that the reduction variable
3058 is always the last (second) argument). */
3059 if (code == COND_EXPR)
3061 /* Swap cond_expr by inverting the condition. */
3062 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3063 enum tree_code invert_code = ERROR_MARK;
3064 enum tree_code cond_code = TREE_CODE (cond_expr);
3066 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3068 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3069 invert_code = invert_tree_comparison (cond_code, honor_nans);
3071 if (invert_code != ERROR_MARK)
3073 TREE_SET_CODE (cond_expr, invert_code);
3074 swap_ssa_operands (def_stmt,
3075 gimple_assign_rhs2_ptr (def_stmt),
3076 gimple_assign_rhs3_ptr (def_stmt));
3078 else
3080 if (dump_enabled_p ())
3081 report_vect_op (MSG_NOTE, def_stmt,
3082 "detected reduction: cannot swap operands "
3083 "for cond_expr");
3084 return NULL;
3087 else
3088 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3089 gimple_assign_rhs2_ptr (def_stmt));
3091 if (dump_enabled_p ())
3092 report_vect_op (MSG_NOTE, def_stmt,
3093 "detected reduction: need to swap operands: ");
3095 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3096 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3098 else
3100 if (dump_enabled_p ())
3101 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3104 return def_stmt;
3107 /* Try to find SLP reduction chain. */
3108 if (! nested_in_vect_loop
3109 && code != COND_EXPR
3110 && orig_code != MINUS_EXPR
3111 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3113 if (dump_enabled_p ())
3114 report_vect_op (MSG_NOTE, def_stmt,
3115 "reduction: detected reduction chain: ");
3117 return def_stmt;
3120 /* Dissolve any group half-built by vect_is_slp_reduction. */
3121 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3122 while (first)
3124 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3125 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3126 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3127 first = next;
3130 /* Look for the expression computing loop_arg from loop PHI result. */
3131 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3132 auto_bitmap visited;
3133 tree lookfor = PHI_RESULT (phi);
3134 ssa_op_iter curri;
3135 use_operand_p curr = op_iter_init_phiuse (&curri, as_a <gphi *>(phi),
3136 SSA_OP_USE);
3137 while (USE_FROM_PTR (curr) != loop_arg)
3138 curr = op_iter_next_use (&curri);
3139 curri.i = curri.numops;
3140 do
3142 path.safe_push (std::make_pair (curri, curr));
3143 tree use = USE_FROM_PTR (curr);
3144 if (use == lookfor)
3145 break;
3146 gimple *def = SSA_NAME_DEF_STMT (use);
3147 if (gimple_nop_p (def)
3148 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3150 pop:
3151 do
3153 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3154 curri = x.first;
3155 curr = x.second;
3156 do
3157 curr = op_iter_next_use (&curri);
3158 /* Skip already visited or non-SSA operands (from iterating
3159 over PHI args). */
3160 while (curr != NULL_USE_OPERAND_P
3161 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3162 || ! bitmap_set_bit (visited,
3163 SSA_NAME_VERSION
3164 (USE_FROM_PTR (curr)))));
3166 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3167 if (curr == NULL_USE_OPERAND_P)
3168 break;
3170 else
3172 if (gimple_code (def) == GIMPLE_PHI)
3173 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3174 else
3175 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3176 while (curr != NULL_USE_OPERAND_P
3177 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3178 || ! bitmap_set_bit (visited,
3179 SSA_NAME_VERSION
3180 (USE_FROM_PTR (curr)))))
3181 curr = op_iter_next_use (&curri);
3182 if (curr == NULL_USE_OPERAND_P)
3183 goto pop;
3186 while (1);
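/* If the walk succeeded, PATH now holds the chain of uses leading from
the latch definition of the PHI back to the PHI result; this is the
candidate reduction path that is dumped and validated below. */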
3187 if (dump_file && (dump_flags & TDF_DETAILS))
3189 dump_printf_loc (MSG_NOTE, vect_location,
3190 "reduction path: ");
3191 unsigned i;
3192 std::pair<ssa_op_iter, use_operand_p> *x;
3193 FOR_EACH_VEC_ELT (path, i, x)
3195 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3196 dump_printf (MSG_NOTE, " ");
3198 dump_printf (MSG_NOTE, "\n");
3201 /* Check whether the reduction path detected is valid. */
3202 bool fail = path.length () == 0;
3203 bool neg = false;
3204 for (unsigned i = 1; i < path.length (); ++i)
3206 gimple *use_stmt = USE_STMT (path[i].second);
3207 tree op = USE_FROM_PTR (path[i].second);
3208 if (! has_single_use (op)
3209 || ! is_gimple_assign (use_stmt))
3211 fail = true;
3212 break;
3214 if (gimple_assign_rhs_code (use_stmt) != code)
3216 if (code == PLUS_EXPR
3217 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3219 /* Track whether we negate the reduction value each iteration. */
3220 if (gimple_assign_rhs2 (use_stmt) == op)
3221 neg = ! neg;
3223 else
3225 fail = true;
3226 break;
3230 if (! fail && ! neg)
3231 return def_stmt;
3233 if (dump_enabled_p ())
3235 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3236 "reduction: unknown pattern: ");
3239 return NULL;
3242 /* Wrapper around vect_is_simple_reduction, which will modify code
3243 in-place if it enables detection of more reductions. Arguments
3244 as there. */
3246 gimple *
3247 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3248 bool *double_reduc,
3249 bool need_wrapping_integral_overflow)
3251 enum vect_reduction_type v_reduc_type;
3252 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3253 need_wrapping_integral_overflow,
3254 &v_reduc_type);
3255 if (def)
3257 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3258 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3259 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3260 reduc_def_info = vinfo_for_stmt (def);
3261 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3263 return def;
3266 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3267 int
3268 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3269 int *peel_iters_epilogue,
3270 stmt_vector_for_cost *scalar_cost_vec,
3271 stmt_vector_for_cost *prologue_cost_vec,
3272 stmt_vector_for_cost *epilogue_cost_vec)
3274 int retval = 0;
3275 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3277 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3279 *peel_iters_epilogue = vf/2;
3280 if (dump_enabled_p ())
3281 dump_printf_loc (MSG_NOTE, vect_location,
3282 "cost model: epilogue peel iters set to vf/2 "
3283 "because loop iterations are unknown .\n");
3285 /* If peeled iterations are known but the number of scalar loop
3286 iterations is unknown, count a taken branch per peeled loop. */
3287 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3288 NULL, 0, vect_prologue);
3289 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3290 NULL, 0, vect_epilogue);
3292 else
3294 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3295 peel_iters_prologue = niters < peel_iters_prologue ?
3296 niters : peel_iters_prologue;
3297 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3298 /* If we need to peel for gaps but no epilogue peeling would otherwise
3299 be required, we have to peel VF iterations. */
3300 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3301 *peel_iters_epilogue = vf;
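/* Illustrative example: with niters = 100, a prologue peel of 3 and a
vectorization factor of 4, the epilogue gets (100 - 3) % 4 = 1
iteration; with peeling for gaps and an otherwise empty epilogue it
would get a full 4 iterations instead. */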
3304 stmt_info_for_cost *si;
3305 int j;
3306 if (peel_iters_prologue)
3307 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3309 stmt_vec_info stmt_info
3310 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3311 retval += record_stmt_cost (prologue_cost_vec,
3312 si->count * peel_iters_prologue,
3313 si->kind, stmt_info, si->misalign,
3314 vect_prologue);
3316 if (*peel_iters_epilogue)
3317 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3319 stmt_vec_info stmt_info
3320 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3321 retval += record_stmt_cost (epilogue_cost_vec,
3322 si->count * *peel_iters_epilogue,
3323 si->kind, stmt_info, si->misalign,
3324 vect_epilogue);
3327 return retval;
3330 /* Function vect_estimate_min_profitable_iters
3332 Return the number of iterations required for the vector version of the
3333 loop to be profitable relative to the cost of the scalar version of the
3334 loop.
3336 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3337 of iterations for vectorization. A value of -1 means loop vectorization
3338 is not profitable. This returned value may be used for a dynamic
3339 profitability check.
3341 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3342 for static check against estimated number of iterations. */
3344 static void
3345 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3346 int *ret_min_profitable_niters,
3347 int *ret_min_profitable_estimate)
3349 int min_profitable_iters;
3350 int min_profitable_estimate;
3351 int peel_iters_prologue;
3352 int peel_iters_epilogue;
3353 unsigned vec_inside_cost = 0;
3354 int vec_outside_cost = 0;
3355 unsigned vec_prologue_cost = 0;
3356 unsigned vec_epilogue_cost = 0;
3357 int scalar_single_iter_cost = 0;
3358 int scalar_outside_cost = 0;
3359 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3360 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3361 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3363 /* Cost model disabled. */
3364 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3366 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3367 *ret_min_profitable_niters = 0;
3368 *ret_min_profitable_estimate = 0;
3369 return;
3372 /* Requires loop versioning tests to handle misalignment. */
3373 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3375 /* FIXME: Make cost depend on complexity of individual check. */
3376 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3377 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3378 vect_prologue);
3379 dump_printf (MSG_NOTE,
3380 "cost model: Adding cost of checks for loop "
3381 "versioning to treat misalignment.\n");
3384 /* Requires loop versioning with alias checks. */
3385 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3387 /* FIXME: Make cost depend on complexity of individual check. */
3388 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3389 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3390 vect_prologue);
3391 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3392 if (len)
3393 /* Count LEN - 1 ANDs and LEN comparisons. */
3394 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3395 NULL, 0, vect_prologue);
3396 dump_printf (MSG_NOTE,
3397 "cost model: Adding cost of checks for loop "
3398 "versioning aliasing.\n");
3401 /* Requires loop versioning with niter checks. */
3402 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3404 /* FIXME: Make cost depend on complexity of individual check. */
3405 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3406 vect_prologue);
3407 dump_printf (MSG_NOTE,
3408 "cost model: Adding cost of checks for loop "
3409 "versioning niters.\n");
3412 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3413 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3414 vect_prologue);
3416 /* Count statements in scalar loop. Using this as scalar cost for a single
3417 iteration for now.
3419 TODO: Add outer loop support.
3421 TODO: Consider assigning different costs to different scalar
3422 statements. */
3424 scalar_single_iter_cost
3425 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3427 /* Add additional cost for the peeled instructions in prologue and epilogue
3428 loop.
3430 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3431 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3433 TODO: Build an expression that represents peel_iters for prologue and
3434 epilogue to be used in a run-time test. */
3436 if (npeel < 0)
3438 peel_iters_prologue = vf/2;
3439 dump_printf (MSG_NOTE, "cost model: "
3440 "prologue peel iters set to vf/2.\n");
3442 /* If peeling for alignment is unknown, the loop bound of the main loop
3443 becomes unknown. */
3444 peel_iters_epilogue = vf/2;
3445 dump_printf (MSG_NOTE, "cost model: "
3446 "epilogue peel iters set to vf/2 because "
3447 "peeling for alignment is unknown.\n");
3449 /* If peeled iterations are unknown, count a taken branch and a not taken
3450 branch per peeled loop. Even if scalar loop iterations are known,
3451 vector iterations are not known since peeled prologue iterations are
3452 not known. Hence guards remain the same. */
3453 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3454 NULL, 0, vect_prologue);
3455 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3456 NULL, 0, vect_prologue);
3457 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3458 NULL, 0, vect_epilogue);
3459 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3460 NULL, 0, vect_epilogue);
3461 stmt_info_for_cost *si;
3462 int j;
3463 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3465 struct _stmt_vec_info *stmt_info
3466 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3467 (void) add_stmt_cost (target_cost_data,
3468 si->count * peel_iters_prologue,
3469 si->kind, stmt_info, si->misalign,
3470 vect_prologue);
3471 (void) add_stmt_cost (target_cost_data,
3472 si->count * peel_iters_epilogue,
3473 si->kind, stmt_info, si->misalign,
3474 vect_epilogue);
3477 else
3479 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3480 stmt_info_for_cost *si;
3481 int j;
3482 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3484 prologue_cost_vec.create (2);
3485 epilogue_cost_vec.create (2);
3486 peel_iters_prologue = npeel;
3488 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3489 &peel_iters_epilogue,
3490 &LOOP_VINFO_SCALAR_ITERATION_COST
3491 (loop_vinfo),
3492 &prologue_cost_vec,
3493 &epilogue_cost_vec);
3495 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3497 struct _stmt_vec_info *stmt_info
3498 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3499 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3500 si->misalign, vect_prologue);
3503 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3505 struct _stmt_vec_info *stmt_info
3506 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3507 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3508 si->misalign, vect_epilogue);
3511 prologue_cost_vec.release ();
3512 epilogue_cost_vec.release ();
3515 /* FORNOW: The scalar outside cost is incremented in one of the
3516 following ways:
3518 1. The vectorizer checks for alignment and aliasing and generates
3519 a condition that allows dynamic vectorization. A cost model
3520 check is ANDED with the versioning condition. Hence scalar code
3521 path now has the added cost of the versioning check.
3523 if (cost > th & versioning_check)
3524 jmp to vector code
3526 Hence the run-time scalar cost is incremented by a not-taken branch cost.
3528 2. The vectorizer then checks if a prologue is required. If the
3529 cost model check was not done before during versioning, it has to
3530 be done before the prologue check.
3532 if (cost <= th)
3533 prologue = scalar_iters
3534 if (prologue == 0)
3535 jmp to vector code
3536 else
3537 execute prologue
3538 if (prologue == num_iters)
3539 go to exit
3541 Hence the run-time scalar cost is incremented by a taken branch,
3542 plus a not-taken branch, plus a taken branch cost.
3544 3. The vectorizer then checks if an epilogue is required. If the
3545 cost model check was not done before during prologue check, it
3546 has to be done with the epilogue check.
3548 if (prologue == 0)
3549 jmp to vector code
3550 else
3551 execute prologue
3552 if (prologue == num_iters)
3553 go to exit
3554 vector code:
3555 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3556 jmp to epilogue
3558 Hence the run-time scalar cost should be incremented by 2 taken
3559 branches.
3561 TODO: The back end may reorder the BBS's differently and reverse
3562 conditions/branch directions. Change the estimates below to
3563 something more reasonable. */
3565 /* If the number of iterations is known and we do not do versioning, we can
3566 decide whether to vectorize at compile time. Hence the scalar version
3567 does not carry cost model guard costs. */
3568 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3569 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3571 /* Cost model check occurs at versioning. */
3572 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3573 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3574 else
3576 /* Cost model check occurs at prologue generation. */
3577 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3578 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3579 + vect_get_stmt_cost (cond_branch_not_taken);
3580 /* Cost model check occurs at epilogue generation. */
3581 else
3582 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3586 /* Complete the target-specific cost calculations. */
3587 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3588 &vec_inside_cost, &vec_epilogue_cost);
3590 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3592 if (dump_enabled_p ())
3594 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3595 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3596 vec_inside_cost);
3597 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3598 vec_prologue_cost);
3599 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3600 vec_epilogue_cost);
3601 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3602 scalar_single_iter_cost);
3603 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3604 scalar_outside_cost);
3605 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3606 vec_outside_cost);
3607 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3608 peel_iters_prologue);
3609 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3610 peel_iters_epilogue);
3613 /* Calculate number of iterations required to make the vector version
3614 profitable, relative to the loop bodies only. The following condition
3615 must hold true:
3616 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3617 where
3618 SIC = scalar iteration cost, VIC = vector iteration cost,
3619 VOC = vector outside cost, VF = vectorization factor,
3620 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations
3621 SOC = scalar outside cost for run time cost model check. */
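/* A worked example with arbitrary costs: SIC = 4, VIC = 6, VOC = 20,
SOC = 6, VF = 4 and no peeling gives ((20 - 6) * 4) / (4 * 4 - 6) = 5,
which the correction below bumps to 6; the threshold is then raised to
at least VF plus the prologue peel iterations further below. */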
3623 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3625 if (vec_outside_cost <= 0)
3626 min_profitable_iters = 0;
3627 else
3629 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3630 - vec_inside_cost * peel_iters_prologue
3631 - vec_inside_cost * peel_iters_epilogue)
3632 / ((scalar_single_iter_cost * vf)
3633 - vec_inside_cost);
3635 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3636 <= (((int) vec_inside_cost * min_profitable_iters)
3637 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3638 min_profitable_iters++;
3641 /* vector version will never be profitable. */
3642 else
3644 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3645 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3646 "did not happen for a simd loop");
3648 if (dump_enabled_p ())
3649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3650 "cost model: the vector iteration cost = %d "
3651 "divided by the scalar iteration cost = %d "
3652 "is greater or equal to the vectorization factor = %d"
3653 ".\n",
3654 vec_inside_cost, scalar_single_iter_cost, vf);
3655 *ret_min_profitable_niters = -1;
3656 *ret_min_profitable_estimate = -1;
3657 return;
3660 dump_printf (MSG_NOTE,
3661 " Calculated minimum iters for profitability: %d\n",
3662 min_profitable_iters);
3664 /* We want the vectorized loop to execute at least once. */
3665 if (min_profitable_iters < (vf + peel_iters_prologue))
3666 min_profitable_iters = vf + peel_iters_prologue;
3668 if (dump_enabled_p ())
3669 dump_printf_loc (MSG_NOTE, vect_location,
3670 " Runtime profitability threshold = %d\n",
3671 min_profitable_iters);
3673 *ret_min_profitable_niters = min_profitable_iters;
3675 /* Calculate number of iterations required to make the vector version
3676 profitable, relative to the loop bodies only.
3678 The cost of the non-vectorized variant is SIC * niters and it must win over
3679 the vector variant on the expected loop trip count. The following condition must hold true:
3680 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3682 if (vec_outside_cost <= 0)
3683 min_profitable_estimate = 0;
3684 else
3686 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3687 - vec_inside_cost * peel_iters_prologue
3688 - vec_inside_cost * peel_iters_epilogue)
3689 / ((scalar_single_iter_cost * vf)
3690 - vec_inside_cost);
3692 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3693 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_NOTE, vect_location,
3695 " Static estimate profitability threshold = %d\n",
3696 min_profitable_estimate);
3698 *ret_min_profitable_estimate = min_profitable_estimate;
3701 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3702 vector elements (not bits) for a vector with NELT elements. */
3703 static void
3704 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3705 vec_perm_indices *sel)
3707 unsigned int i;
3709 for (i = 0; i < nelt; i++)
3710 sel->quick_push ((i + offset) & (2 * nelt - 1));
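/* For instance, OFFSET = 1 and NELT = 4 produces the selector
{1, 2, 3, 4}: elements 1..3 of the first input followed by element 0 of
the second, i.e. a whole-vector shift by one element. */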
3713 /* Checks whether the target supports whole-vector shifts for vectors of mode
3714 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3715 it supports vec_perm_const with masks for all necessary shift amounts. */
3716 static bool
3717 have_whole_vector_shift (machine_mode mode)
3719 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3720 return true;
3722 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3723 return false;
3725 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3726 auto_vec_perm_indices sel (nelt);
3728 for (i = nelt/2; i >= 1; i/=2)
3730 sel.truncate (0);
3731 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3732 if (!can_vec_perm_p (mode, false, &sel))
3733 return false;
3735 return true;
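/* For example, for a vector mode with eight elements this checks that shifts
   by 4, 2 and 1 elements are supported as constant permutations - exactly
   the shift amounts the reduction epilogue below generates.  */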
3738 /* TODO: There is a close dependency between the vect_model_*_cost and the
3739 vectorizable_* functions. Design this better to avoid maintenance issues. */
3741 /* Function vect_model_reduction_cost.
3743 Models cost for a reduction operation, including the vector ops
3744 generated within the strip-mine loop, the initial definition before
3745 the loop, and the epilogue code that must be generated. */
3747 static void
3748 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
3749 int ncopies)
3751 int prologue_cost = 0, epilogue_cost = 0;
3752 enum tree_code code;
3753 optab optab;
3754 tree vectype;
3755 gimple *orig_stmt;
3756 machine_mode mode;
3757 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3758 struct loop *loop = NULL;
3759 void *target_cost_data;
3761 if (loop_vinfo)
3763 loop = LOOP_VINFO_LOOP (loop_vinfo);
3764 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3766 else
3767 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3769 /* Condition reductions generate two reductions in the loop. */
3770 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3771 ncopies *= 2;
3773 /* Cost of reduction op inside loop. */
3774 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3775 stmt_info, 0, vect_body);
3777 vectype = STMT_VINFO_VECTYPE (stmt_info);
3778 mode = TYPE_MODE (vectype);
3779 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3781 if (!orig_stmt)
3782 orig_stmt = STMT_VINFO_STMT (stmt_info);
3784 code = gimple_assign_rhs_code (orig_stmt);
3786 /* Add in cost for initial definition.
3787 For cond reduction we have four vectors: initial index, step, initial
3788 result of the data reduction, initial value of the index reduction. */
3789 int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3790 == COND_REDUCTION ? 4 : 1;
3791 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3792 scalar_to_vec, stmt_info, 0,
3793 vect_prologue);
3795 /* Determine cost of epilogue code.
3797 We have a reduction operator that will reduce the vector in one statement.
3798 Also requires scalar extract. */
3800 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3802 if (reduc_fn != IFN_LAST)
3804 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3806 /* An EQ stmt and a COND_EXPR stmt. */
3807 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3808 vector_stmt, stmt_info, 0,
3809 vect_epilogue);
3810 /* Reduction of the max index and a reduction of the found
3811 values. */
3812 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3813 vec_to_scalar, stmt_info, 0,
3814 vect_epilogue);
3815 /* A broadcast of the max value. */
3816 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3817 scalar_to_vec, stmt_info, 0,
3818 vect_epilogue);
3820 else
3822 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3823 stmt_info, 0, vect_epilogue);
3824 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3825 vec_to_scalar, stmt_info, 0,
3826 vect_epilogue);
3829 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3831 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
3832 /* Extraction of scalar elements. */
3833 epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits,
3834 vec_to_scalar, stmt_info, 0,
3835 vect_epilogue);
3836 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3837 epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits - 3,
3838 scalar_stmt, stmt_info, 0,
3839 vect_epilogue);
3841 else
3843 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3844 tree bitsize =
3845 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3846 int element_bitsize = tree_to_uhwi (bitsize);
3847 int nelements = vec_size_in_bits / element_bitsize;
3849 if (code == COND_EXPR)
3850 code = MAX_EXPR;
3852 optab = optab_for_tree_code (code, vectype, optab_default);
3854 /* We have a whole vector shift available. */
3855 if (optab != unknown_optab
3856 && VECTOR_MODE_P (mode)
3857 && optab_handler (optab, mode) != CODE_FOR_nothing
3858 && have_whole_vector_shift (mode))
3860 /* Final reduction via vector shifts and the reduction operator.
3861 Also requires scalar extract. */
3862 epilogue_cost += add_stmt_cost (target_cost_data,
3863 exact_log2 (nelements) * 2,
3864 vector_stmt, stmt_info, 0,
3865 vect_epilogue);
3866 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3867 vec_to_scalar, stmt_info, 0,
3868 vect_epilogue);
3870 else
3871 /* Use extracts and reduction op for final reduction. For N
3872 elements, we have N extracts and N-1 reduction ops. */
3873 epilogue_cost += add_stmt_cost (target_cost_data,
3874 nelements + nelements - 1,
3875 vector_stmt, stmt_info, 0,
3876 vect_epilogue);
3880 if (dump_enabled_p ())
3881 dump_printf (MSG_NOTE,
3882 "vect_model_reduction_cost: inside_cost = %d, "
3883 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3884 prologue_cost, epilogue_cost);
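/* As an illustration of the accounting above: for a COND_REDUCTION with
   NCOPIES == 1 and a direct reduc_fn available, this records 2 vector_stmt
   in the loop body (NCOPIES is doubled), 4 scalar_to_vec in the prologue,
   and 2 vector_stmt + 2 vec_to_scalar + 1 scalar_to_vec in the epilogue;
   the target's add_stmt_cost hook assigns the actual costs.  */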
3888 /* Function vect_model_induction_cost.
3890 Models cost for induction operations. */
3892 static void
3893 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3895 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3896 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3897 unsigned inside_cost, prologue_cost;
3899 if (PURE_SLP_STMT (stmt_info))
3900 return;
3902 /* loop cost for vec_loop. */
3903 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3904 stmt_info, 0, vect_body);
3906 /* prologue cost for vec_init and vec_step. */
3907 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3908 stmt_info, 0, vect_prologue);
3910 if (dump_enabled_p ())
3911 dump_printf_loc (MSG_NOTE, vect_location,
3912 "vect_model_induction_cost: inside_cost = %d, "
3913 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3918 /* Function get_initial_def_for_reduction
3920 Input:
3921 STMT - a stmt that performs a reduction operation in the loop.
3922 INIT_VAL - the initial value of the reduction variable
3924 Output:
3925 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3926 of the reduction (used for adjusting the epilog - see below).
3927 Return a vector variable, initialized according to the operation that STMT
3928 performs. This vector will be used as the initial value of the
3929 vector of partial results.
3931 Option1 (adjust in epilog): Initialize the vector as follows:
3932 add/bit or/xor: [0,0,...,0,0]
3933 mult/bit and: [1,1,...,1,1]
3934 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3935 and when necessary (e.g. add/mult case) let the caller know
3936 that it needs to adjust the result by init_val.
3938 Option2: Initialize the vector as follows:
3939 add/bit or/xor: [init_val,0,0,...,0]
3940 mult/bit and: [init_val,1,1,...,1]
3941 min/max/cond_expr: [init_val,init_val,...,init_val]
3942 and no adjustments are needed.
3944 For example, for the following code:
3946 s = init_val;
3947 for (i=0;i<n;i++)
3948 s = s + a[i];
3950 STMT is 's = s + a[i]', and the reduction variable is 's'.
3951 For a vector of 4 units, we want to return either [0,0,0,init_val],
3952 or [0,0,0,0] and let the caller know that it needs to adjust
3953 the result at the end by 'init_val'.
3955 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3956 is not NULL, because this way the initialization vector is simpler (same
3957 element in all entries), and Option2 otherwise.
3959 A cost model should help decide between these two schemes. */
3961 tree
3962 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3963 tree *adjustment_def)
3965 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3966 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3967 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3968 tree scalar_type = TREE_TYPE (init_val);
3969 tree vectype = get_vectype_for_scalar_type (scalar_type);
3970 int nunits;
3971 enum tree_code code = gimple_assign_rhs_code (stmt);
3972 tree def_for_init;
3973 tree init_def;
3974 int i;
3975 bool nested_in_vect_loop = false;
3976 REAL_VALUE_TYPE real_init_val = dconst0;
3977 int int_init_val = 0;
3978 gimple *def_stmt = NULL;
3979 gimple_seq stmts = NULL;
3981 gcc_assert (vectype);
3982 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3984 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3985 || SCALAR_FLOAT_TYPE_P (scalar_type));
3987 if (nested_in_vect_loop_p (loop, stmt))
3988 nested_in_vect_loop = true;
3989 else
3990 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3992 /* In case of double reduction we only create a vector variable to be put
3993 in the reduction phi node. The actual statement creation is done in
3994 vect_create_epilog_for_reduction. */
3995 if (adjustment_def && nested_in_vect_loop
3996 && TREE_CODE (init_val) == SSA_NAME
3997 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3998 && gimple_code (def_stmt) == GIMPLE_PHI
3999 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4000 && vinfo_for_stmt (def_stmt)
4001 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4002 == vect_double_reduction_def)
4004 *adjustment_def = NULL;
4005 return vect_create_destination_var (init_val, vectype);
4008 /* In case of a nested reduction do not use an adjustment def, as
4009 that case is not handled correctly by the epilogue generation
4010 when ncopies is not one. */
4011 if (adjustment_def && nested_in_vect_loop)
4013 *adjustment_def = NULL;
4014 return vect_get_vec_def_for_operand (init_val, stmt);
4017 switch (code)
4019 case WIDEN_SUM_EXPR:
4020 case DOT_PROD_EXPR:
4021 case SAD_EXPR:
4022 case PLUS_EXPR:
4023 case MINUS_EXPR:
4024 case BIT_IOR_EXPR:
4025 case BIT_XOR_EXPR:
4026 case MULT_EXPR:
4027 case BIT_AND_EXPR:
4029 /* ADJUSTMENT_DEF is NULL when called from
4030 vect_create_epilog_for_reduction to vectorize double reduction. */
4031 if (adjustment_def)
4032 *adjustment_def = init_val;
4034 if (code == MULT_EXPR)
4036 real_init_val = dconst1;
4037 int_init_val = 1;
4040 if (code == BIT_AND_EXPR)
4041 int_init_val = -1;
4043 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4044 def_for_init = build_real (scalar_type, real_init_val);
4045 else
4046 def_for_init = build_int_cst (scalar_type, int_init_val);
4048 if (adjustment_def)
4049 /* Option1: the first element is '0' or '1' as well. */
4050 init_def = gimple_build_vector_from_val (&stmts, vectype,
4051 def_for_init);
4052 else
4054 /* Option2: the first element is INIT_VAL. */
4055 auto_vec<tree, 32> elts (nunits);
4056 elts.quick_push (init_val);
4057 for (i = 1; i < nunits; ++i)
4058 elts.quick_push (def_for_init);
4059 init_def = gimple_build_vector (&stmts, vectype, elts);
4062 break;
4064 case MIN_EXPR:
4065 case MAX_EXPR:
4066 case COND_EXPR:
4068 if (adjustment_def)
4070 *adjustment_def = NULL_TREE;
4071 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4073 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4074 break;
4077 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4078 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4080 break;
4082 default:
4083 gcc_unreachable ();
4086 if (stmts)
4087 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4088 return init_def;
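/* As a concrete illustration: for a multiplication reduction with
   init_val == 5 and a four-element vector, Option1 returns {1, 1, 1, 1} and
   sets *ADJUSTMENT_DEF to 5, so the epilogue multiplies the reduced result
   by 5; Option2 returns {5, 1, 1, 1} and needs no adjustment.  */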
4091 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4092 NUMBER_OF_VECTORS is the number of vector defs to create. */
4094 static void
4095 get_initial_defs_for_reduction (slp_tree slp_node,
4096 vec<tree> *vec_oprnds,
4097 unsigned int number_of_vectors,
4098 enum tree_code code, bool reduc_chain)
4100 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4101 gimple *stmt = stmts[0];
4102 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4103 unsigned nunits;
4104 unsigned j, number_of_places_left_in_vector;
4105 tree vector_type, scalar_type;
4106 tree vop;
4107 int group_size = stmts.length ();
4108 unsigned int vec_num, i;
4109 unsigned number_of_copies = 1;
4110 vec<tree> voprnds;
4111 voprnds.create (number_of_vectors);
4112 tree neutral_op = NULL;
4113 struct loop *loop;
4115 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4116 scalar_type = TREE_TYPE (vector_type);
4117 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4119 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4121 loop = (gimple_bb (stmt))->loop_father;
4122 gcc_assert (loop);
4123 edge pe = loop_preheader_edge (loop);
4125 /* op is the reduction operand of the first stmt already. */
4126 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4127 we need either neutral operands or the original operands. See
4128 get_initial_def_for_reduction() for details. */
4129 switch (code)
4131 case WIDEN_SUM_EXPR:
4132 case DOT_PROD_EXPR:
4133 case SAD_EXPR:
4134 case PLUS_EXPR:
4135 case MINUS_EXPR:
4136 case BIT_IOR_EXPR:
4137 case BIT_XOR_EXPR:
4138 neutral_op = build_zero_cst (scalar_type);
4139 break;
4141 case MULT_EXPR:
4142 neutral_op = build_one_cst (scalar_type);
4143 break;
4145 case BIT_AND_EXPR:
4146 neutral_op = build_all_ones_cst (scalar_type);
4147 break;
4149 /* For MIN/MAX we don't have an easy neutral operand but
4150 the initial values can be used fine here. Only for
4151 a reduction chain we have to force a neutral element. */
4152 case MAX_EXPR:
4153 case MIN_EXPR:
4154 if (! reduc_chain)
4155 neutral_op = NULL;
4156 else
4157 neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4158 break;
4160 default:
4161 gcc_assert (! reduc_chain);
4162 neutral_op = NULL;
4165 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4166 created vectors. It is greater than 1 if unrolling is performed.
4168 For example, we have two scalar operands, s1 and s2 (e.g., group of
4169 strided accesses of size two), while NUNITS is four (i.e., four scalars
4170 of this type can be packed in a vector). The output vector will contain
4171 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4172 will be 2).
4174 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4175 containing the operands.
4177 For example, NUNITS is four as before, and the group size is 8
4178 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4179 {s5, s6, s7, s8}. */
4181 number_of_copies = nunits * number_of_vectors / group_size;
4183 number_of_places_left_in_vector = nunits;
4184 auto_vec<tree, 32> elts (nunits);
4185 elts.quick_grow (nunits);
4186 for (j = 0; j < number_of_copies; j++)
4188 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4190 tree op;
4191 /* Get the def before the loop. In reduction chain we have only
4192 one initial value. */
4193 if ((j != (number_of_copies - 1)
4194 || (reduc_chain && i != 0))
4195 && neutral_op)
4196 op = neutral_op;
4197 else
4198 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4200 /* Create 'vect_ = {op0,op1,...,opn}'. */
4201 number_of_places_left_in_vector--;
4202 elts[number_of_places_left_in_vector] = op;
4204 if (number_of_places_left_in_vector == 0)
4206 gimple_seq ctor_seq = NULL;
4207 tree init = gimple_build_vector (&ctor_seq, vector_type, elts);
4208 if (ctor_seq != NULL)
4209 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4210 voprnds.quick_push (init);
4212 number_of_places_left_in_vector = nunits;
4217 /* Since the vectors are created in the reverse order, we should invert
4218 them. */
4219 vec_num = voprnds.length ();
4220 for (j = vec_num; j != 0; j--)
4222 vop = voprnds[j - 1];
4223 vec_oprnds->quick_push (vop);
4226 voprnds.release ();
4228 /* In case that VF is greater than the unrolling factor needed for the SLP
4229 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4230 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4231 to replicate the vectors. */
4232 tree neutral_vec = NULL;
4233 while (number_of_vectors > vec_oprnds->length ())
4235 if (neutral_op)
4237 if (!neutral_vec)
4239 gimple_seq ctor_seq = NULL;
4240 neutral_vec = gimple_build_vector_from_val
4241 (&ctor_seq, vector_type, neutral_op);
4242 if (ctor_seq != NULL)
4243 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4245 vec_oprnds->quick_push (neutral_vec);
4247 else
4249 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4250 vec_oprnds->quick_push (vop);
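/* As an illustration: for two independent SLP sum reductions
   (GROUP_SIZE == 2) with NUNITS == 4 and a single vector to create,
   NUMBER_OF_COPIES is 2 and the lanes are filled back to front, so the
   initial vector is {init_a, init_b, 0, 0} - the real initial values in the
   first GROUP_SIZE lanes and the neutral element (0 for PLUS) in the rest.  */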
4256 /* Function vect_create_epilog_for_reduction
4258 Create code at the loop-epilog to finalize the result of a reduction
4259 computation.
4261 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4262 reduction statements.
4263 STMT is the scalar reduction stmt that is being vectorized.
4264 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4265 number of elements that we can fit in a vectype (nunits). In this case
4266 we have to generate more than one vector stmt - i.e. we need to "unroll"
4267 the vector stmt by a factor VF/nunits. For more details see documentation
4268 in vectorizable_operation.
4269 REDUC_FN is the internal function for the epilog reduction.
4270 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4271 computation.
4272 REDUC_INDEX is the index of the operand in the right hand side of the
4273 statement that is defined by REDUCTION_PHI.
4274 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4275 SLP_NODE is an SLP node containing a group of reduction statements. The
4276 first one in this group is STMT.
4278 This function:
4279 1. Creates the reduction def-use cycles: sets the arguments for
4280 REDUCTION_PHIS:
4281 The loop-entry argument is the vectorized initial-value of the reduction.
4282 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4283 sums.
4284 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4285 by calling the function specified by REDUC_FN if available, or by
4286 other means (whole-vector shifts or a scalar loop).
4287 The function also creates a new phi node at the loop exit to preserve
4288 loop-closed form, as illustrated below.
4290 The flow at the entry to this function:
4292 loop:
4293 vec_def = phi <null, null> # REDUCTION_PHI
4294 VECT_DEF = vector_stmt # vectorized form of STMT
4295 s_loop = scalar_stmt # (scalar) STMT
4296 loop_exit:
4297 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4298 use <s_out0>
4299 use <s_out0>
4301 The above is transformed by this function into:
4303 loop:
4304 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4305 VECT_DEF = vector_stmt # vectorized form of STMT
4306 s_loop = scalar_stmt # (scalar) STMT
4307 loop_exit:
4308 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4309 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4310 v_out2 = reduce <v_out1>
4311 s_out3 = extract_field <v_out2, 0>
4312 s_out4 = adjust_result <s_out3>
4313 use <s_out4>
4314 use <s_out4>
4317 static void
4318 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4319 gimple *reduc_def_stmt,
4320 int ncopies, internal_fn reduc_fn,
4321 vec<gimple *> reduction_phis,
4322 bool double_reduc,
4323 slp_tree slp_node,
4324 slp_instance slp_node_instance)
4326 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4327 stmt_vec_info prev_phi_info;
4328 tree vectype;
4329 machine_mode mode;
4330 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4331 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4332 basic_block exit_bb;
4333 tree scalar_dest;
4334 tree scalar_type;
4335 gimple *new_phi = NULL, *phi;
4336 gimple_stmt_iterator exit_gsi;
4337 tree vec_dest;
4338 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4339 gimple *epilog_stmt = NULL;
4340 enum tree_code code = gimple_assign_rhs_code (stmt);
4341 gimple *exit_phi;
4342 tree bitsize;
4343 tree adjustment_def = NULL;
4344 tree vec_initial_def = NULL;
4345 tree expr, def, initial_def = NULL;
4346 tree orig_name, scalar_result;
4347 imm_use_iterator imm_iter, phi_imm_iter;
4348 use_operand_p use_p, phi_use_p;
4349 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4350 bool nested_in_vect_loop = false;
4351 auto_vec<gimple *> new_phis;
4352 auto_vec<gimple *> inner_phis;
4353 enum vect_def_type dt = vect_unknown_def_type;
4354 int j, i;
4355 auto_vec<tree> scalar_results;
4356 unsigned int group_size = 1, k, ratio;
4357 auto_vec<tree> vec_initial_defs;
4358 auto_vec<gimple *> phis;
4359 bool slp_reduc = false;
4360 tree new_phi_result;
4361 gimple *inner_phi = NULL;
4362 tree induction_index = NULL_TREE;
4364 if (slp_node)
4365 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4367 if (nested_in_vect_loop_p (loop, stmt))
4369 outer_loop = loop;
4370 loop = loop->inner;
4371 nested_in_vect_loop = true;
4372 gcc_assert (!slp_node);
4375 vectype = STMT_VINFO_VECTYPE (stmt_info);
4376 gcc_assert (vectype);
4377 mode = TYPE_MODE (vectype);
4379 /* 1. Create the reduction def-use cycle:
4380 Set the arguments of REDUCTION_PHIS, i.e., transform
4382 loop:
4383 vec_def = phi <null, null> # REDUCTION_PHI
4384 VECT_DEF = vector_stmt # vectorized form of STMT
4387 into:
4389 loop:
4390 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4391 VECT_DEF = vector_stmt # vectorized form of STMT
4394 (in case of SLP, do it for all the phis). */
4396 /* Get the loop-entry arguments. */
4397 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4398 if (slp_node)
4400 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4401 vec_initial_defs.reserve (vec_num);
4402 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4403 &vec_initial_defs, vec_num, code,
4404 GROUP_FIRST_ELEMENT (stmt_info));
4406 else
4408 /* Get at the scalar def before the loop, that defines the initial value
4409 of the reduction variable. */
4410 gimple *def_stmt;
4411 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4412 loop_preheader_edge (loop));
4413 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4414 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4415 &adjustment_def);
4416 vec_initial_defs.create (1);
4417 vec_initial_defs.quick_push (vec_initial_def);
4420 /* Set phi nodes arguments. */
4421 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4423 tree vec_init_def = vec_initial_defs[i];
4424 tree def = vect_defs[i];
4425 for (j = 0; j < ncopies; j++)
4427 if (j != 0)
4429 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4430 if (nested_in_vect_loop)
4431 vec_init_def
4432 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4433 vec_init_def);
4436 /* Set the loop-entry arg of the reduction-phi. */
4438 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4439 == INTEGER_INDUC_COND_REDUCTION)
4441 /* Initialise the reduction phi to zero. This prevents non-zero
4442 initial values from interfering with the reduction op. */
4443 gcc_assert (ncopies == 1);
4444 gcc_assert (i == 0);
4446 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4447 tree zero_vec = build_zero_cst (vec_init_def_type);
4449 add_phi_arg (as_a <gphi *> (phi), zero_vec,
4450 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4452 else
4453 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4454 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4456 /* Set the loop-latch arg for the reduction-phi. */
4457 if (j > 0)
4458 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4460 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4461 UNKNOWN_LOCATION);
4463 if (dump_enabled_p ())
4465 dump_printf_loc (MSG_NOTE, vect_location,
4466 "transform reduction: created def-use cycle: ");
4467 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4468 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4473 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4474 which is updated with the current index of the loop for every match of
4475 the original loop's cond_expr (VEC_STMT). This results in a vector
4476 containing the last time the condition passed for that vector lane.
4477 The first match will be a 1 to allow 0 to be used for non-matching
4478 indexes. If there are no matches at all then the vector will be all
4479 zeroes. */
4480 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4482 tree indx_before_incr, indx_after_incr;
4483 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4484 int k;
4486 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4487 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4489 int scalar_precision
4490 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4491 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4492 tree cr_index_vector_type = build_vector_type
4493 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4495 /* First we create a simple vector induction variable which starts
4496 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4497 vector size (STEP). */
4499 /* Create a {1,2,3,...} vector. */
4500 auto_vec<tree, 32> vtemp (nunits_out);
4501 for (k = 0; k < nunits_out; ++k)
4502 vtemp.quick_push (build_int_cst (cr_index_scalar_type, k + 1));
4503 tree series_vect = build_vector (cr_index_vector_type, vtemp);
4505 /* Create a vector of the step value. */
4506 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4507 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4509 /* Create an induction variable. */
4510 gimple_stmt_iterator incr_gsi;
4511 bool insert_after;
4512 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4513 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4514 insert_after, &indx_before_incr, &indx_after_incr);
4516 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4517 filled with zeros (VEC_ZERO). */
4519 /* Create a vector of 0s. */
4520 tree zero = build_zero_cst (cr_index_scalar_type);
4521 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4523 /* Create a vector phi node. */
4524 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4525 new_phi = create_phi_node (new_phi_tree, loop->header);
4526 set_vinfo_for_stmt (new_phi,
4527 new_stmt_vec_info (new_phi, loop_vinfo));
4528 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4529 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4531 /* Now take the condition from the loop's original cond_expr
4532 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4533 every match uses values from the induction variable
4534 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4535 (NEW_PHI_TREE).
4536 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4537 the new cond_expr (INDEX_COND_EXPR). */
4539 /* Duplicate the condition from vec_stmt. */
4540 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4542 /* Create a conditional, where the condition is taken from vec_stmt
4543 (CCOMPARE), the 'then' value is the induction index (INDEX_BEFORE_INCR)
4544 and the 'else' value is the phi (NEW_PHI_TREE). */
4545 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4546 ccompare, indx_before_incr,
4547 new_phi_tree);
4548 induction_index = make_ssa_name (cr_index_vector_type);
4549 gimple *index_condition = gimple_build_assign (induction_index,
4550 index_cond_expr);
4551 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4552 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4553 loop_vinfo);
4554 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4555 set_vinfo_for_stmt (index_condition, index_vec_info);
4557 /* Update the phi with the vec cond. */
4558 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4559 loop_latch_edge (loop), UNKNOWN_LOCATION);
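/* As an illustration: for a four-lane vector the induction variable starts
   at {1, 2, 3, 4} and steps by {4, 4, 4, 4}.  If the condition matches only
   in lanes 0 and 2 of the first vector iteration, the index vector becomes
   {1, 0, 3, 0} and stays that way, i.e. it records for each lane the 1-based
   scalar iteration of its last match, or 0 if the lane never matched.  */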
4562 /* 2. Create epilog code.
4563 The reduction epilog code operates across the elements of the vector
4564 of partial results computed by the vectorized loop.
4565 The reduction epilog code consists of:
4567 step 1: compute the scalar result in a vector (v_out2)
4568 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4569 step 3: adjust the scalar result (s_out3) if needed.
4571 Step 1 can be accomplished using one of the following three schemes:
4572 (scheme 1) using reduc_fn, if available.
4573 (scheme 2) using whole-vector shifts, if available.
4574 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4575 combined.
4577 The overall epilog code looks like this:
4579 s_out0 = phi <s_loop> # original EXIT_PHI
4580 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4581 v_out2 = reduce <v_out1> # step 1
4582 s_out3 = extract_field <v_out2, 0> # step 2
4583 s_out4 = adjust_result <s_out3> # step 3
4585 (step 3 is optional, and steps 1 and 2 may be combined).
4586 Lastly, the uses of s_out0 are replaced by s_out4. */
4589 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4590 v_out1 = phi <VECT_DEF>
4591 Store them in NEW_PHIS. */
4593 exit_bb = single_exit (loop)->dest;
4594 prev_phi_info = NULL;
4595 new_phis.create (vect_defs.length ());
4596 FOR_EACH_VEC_ELT (vect_defs, i, def)
4598 for (j = 0; j < ncopies; j++)
4600 tree new_def = copy_ssa_name (def);
4601 phi = create_phi_node (new_def, exit_bb);
4602 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4603 if (j == 0)
4604 new_phis.quick_push (phi);
4605 else
4607 def = vect_get_vec_def_for_stmt_copy (dt, def);
4608 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4611 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4612 prev_phi_info = vinfo_for_stmt (phi);
4616 /* The epilogue is created for the outer-loop, i.e., for the loop being
4617 vectorized. Create exit phis for the outer loop. */
4618 if (double_reduc)
4620 loop = outer_loop;
4621 exit_bb = single_exit (loop)->dest;
4622 inner_phis.create (vect_defs.length ());
4623 FOR_EACH_VEC_ELT (new_phis, i, phi)
4625 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4626 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4627 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4628 PHI_RESULT (phi));
4629 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4630 loop_vinfo));
4631 inner_phis.quick_push (phi);
4632 new_phis[i] = outer_phi;
4633 prev_phi_info = vinfo_for_stmt (outer_phi);
4634 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4636 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4637 new_result = copy_ssa_name (PHI_RESULT (phi));
4638 outer_phi = create_phi_node (new_result, exit_bb);
4639 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4640 PHI_RESULT (phi));
4641 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4642 loop_vinfo));
4643 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4644 prev_phi_info = vinfo_for_stmt (outer_phi);
4649 exit_gsi = gsi_after_labels (exit_bb);
4651 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4652 (i.e. when reduc_fn is not available) and in the final adjustment
4653 code (if needed). Also get the original scalar reduction variable as
4654 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4655 represents a reduction pattern), the tree-code and scalar-def are
4656 taken from the original stmt that the pattern-stmt (STMT) replaces.
4657 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4658 are taken from STMT. */
4660 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4661 if (!orig_stmt)
4663 /* Regular reduction */
4664 orig_stmt = stmt;
4666 else
4668 /* Reduction pattern */
4669 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4670 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4671 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4674 code = gimple_assign_rhs_code (orig_stmt);
4675 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4676 partial results are added and not subtracted. */
4677 if (code == MINUS_EXPR)
4678 code = PLUS_EXPR;
4680 scalar_dest = gimple_assign_lhs (orig_stmt);
4681 scalar_type = TREE_TYPE (scalar_dest);
4682 scalar_results.create (group_size);
4683 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4684 bitsize = TYPE_SIZE (scalar_type);
4686 /* In case this is a reduction in an inner-loop while vectorizing an outer
4687 loop - we don't need to extract a single scalar result at the end of the
4688 inner-loop (unless it is double reduction, i.e., the use of reduction is
4689 outside the outer-loop). The final vector of partial results will be used
4690 in the vectorized outer-loop, or reduced to a scalar result at the end of
4691 the outer-loop. */
4692 if (nested_in_vect_loop && !double_reduc)
4693 goto vect_finalize_reduction;
4695 /* SLP reduction without reduction chain, e.g.,
4696 # a1 = phi <a2, a0>
4697 # b1 = phi <b2, b0>
4698 a2 = operation (a1)
4699 b2 = operation (b1) */
4700 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4702 /* In case of reduction chain, e.g.,
4703 # a1 = phi <a3, a0>
4704 a2 = operation (a1)
4705 a3 = operation (a2),
4707 we may end up with more than one vector result. Here we reduce them to
4708 one vector. */
4709 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4711 tree first_vect = PHI_RESULT (new_phis[0]);
4712 gassign *new_vec_stmt = NULL;
4713 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4714 for (k = 1; k < new_phis.length (); k++)
4716 gimple *next_phi = new_phis[k];
4717 tree second_vect = PHI_RESULT (next_phi);
4718 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4719 new_vec_stmt = gimple_build_assign (tem, code,
4720 first_vect, second_vect);
4721 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4722 first_vect = tem;
4725 new_phi_result = first_vect;
4726 if (new_vec_stmt)
4728 new_phis.truncate (0);
4729 new_phis.safe_push (new_vec_stmt);
4732 /* Likewise if we couldn't use a single def-use cycle. */
4733 else if (ncopies > 1)
4735 gcc_assert (new_phis.length () == 1);
4736 tree first_vect = PHI_RESULT (new_phis[0]);
4737 gassign *new_vec_stmt = NULL;
4738 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4739 gimple *next_phi = new_phis[0];
4740 for (int k = 1; k < ncopies; ++k)
4742 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4743 tree second_vect = PHI_RESULT (next_phi);
4744 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4745 new_vec_stmt = gimple_build_assign (tem, code,
4746 first_vect, second_vect);
4747 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4748 first_vect = tem;
4750 new_phi_result = first_vect;
4751 new_phis.truncate (0);
4752 new_phis.safe_push (new_vec_stmt);
4754 else
4755 new_phi_result = PHI_RESULT (new_phis[0]);
4757 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4758 && reduc_fn != IFN_LAST)
4760 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4761 various data values where the condition matched and another vector
4762 (INDUCTION_INDEX) containing all the indexes of those matches. We
4763 need to extract the last matching index (which will be the index with
4764 highest value) and use this to index into the data vector.
4765 For the case where there were no matches, the data vector will contain
4766 all default values and the index vector will be all zeros. */
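/* For example, with data vector {d0, d1, d2, d3} and index vector
   {1, 0, 7, 0}: the MAX reduction of the indexes gives 7, the EQ compare
   selects only lane 2, the VEC_COND produces {0, 0, d2, 0}, and the unsigned
   MAX reduction of that yields d2 - the value from the last matching lane.
   If no lane ever matched, all indexes are 0, every lane compares equal, and
   the reduction returns the common default value.  */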
4768 /* Get various versions of the type of the vector of indexes. */
4769 tree index_vec_type = TREE_TYPE (induction_index);
4770 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4771 tree index_scalar_type = TREE_TYPE (index_vec_type);
4772 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4773 (index_vec_type);
4775 /* Get an unsigned integer version of the type of the data vector. */
4776 int scalar_precision
4777 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4778 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4779 tree vectype_unsigned = build_vector_type
4780 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4782 /* First we need to create a vector (ZERO_VEC) of zeros and another
4783 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4784 can create using a MAX reduction and then expanding.
4785 In the case where the loop never made any matches, the max index will
4786 be zero. */
4788 /* Vector of {0, 0, 0,...}. */
4789 tree zero_vec = make_ssa_name (vectype);
4790 tree zero_vec_rhs = build_zero_cst (vectype);
4791 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4792 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4794 /* Find maximum value from the vector of found indexes. */
4795 tree max_index = make_ssa_name (index_scalar_type);
4796 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4797 1, induction_index);
4798 gimple_call_set_lhs (max_index_stmt, max_index);
4799 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4801 /* Vector of {max_index, max_index, max_index,...}. */
4802 tree max_index_vec = make_ssa_name (index_vec_type);
4803 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4804 max_index);
4805 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4806 max_index_vec_rhs);
4807 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4809 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4810 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4811 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4812 otherwise. Only one value should match, resulting in a vector
4813 (VEC_COND) with one data value and the rest zeros.
4814 In the case where the loop never made any matches, every index will
4815 match, resulting in a vector with all data values (which will all be
4816 the default value). */
4818 /* Compare the max index vector to the vector of found indexes to find
4819 the position of the max value. */
4820 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4821 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4822 induction_index,
4823 max_index_vec);
4824 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4826 /* Use the compare to choose either values from the data vector or
4827 zero. */
4828 tree vec_cond = make_ssa_name (vectype);
4829 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4830 vec_compare, new_phi_result,
4831 zero_vec);
4832 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4834 /* Finally we need to extract the data value from the vector (VEC_COND)
4835 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4836 reduction, but because this doesn't exist, we can use a MAX reduction
4837 instead. The data value might be signed or a float, so we need to cast
4838 it first.
4839 In the case where the loop never made any matches, the data values are
4840 all identical, and so will reduce down correctly. */
4842 /* Make the matched data values unsigned. */
4843 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4844 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4845 vec_cond);
4846 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4847 VIEW_CONVERT_EXPR,
4848 vec_cond_cast_rhs);
4849 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4851 /* Reduce down to a scalar value. */
4852 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4853 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
4854 1, vec_cond_cast);
4855 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
4856 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4858 /* Convert the reduced value back to the result type and set as the
4859 result. */
4860 gimple_seq stmts = NULL;
4861 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4862 data_reduc);
4863 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4864 scalar_results.safe_push (new_temp);
4866 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4867 && reduc_fn == IFN_LAST)
4869 /* Condition reduction without supported IFN_REDUC_MAX. Generate
4870 the equivalent of:
4871 idx_val = induction_index[0];
4872 val = data_reduc[0];
4873 for (i = 1; i < nelts; ++i)
4874 if (induction_index[i] > idx_val)
4875 val = data_reduc[i], idx_val = induction_index[i];
4876 return val; */
4878 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4879 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4880 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4881 unsigned HOST_WIDE_INT v_size
4882 = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4883 tree idx_val = NULL_TREE, val = NULL_TREE;
4884 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4886 tree old_idx_val = idx_val;
4887 tree old_val = val;
4888 idx_val = make_ssa_name (idx_eltype);
4889 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4890 build3 (BIT_FIELD_REF, idx_eltype,
4891 induction_index,
4892 bitsize_int (el_size),
4893 bitsize_int (off)));
4894 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4895 val = make_ssa_name (data_eltype);
4896 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4897 build3 (BIT_FIELD_REF,
4898 data_eltype,
4899 new_phi_result,
4900 bitsize_int (el_size),
4901 bitsize_int (off)));
4902 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4903 if (off != 0)
4905 tree new_idx_val = idx_val;
4906 tree new_val = val;
4907 if (off != v_size - el_size)
4909 new_idx_val = make_ssa_name (idx_eltype);
4910 epilog_stmt = gimple_build_assign (new_idx_val,
4911 MAX_EXPR, idx_val,
4912 old_idx_val);
4913 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4915 new_val = make_ssa_name (data_eltype);
4916 epilog_stmt = gimple_build_assign (new_val,
4917 COND_EXPR,
4918 build2 (GT_EXPR,
4919 boolean_type_node,
4920 idx_val,
4921 old_idx_val),
4922 val, old_val);
4923 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4924 idx_val = new_idx_val;
4925 val = new_val;
4928 /* Convert the reduced value back to the result type and set as the
4929 result. */
4930 gimple_seq stmts = NULL;
4931 val = gimple_convert (&stmts, scalar_type, val);
4932 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4933 scalar_results.safe_push (val);
4936 /* 2.3 Create the reduction code, using one of the three schemes described
4937 above. In SLP we simply need to extract all the elements from the
4938 vector (without reducing them), so we use scalar shifts. */
4939 else if (reduc_fn != IFN_LAST && !slp_reduc)
4941 tree tmp;
4942 tree vec_elem_type;
4944 /* Case 1: Create:
4945 v_out2 = reduc_expr <v_out1> */
4947 if (dump_enabled_p ())
4948 dump_printf_loc (MSG_NOTE, vect_location,
4949 "Reduce using direct vector reduction.\n");
4951 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4952 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4954 tree tmp_dest
4955 = vect_create_destination_var (scalar_dest, vec_elem_type);
4956 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4957 new_phi_result);
4958 gimple_set_lhs (epilog_stmt, tmp_dest);
4959 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4960 gimple_set_lhs (epilog_stmt, new_temp);
4961 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4963 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
4964 new_temp);
4966 else
4968 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
4969 new_phi_result);
4970 gimple_set_lhs (epilog_stmt, new_scalar_dest);
4973 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4974 gimple_set_lhs (epilog_stmt, new_temp);
4975 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4977 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4978 == INTEGER_INDUC_COND_REDUCTION)
4980 /* Earlier we set the initial value to be zero. Check the result
4981 and if it is zero then replace with the original initial
4982 value. */
4983 tree zero = build_zero_cst (scalar_type);
4984 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4986 tmp = make_ssa_name (new_scalar_dest);
4987 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4988 initial_def, new_temp);
4989 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4990 new_temp = tmp;
4993 scalar_results.safe_push (new_temp);
4995 else
4997 bool reduce_with_shift = have_whole_vector_shift (mode);
4998 int element_bitsize = tree_to_uhwi (bitsize);
4999 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5000 tree vec_temp;
5002 /* COND reductions all do the final reduction with MAX_EXPR. */
5003 if (code == COND_EXPR)
5004 code = MAX_EXPR;
5006 /* Regardless of whether we have a whole vector shift, if we're
5007 emulating the operation via tree-vect-generic, we don't want
5008 to use it. Only the first round of the reduction is likely
5009 to still be profitable via emulation. */
5010 /* ??? It might be better to emit a reduction tree code here, so that
5011 tree-vect-generic can expand the first round via bit tricks. */
5012 if (!VECTOR_MODE_P (mode))
5013 reduce_with_shift = false;
5014 else
5016 optab optab = optab_for_tree_code (code, vectype, optab_default);
5017 if (optab_handler (optab, mode) == CODE_FOR_nothing)
5018 reduce_with_shift = false;
5021 if (reduce_with_shift && !slp_reduc)
5023 int nelements = vec_size_in_bits / element_bitsize;
5024 auto_vec_perm_indices sel (nelements);
5026 int elt_offset;
5028 tree zero_vec = build_zero_cst (vectype);
5029 /* Case 2: Create:
5030 for (offset = nelements/2; offset >= 1; offset/=2)
5032 Create: va' = vec_shift <va, offset>
5033 Create: va = vop <va, va'>
5034 } */
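/* As an illustration, for a four-element vector of partial results
   {a, b, c, d}:  shifting by two elements (zero-filled from ZERO_VEC) gives
   {c, d, 0, 0} and the vop yields {a+c, b+d, c, d}; shifting that by one
   element gives {b+d, c, d, 0} and the vop yields {a+b+c+d, ...}, so the
   full reduction ends up in lane 0, which step 2.4 below extracts.  */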
5036 tree rhs;
5038 if (dump_enabled_p ())
5039 dump_printf_loc (MSG_NOTE, vect_location,
5040 "Reduce using vector shifts\n");
5042 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5043 new_temp = new_phi_result;
5044 for (elt_offset = nelements / 2;
5045 elt_offset >= 1;
5046 elt_offset /= 2)
5048 sel.truncate (0);
5049 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5050 tree mask = vect_gen_perm_mask_any (vectype, sel);
5051 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5052 new_temp, zero_vec, mask);
5053 new_name = make_ssa_name (vec_dest, epilog_stmt);
5054 gimple_assign_set_lhs (epilog_stmt, new_name);
5055 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5057 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5058 new_temp);
5059 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5060 gimple_assign_set_lhs (epilog_stmt, new_temp);
5061 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5064 /* 2.4 Extract the final scalar result. Create:
5065 s_out3 = extract_field <v_out2, bitpos> */
5067 if (dump_enabled_p ())
5068 dump_printf_loc (MSG_NOTE, vect_location,
5069 "extract scalar result\n");
5071 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5072 bitsize, bitsize_zero_node);
5073 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5074 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5075 gimple_assign_set_lhs (epilog_stmt, new_temp);
5076 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5077 scalar_results.safe_push (new_temp);
5079 else
5081 /* Case 3: Create:
5082 s = extract_field <v_out2, 0>
5083 for (offset = element_size;
5084 offset < vector_size;
5085 offset += element_size;)
5087 Create: s' = extract_field <v_out2, offset>
5088 Create: s = op <s, s'> // For non SLP cases
5089 } */
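/* For the same {a, b, c, d} example this emits four BIT_FIELD_REF extracts
   and, in the non-SLP case, three scalar ops:
   s = a; s = s op b; s = s op c; s = s op d.  In the SLP case the four
   extracted values are simply collected in SCALAR_RESULTS.  */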
5091 if (dump_enabled_p ())
5092 dump_printf_loc (MSG_NOTE, vect_location,
5093 "Reduce using scalar code.\n");
5095 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5096 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5098 int bit_offset;
5099 if (gimple_code (new_phi) == GIMPLE_PHI)
5100 vec_temp = PHI_RESULT (new_phi);
5101 else
5102 vec_temp = gimple_assign_lhs (new_phi);
5103 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5104 bitsize_zero_node);
5105 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5106 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5107 gimple_assign_set_lhs (epilog_stmt, new_temp);
5108 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5110 /* In SLP we don't need to apply the reduction operation, so we just
5111 collect the s' values in SCALAR_RESULTS. */
5112 if (slp_reduc)
5113 scalar_results.safe_push (new_temp);
5115 for (bit_offset = element_bitsize;
5116 bit_offset < vec_size_in_bits;
5117 bit_offset += element_bitsize)
5119 tree bitpos = bitsize_int (bit_offset);
5120 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5121 bitsize, bitpos);
5123 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5124 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5125 gimple_assign_set_lhs (epilog_stmt, new_name);
5126 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5128 if (slp_reduc)
5130 /* In SLP we don't need to apply the reduction operation, so
5131 we just collect the s' values in SCALAR_RESULTS. */
5132 new_temp = new_name;
5133 scalar_results.safe_push (new_name);
5135 else
5137 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5138 new_name, new_temp);
5139 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5140 gimple_assign_set_lhs (epilog_stmt, new_temp);
5141 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5146 /* The only case where we need to reduce scalar results in SLP is
5147 unrolling. If the size of SCALAR_RESULTS is greater than
5148 GROUP_SIZE, we reduce them combining elements modulo
5149 GROUP_SIZE. */
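/* For example, with GROUP_SIZE == 2 and four scalar results
   {r0, r1, r2, r3} (two unrolled copies per reduction), the loop below
   combines them as r0 = r0 op r2 and r1 = r1 op r3, leaving one result per
   original reduction in the first GROUP_SIZE slots.  */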
5150 if (slp_reduc)
5152 tree res, first_res, new_res;
5153 gimple *new_stmt;
5155 /* Reduce multiple scalar results in case of SLP unrolling. */
5156 for (j = group_size; scalar_results.iterate (j, &res);
5157 j++)
5159 first_res = scalar_results[j % group_size];
5160 new_stmt = gimple_build_assign (new_scalar_dest, code,
5161 first_res, res);
5162 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5163 gimple_assign_set_lhs (new_stmt, new_res);
5164 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5165 scalar_results[j % group_size] = new_res;
5168 else
5169 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5170 scalar_results.safe_push (new_temp);
5173 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5174 == INTEGER_INDUC_COND_REDUCTION)
5176 /* Earlier we set the initial value to be zero. Check the result
5177 and if it is zero then replace with the original initial
5178 value. */
5179 tree zero = build_zero_cst (scalar_type);
5180 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
5182 tree tmp = make_ssa_name (new_scalar_dest);
5183 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5184 initial_def, new_temp);
5185 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5186 scalar_results[0] = tmp;
5190 vect_finalize_reduction:
5192 if (double_reduc)
5193 loop = loop->inner;
5195 /* 2.5 Adjust the final result by the initial value of the reduction
5196 variable. (When such adjustment is not needed, then
5197 'adjustment_def' is zero). For example, if code is PLUS we create:
5198 new_temp = loop_exit_def + adjustment_def */
5200 if (adjustment_def)
5202 gcc_assert (!slp_reduc);
5203 if (nested_in_vect_loop)
5205 new_phi = new_phis[0];
5206 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5207 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5208 new_dest = vect_create_destination_var (scalar_dest, vectype);
5210 else
5212 new_temp = scalar_results[0];
5213 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5214 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5215 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5218 epilog_stmt = gimple_build_assign (new_dest, expr);
5219 new_temp = make_ssa_name (new_dest, epilog_stmt);
5220 gimple_assign_set_lhs (epilog_stmt, new_temp);
5221 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5222 if (nested_in_vect_loop)
5224 set_vinfo_for_stmt (epilog_stmt,
5225 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5226 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5227 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5229 if (!double_reduc)
5230 scalar_results.quick_push (new_temp);
5231 else
5232 scalar_results[0] = new_temp;
5234 else
5235 scalar_results[0] = new_temp;
5237 new_phis[0] = epilog_stmt;
5240 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5241 phis with new adjusted scalar results, i.e., replace use <s_out0>
5242 with use <s_out4>.
5244 Transform:
5245 loop_exit:
5246 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5247 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5248 v_out2 = reduce <v_out1>
5249 s_out3 = extract_field <v_out2, 0>
5250 s_out4 = adjust_result <s_out3>
5251 use <s_out0>
5252 use <s_out0>
5254 into:
5256 loop_exit:
5257 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5258 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5259 v_out2 = reduce <v_out1>
5260 s_out3 = extract_field <v_out2, 0>
5261 s_out4 = adjust_result <s_out3>
5262 use <s_out4>
5263 use <s_out4> */
5266 /* In an SLP reduction chain we reduce the vector results into one vector if
5267 necessary, hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
5268 the last stmt in the reduction chain, since we are looking for the loop
5269 exit phi node. */
5270 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5272 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5273 /* Handle reduction patterns. */
5274 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5275 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5277 scalar_dest = gimple_assign_lhs (dest_stmt);
5278 group_size = 1;
5281 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5282 case that GROUP_SIZE is greater than the vectorization factor). Therefore, we
5283 need to match SCALAR_RESULTS with corresponding statements. The first
5284 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5285 the first vector stmt, etc.
5286 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
5287 if (group_size > new_phis.length ())
5289 ratio = group_size / new_phis.length ();
5290 gcc_assert (!(group_size % new_phis.length ()));
5292 else
5293 ratio = 1;
5295 for (k = 0; k < group_size; k++)
5297 if (k % ratio == 0)
5299 epilog_stmt = new_phis[k / ratio];
5300 reduction_phi = reduction_phis[k / ratio];
5301 if (double_reduc)
5302 inner_phi = inner_phis[k / ratio];
5305 if (slp_reduc)
5307 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5309 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5310 /* SLP statements can't participate in patterns. */
5311 gcc_assert (!orig_stmt);
5312 scalar_dest = gimple_assign_lhs (current_stmt);
5315 phis.create (3);
5316 /* Find the loop-closed-use at the loop exit of the original scalar
5317 result. (The reduction result is expected to have two immediate uses -
5318 one at the latch block, and one at the loop exit). */
5319 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5320 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5321 && !is_gimple_debug (USE_STMT (use_p)))
5322 phis.safe_push (USE_STMT (use_p));
5324 /* While we expect to have found an exit_phi because of loop-closed-ssa
5325 form we can end up without one if the scalar cycle is dead. */
5327 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5329 if (outer_loop)
5331 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5332 gphi *vect_phi;
5334 /* FORNOW. Currently not supporting the case that an inner-loop
5335 reduction is not used in the outer-loop (but only outside the
5336 outer-loop), unless it is a double reduction. */
5337 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5338 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5339 || double_reduc);
5341 if (double_reduc)
5342 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5343 else
5344 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5345 if (!double_reduc
5346 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5347 != vect_double_reduction_def)
5348 continue;
5350 /* Handle double reduction:
5352 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5353 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5354 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5355 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5357 At that point the regular reduction (stmt2 and stmt3) is
5358 already vectorized, as well as the exit phi node, stmt4.
5359 Here we vectorize the phi node of double reduction, stmt1, and
5360 update all relevant statements. */
5362 /* Go through all the uses of s2 to find double reduction phi
5363 node, i.e., stmt1 above. */
5364 orig_name = PHI_RESULT (exit_phi);
5365 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5367 stmt_vec_info use_stmt_vinfo;
5368 stmt_vec_info new_phi_vinfo;
5369 tree vect_phi_init, preheader_arg, vect_phi_res;
5370 basic_block bb = gimple_bb (use_stmt);
5371 gimple *use;
5373 /* Check that USE_STMT is really a double reduction phi
5374 node. */
5375 if (gimple_code (use_stmt) != GIMPLE_PHI
5376 || gimple_phi_num_args (use_stmt) != 2
5377 || bb->loop_father != outer_loop)
5378 continue;
5379 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5380 if (!use_stmt_vinfo
5381 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5382 != vect_double_reduction_def)
5383 continue;
5385 /* Create vector phi node for double reduction:
5386 vs1 = phi <vs0, vs2>
5387 vs1 was created previously in this function by a call to
5388 vect_get_vec_def_for_operand and is stored in
5389 vec_initial_def;
5390 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5391 vs0 is created here. */
5393 /* Create vector phi node. */
5394 vect_phi = create_phi_node (vec_initial_def, bb);
5395 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5396 loop_vec_info_for_loop (outer_loop));
5397 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5399 /* Create vs0 - initial def of the double reduction phi. */
5400 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5401 loop_preheader_edge (outer_loop));
5402 vect_phi_init = get_initial_def_for_reduction
5403 (stmt, preheader_arg, NULL);
5405 /* Update phi node arguments with vs0 and vs2. */
5406 add_phi_arg (vect_phi, vect_phi_init,
5407 loop_preheader_edge (outer_loop),
5408 UNKNOWN_LOCATION);
5409 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5410 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5411 if (dump_enabled_p ())
5413 dump_printf_loc (MSG_NOTE, vect_location,
5414 "created double reduction phi node: ");
5415 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5418 vect_phi_res = PHI_RESULT (vect_phi);
5420 /* Replace the use, i.e., set the correct vs1 in the regular
5421 reduction phi node. FORNOW, NCOPIES is always 1, so the
5422 loop is redundant. */
5423 use = reduction_phi;
5424 for (j = 0; j < ncopies; j++)
5426 edge pr_edge = loop_preheader_edge (loop);
5427 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5428 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5434 phis.release ();
5435 if (nested_in_vect_loop)
5437 if (double_reduc)
5438 loop = outer_loop;
5439 else
5440 continue;
5443 phis.create (3);
5444 /* Find the loop-closed-use at the loop exit of the original scalar
5445 result. (The reduction result is expected to have two immediate uses,
5446 one at the latch block, and one at the loop exit). For double
5447 reductions we are looking for exit phis of the outer loop. */
5448 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5450 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5452 if (!is_gimple_debug (USE_STMT (use_p)))
5453 phis.safe_push (USE_STMT (use_p));
5455 else
5457 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5459 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5461 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5463 if (!flow_bb_inside_loop_p (loop,
5464 gimple_bb (USE_STMT (phi_use_p)))
5465 && !is_gimple_debug (USE_STMT (phi_use_p)))
5466 phis.safe_push (USE_STMT (phi_use_p));
5472 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5474 /* Replace the uses: */
5475 orig_name = PHI_RESULT (exit_phi);
5476 scalar_result = scalar_results[k];
5477 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5478 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5479 SET_USE (use_p, scalar_result);
5482 phis.release ();
5487 /* Function is_nonwrapping_integer_induction.
5489 Check if STMT (which is part of loop LOOP) is an induction that both
5490 increments and does not cause overflow. */
5492 static bool
5493 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5495 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5496 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5497 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5498 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5499 widest_int ni, max_loop_value, lhs_max;
5500 bool overflow = false;
5502 /* Make sure the loop is integer based. */
5503 if (TREE_CODE (base) != INTEGER_CST
5504 || TREE_CODE (step) != INTEGER_CST)
5505 return false;
5507 /* Check that the induction increments. */
5508 if (tree_int_cst_sgn (step) == -1)
5509 return false;
5511 /* Check that the max size of the loop will not wrap. */
5513 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5514 return true;
5516 if (! max_stmt_executions (loop, &ni))
5517 return false;
5519 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5520 &overflow);
5521 if (overflow)
5522 return false;
5524 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5525 TYPE_SIGN (lhs_type), &overflow);
5526 if (overflow)
5527 return false;
5529 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5530 <= TYPE_PRECISION (lhs_type));
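/* A worked example of the check above, assuming a 32-bit unsigned IV with
   BASE == 10, STEP == 4 and at most 100 executions of the statement:
   max_loop_value == 10 + 4 * 100 == 410, which needs only 9 bits and is
   therefore well within the 32-bit precision, so the induction is known
   not to wrap.  */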
5533 /* Function vectorizable_reduction.
5535 Check if STMT performs a reduction operation that can be vectorized.
5536 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5537 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5538 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5540 This function also handles reduction idioms (patterns) that have been
5541 recognized in advance during vect_pattern_recog. In this case, STMT may be
5542 of this form:
5543 X = pattern_expr (arg0, arg1, ..., X)
5544 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5545 sequence that had been detected and replaced by the pattern-stmt (STMT).
5547 This function also handles reduction of condition expressions, for example:
5548 for (int i = 0; i < N; i++)
5549 if (a[i] < value)
5550 last = a[i];
5551 This is handled by vectorising the loop and creating an additional vector
5552 containing the loop indexes for which "a[i] < value" was true. In the
5553 function epilogue this is reduced to a single max value and then used to
5554 index into the vector of results.
5556 In some cases of reduction patterns, the type of the reduction variable X is
5557 different than the type of the other arguments of STMT.
5558 In such cases, the vectype that is used when transforming STMT into a vector
5559 stmt is different than the vectype that is used to determine the
5560 vectorization factor, because it consists of a different number of elements
5561 than the actual number of elements that are being operated upon in parallel.
5563 For example, consider an accumulation of shorts into an int accumulator.
5564 On some targets it's possible to vectorize this pattern operating on 8
5565 shorts at a time (hence, the vectype for purposes of determining the
5566 vectorization factor should be V8HI); on the other hand, the vectype that
5567 is used to create the vector form is actually V4SI (the type of the result).
5569 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5570 indicates what is the actual level of parallelism (V8HI in the example), so
5571 that the right vectorization factor would be derived. This vectype
5572 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5573 be used to create the vectorized stmt. The right vectype for the vectorized
5574 stmt is obtained from the type of the result X:
5575 get_vectype_for_scalar_type (TREE_TYPE (X))
5577 This means that, contrary to "regular" reductions (or "regular" stmts in
5578 general), the following equation:
5579 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5580 does *NOT* necessarily hold for reduction patterns. */
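/* A small source-level sketch of the pattern case described above,
   assuming a target with V8HI and V4SI vectors:

     short b[N];
     int sum = 0;
     for (i = 0; i < N; i++)
       sum += b[i];              // recognized as a widening-sum pattern

   The vectorization factor is derived from V8HI (the type of b[i]),
   while the vectorized reduction stmt itself produces V4SI values
   (the type of sum).  */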
5582 bool
5583 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5584 gimple **vec_stmt, slp_tree slp_node,
5585 slp_instance slp_node_instance)
5587 tree vec_dest;
5588 tree scalar_dest;
5589 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5590 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5591 tree vectype_in = NULL_TREE;
5592 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5593 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5594 enum tree_code code, orig_code;
5595 internal_fn reduc_fn;
5596 machine_mode vec_mode;
5597 int op_type;
5598 optab optab;
5599 tree new_temp = NULL_TREE;
5600 gimple *def_stmt;
5601 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5602 tree scalar_type;
5603 bool is_simple_use;
5604 gimple *orig_stmt;
5605 stmt_vec_info orig_stmt_info = NULL;
5606 int i;
5607 int ncopies;
5608 int epilog_copies;
5609 stmt_vec_info prev_stmt_info, prev_phi_info;
5610 bool single_defuse_cycle = false;
5611 gimple *new_stmt = NULL;
5612 int j;
5613 tree ops[3];
5614 enum vect_def_type dts[3];
5615 bool nested_cycle = false, found_nested_cycle_def = false;
5616 bool double_reduc = false;
5617 basic_block def_bb;
5618 struct loop * def_stmt_loop, *outer_loop = NULL;
5619 tree def_arg;
5620 gimple *def_arg_stmt;
5621 auto_vec<tree> vec_oprnds0;
5622 auto_vec<tree> vec_oprnds1;
5623 auto_vec<tree> vec_oprnds2;
5624 auto_vec<tree> vect_defs;
5625 auto_vec<gimple *> phis;
5626 int vec_num;
5627 tree def0, tem;
5628 bool first_p = true;
5629 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5630 tree cond_reduc_val = NULL_TREE;
5632 /* Make sure it was already recognized as a reduction computation. */
5633 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5634 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5635 return false;
5637 if (nested_in_vect_loop_p (loop, stmt))
5639 outer_loop = loop;
5640 loop = loop->inner;
5641 nested_cycle = true;
5644 /* In case of reduction chain we switch to the first stmt in the chain, but
5645 we don't update STMT_INFO, since only the last stmt is marked as reduction
5646 and has reduction properties. */
5647 if (GROUP_FIRST_ELEMENT (stmt_info)
5648 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5650 stmt = GROUP_FIRST_ELEMENT (stmt_info);
5651 first_p = false;
5654 if (gimple_code (stmt) == GIMPLE_PHI)
5656 /* Analysis is fully done on the reduction stmt invocation. */
5657 if (! vec_stmt)
5659 if (slp_node)
5660 slp_node_instance->reduc_phis = slp_node;
5662 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5663 return true;
5666 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5667 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5668 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5670 gcc_assert (is_gimple_assign (reduc_stmt));
5671 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5673 tree op = gimple_op (reduc_stmt, k);
5674 if (op == gimple_phi_result (stmt))
5675 continue;
5676 if (k == 1
5677 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5678 continue;
5679 tem = get_vectype_for_scalar_type (TREE_TYPE (op));
5680 if (! vectype_in
5681 || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
5682 vectype_in = tem;
5683 break;
5685 gcc_assert (vectype_in);
5687 if (slp_node)
5688 ncopies = 1;
5689 else
5690 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5692 use_operand_p use_p;
5693 gimple *use_stmt;
5694 if (ncopies > 1
5695 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5696 <= vect_used_only_live)
5697 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5698 && (use_stmt == reduc_stmt
5699 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5700 == reduc_stmt)))
5701 single_defuse_cycle = true;
5703 /* Create the destination vector */
5704 scalar_dest = gimple_assign_lhs (reduc_stmt);
5705 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5707 if (slp_node)
5708 /* The size vect_schedule_slp_instance computes is off for us. */
5709 vec_num = ((LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5710 * SLP_TREE_SCALAR_STMTS (slp_node).length ())
5711 / TYPE_VECTOR_SUBPARTS (vectype_in));
5712 else
5713 vec_num = 1;
5715 /* Generate the reduction PHIs upfront. */
5716 prev_phi_info = NULL;
5717 for (j = 0; j < ncopies; j++)
5719 if (j == 0 || !single_defuse_cycle)
5721 for (i = 0; i < vec_num; i++)
5723 /* Create the reduction-phi that defines the reduction
5724 operand. */
5725 gimple *new_phi = create_phi_node (vec_dest, loop->header);
5726 set_vinfo_for_stmt (new_phi,
5727 new_stmt_vec_info (new_phi, loop_vinfo));
5729 if (slp_node)
5730 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5731 else
5733 if (j == 0)
5734 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5735 else
5736 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5737 prev_phi_info = vinfo_for_stmt (new_phi);
5743 return true;
5746 /* 1. Is vectorizable reduction? */
5747 /* Not supportable if the reduction variable is used in the loop, unless
5748 it's a reduction chain. */
5749 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5750 && !GROUP_FIRST_ELEMENT (stmt_info))
5751 return false;
5753 /* Reductions that are not used even in an enclosing outer-loop
5754 are expected to be "live" (used out of the loop). */
5755 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5756 && !STMT_VINFO_LIVE_P (stmt_info))
5757 return false;
5759 /* 2. Has this been recognized as a reduction pattern?
5761 Check if STMT represents a pattern that has been recognized
5762 in earlier analysis stages. For stmts that represent a pattern,
5763 the STMT_VINFO_RELATED_STMT field records the last stmt in
5764 the original sequence that constitutes the pattern. */
5766 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5767 if (orig_stmt)
5769 orig_stmt_info = vinfo_for_stmt (orig_stmt);
5770 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5771 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5774 /* 3. Check the operands of the operation. The first operands are defined
5775 inside the loop body. The last operand is the reduction variable,
5776 which is defined by the loop-header-phi. */
5778 gcc_assert (is_gimple_assign (stmt));
5780 /* Flatten RHS. */
5781 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5783 case GIMPLE_BINARY_RHS:
5784 code = gimple_assign_rhs_code (stmt);
5785 op_type = TREE_CODE_LENGTH (code);
5786 gcc_assert (op_type == binary_op);
5787 ops[0] = gimple_assign_rhs1 (stmt);
5788 ops[1] = gimple_assign_rhs2 (stmt);
5789 break;
5791 case GIMPLE_TERNARY_RHS:
5792 code = gimple_assign_rhs_code (stmt);
5793 op_type = TREE_CODE_LENGTH (code);
5794 gcc_assert (op_type == ternary_op);
5795 ops[0] = gimple_assign_rhs1 (stmt);
5796 ops[1] = gimple_assign_rhs2 (stmt);
5797 ops[2] = gimple_assign_rhs3 (stmt);
5798 break;
5800 case GIMPLE_UNARY_RHS:
5801 return false;
5803 default:
5804 gcc_unreachable ();
5807 if (code == COND_EXPR && slp_node)
5808 return false;
5810 scalar_dest = gimple_assign_lhs (stmt);
5811 scalar_type = TREE_TYPE (scalar_dest);
5812 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5813 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5814 return false;
5816 /* Do not try to vectorize bit-precision reductions. */
5817 if (!type_has_mode_precision_p (scalar_type))
5818 return false;
5820 /* All uses but the last are expected to be defined in the loop.
5821 The last use is the reduction variable. In case of nested cycle this
5822 assumption is not true: we use reduc_index to record the index of the
5823 reduction variable. */
5824 gimple *reduc_def_stmt = NULL;
5825 int reduc_index = -1;
5826 for (i = 0; i < op_type; i++)
5828 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5829 if (i == 0 && code == COND_EXPR)
5830 continue;
5832 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5833 &def_stmt, &dts[i], &tem);
5834 dt = dts[i];
5835 gcc_assert (is_simple_use);
5836 if (dt == vect_reduction_def)
5838 reduc_def_stmt = def_stmt;
5839 reduc_index = i;
5840 continue;
5842 else if (tem)
5844 /* To properly compute ncopies we are interested in the widest
5845 input type in case we're looking at a widening accumulation. */
5846 if (!vectype_in
5847 || TYPE_VECTOR_SUBPARTS (vectype_in) > TYPE_VECTOR_SUBPARTS (tem))
5848 vectype_in = tem;
5851 if (dt != vect_internal_def
5852 && dt != vect_external_def
5853 && dt != vect_constant_def
5854 && dt != vect_induction_def
5855 && !(dt == vect_nested_cycle && nested_cycle))
5856 return false;
5858 if (dt == vect_nested_cycle)
5860 found_nested_cycle_def = true;
5861 reduc_def_stmt = def_stmt;
5862 reduc_index = i;
5865 if (i == 1 && code == COND_EXPR)
5867 /* Record how value of COND_EXPR is defined. */
5868 if (dt == vect_constant_def)
5870 cond_reduc_dt = dt;
5871 cond_reduc_val = ops[i];
5873 if (dt == vect_induction_def && def_stmt != NULL
5874 && is_nonwrapping_integer_induction (def_stmt, loop))
5875 cond_reduc_dt = dt;
5879 if (!vectype_in)
5880 vectype_in = vectype_out;
5882 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5883 directly used in stmt. */
5884 if (reduc_index == -1)
5886 if (orig_stmt)
5887 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5888 else
5889 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5892 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5893 return false;
5895 if (!(reduc_index == -1
5896 || dts[reduc_index] == vect_reduction_def
5897 || dts[reduc_index] == vect_nested_cycle
5898 || ((dts[reduc_index] == vect_internal_def
5899 || dts[reduc_index] == vect_external_def
5900 || dts[reduc_index] == vect_constant_def
5901 || dts[reduc_index] == vect_induction_def)
5902 && nested_cycle && found_nested_cycle_def)))
5904 /* For pattern recognized stmts, orig_stmt might be a reduction,
5905 but some helper statements for the pattern might not, or
5906 might be COND_EXPRs with reduction uses in the condition. */
5907 gcc_assert (orig_stmt);
5908 return false;
5911 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5912 enum vect_reduction_type v_reduc_type
5913 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5914 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5916 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5917 /* If we have a condition reduction, see if we can simplify it further. */
5918 if (v_reduc_type == COND_REDUCTION)
5920 if (cond_reduc_dt == vect_induction_def)
5922 if (dump_enabled_p ())
5923 dump_printf_loc (MSG_NOTE, vect_location,
5924 "condition expression based on "
5925 "integer induction.\n");
5926 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5927 = INTEGER_INDUC_COND_REDUCTION;
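/* A sketch of the kind of loop this covers (assuming i is a nonwrapping
   integer induction):

     int last = -1;
     for (int i = 0; i < N; i++)
       if (a[i] < bound)
         last = i;

   Because the stored value is the induction variable itself, the epilog
   can, roughly speaking, reduce the collected lane values with MAX_EXPR.  */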
5930 /* Loop peeling modifies the initial value of the reduction PHI, which
5931 makes the reduction stmt that is going to be transformed differ from
5932 the original stmt that was analyzed. We need to record the reduction
5933 code for a CONST_COND_REDUCTION type reduction at analysis time, so
5934 that it can be used directly at transform time. */
5935 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
5936 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
5938 /* Also set the reduction type to CONST_COND_REDUCTION. */
5939 gcc_assert (cond_reduc_dt == vect_constant_def);
5940 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
5942 else if (cond_reduc_dt == vect_constant_def)
5944 enum vect_def_type cond_initial_dt;
5945 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5946 tree cond_initial_val
5947 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
5949 gcc_assert (cond_reduc_val != NULL_TREE);
5950 vect_is_simple_use (cond_initial_val, loop_vinfo,
5951 &def_stmt, &cond_initial_dt);
5952 if (cond_initial_dt == vect_constant_def
5953 && types_compatible_p (TREE_TYPE (cond_initial_val),
5954 TREE_TYPE (cond_reduc_val)))
5956 tree e = fold_binary (LE_EXPR, boolean_type_node,
5957 cond_initial_val, cond_reduc_val);
5958 if (e && (integer_onep (e) || integer_zerop (e)))
5960 if (dump_enabled_p ())
5961 dump_printf_loc (MSG_NOTE, vect_location,
5962 "condition expression based on "
5963 "compile time constant.\n");
5964 /* Record reduction code at analysis stage. */
5965 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
5966 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
5967 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5968 = CONST_COND_REDUCTION;
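/* For instance (a sketch), the following qualifies as a
   CONST_COND_REDUCTION:

     int last = -1;
     for (i = 0; i < N; i++)
       if (a[i] < bound)
         last = 7;

   Both the initial value (-1) and the reduced value (7) are compile-time
   constants and -1 <= 7, so a MAX_EXPR over the lanes gives the right
   answer in the epilog.  */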
5974 if (orig_stmt)
5975 gcc_assert (tmp == orig_stmt
5976 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5977 else
5978 /* We changed STMT to be the first stmt in reduction chain, hence we
5979 check that in this case the first element in the chain is STMT. */
5980 gcc_assert (stmt == tmp
5981 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5983 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5984 return false;
5986 if (slp_node)
5987 ncopies = 1;
5988 else
5989 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5991 gcc_assert (ncopies >= 1);
5993 vec_mode = TYPE_MODE (vectype_in);
5995 if (code == COND_EXPR)
5997 /* Only call during the analysis stage, otherwise we'll lose
5998 STMT_VINFO_TYPE. */
5999 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6000 ops[reduc_index], 0, NULL))
6002 if (dump_enabled_p ())
6003 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6004 "unsupported condition in reduction\n");
6005 return false;
6008 else
6010 /* 4. Supportable by target? */
6012 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6013 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6015 /* Shifts and rotates are only supported by vectorizable_shift,
6016 not vectorizable_reduction. */
6017 if (dump_enabled_p ())
6018 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6019 "unsupported shift or rotation.\n");
6020 return false;
6023 /* 4.1. check support for the operation in the loop */
6024 optab = optab_for_tree_code (code, vectype_in, optab_default);
6025 if (!optab)
6027 if (dump_enabled_p ())
6028 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6029 "no optab.\n");
6031 return false;
6034 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6036 if (dump_enabled_p ())
6037 dump_printf (MSG_NOTE, "op not supported by target.\n");
6039 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6040 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6041 return false;
6043 if (dump_enabled_p ())
6044 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6047 /* Worthwhile without SIMD support? */
6048 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6049 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6051 if (dump_enabled_p ())
6052 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6053 "not worthwhile without SIMD support.\n");
6055 return false;
6059 /* 4.2. Check support for the epilog operation.
6061 If STMT represents a reduction pattern, then the type of the
6062 reduction variable may be different than the type of the rest
6063 of the arguments. For example, consider the case of accumulation
6064 of shorts into an int accumulator. The original code:
6065 S1: int_a = (int) short_a;
6066 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6068 was replaced with:
6069 STMT: int_acc = widen_sum <short_a, int_acc>
6071 This means that:
6072 1. The tree-code that is used to create the vector operation in the
6073 epilog code (that reduces the partial results) is not the
6074 tree-code of STMT, but is rather the tree-code of the original
6075 stmt from the pattern that STMT is replacing. I.e., in the example
6076 above we want to use 'widen_sum' in the loop, but 'plus' in the
6077 epilog.
6078 2. The type (mode) we use to check available target support
6079 for the vector operation to be created in the *epilog*, is
6080 determined by the type of the reduction variable (in the example
6081 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6082 However the type (mode) we use to check available target support
6083 for the vector operation to be created *inside the loop*, is
6084 determined by the type of the other arguments to STMT (in the
6085 example we'd check this: optab_handler (widen_sum_optab,
6086 vect_short_mode)).
6088 This is contrary to "regular" reductions, in which the types of all
6089 the arguments are the same as the type of the reduction variable.
6090 For "regular" reductions we can therefore use the same vector type
6091 (and also the same tree-code) when generating the epilog code and
6092 when generating the code inside the loop. */
6094 if (orig_stmt)
6096 /* This is a reduction pattern: get the vectype from the type of the
6097 reduction variable, and get the tree-code from orig_stmt. */
6098 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6099 == TREE_CODE_REDUCTION);
6100 orig_code = gimple_assign_rhs_code (orig_stmt);
6101 gcc_assert (vectype_out);
6102 vec_mode = TYPE_MODE (vectype_out);
6104 else
6106 /* Regular reduction: the same vectype and tree-code as used for
6107 the vector code inside the loop can be used for the epilog code. */
6108 orig_code = code;
6110 if (code == MINUS_EXPR)
6111 orig_code = PLUS_EXPR;
6113 /* For simple condition reductions, replace with the actual expression
6114 we want to base our reduction around. */
6115 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6117 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6118 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6120 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6121 == INTEGER_INDUC_COND_REDUCTION)
6122 orig_code = MAX_EXPR;
6125 if (nested_cycle)
6127 def_bb = gimple_bb (reduc_def_stmt);
6128 def_stmt_loop = def_bb->loop_father;
6129 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6130 loop_preheader_edge (def_stmt_loop));
6131 if (TREE_CODE (def_arg) == SSA_NAME
6132 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6133 && gimple_code (def_arg_stmt) == GIMPLE_PHI
6134 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6135 && vinfo_for_stmt (def_arg_stmt)
6136 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6137 == vect_double_reduction_def)
6138 double_reduc = true;
6141 reduc_fn = IFN_LAST;
6143 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6145 if (reduction_fn_for_scalar_code (orig_code, &reduc_fn))
6147 if (reduc_fn != IFN_LAST
6148 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
6149 OPTIMIZE_FOR_SPEED))
6151 if (dump_enabled_p ())
6152 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6153 "reduc op not supported by target.\n");
6155 reduc_fn = IFN_LAST;
6158 else
6160 if (!nested_cycle || double_reduc)
6162 if (dump_enabled_p ())
6163 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6164 "no reduc code for scalar code.\n");
6166 return false;
6170 else
6172 int scalar_precision
6173 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6174 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6175 cr_index_vector_type = build_vector_type
6176 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
6178 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
6179 OPTIMIZE_FOR_SPEED))
6180 reduc_fn = IFN_REDUC_MAX;
6183 if ((double_reduc
6184 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6185 && ncopies > 1)
6187 if (dump_enabled_p ())
6188 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6189 "multiple types in double reduction or condition "
6190 "reduction.\n");
6191 return false;
6194 /* In case of widening multiplication by a constant, we update the type
6195 of the constant to be the type of the other operand. We check that the
6196 constant fits the type in the pattern recognition pass. */
6197 if (code == DOT_PROD_EXPR
6198 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6200 if (TREE_CODE (ops[0]) == INTEGER_CST)
6201 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6202 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6203 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6204 else
6206 if (dump_enabled_p ())
6207 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6208 "invalid types in dot-prod\n");
6210 return false;
6214 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6216 widest_int ni;
6218 if (! max_loop_iterations (loop, &ni))
6220 if (dump_enabled_p ())
6221 dump_printf_loc (MSG_NOTE, vect_location,
6222 "loop count not known, cannot create cond "
6223 "reduction.\n");
6224 return false;
6226 /* Convert backedges to iterations. */
6227 ni += 1;
6229 /* The additional index will be the same type as the condition. Check
6230 that the loop iteration count can fit into this type less one (because
6231 we'll use up the zero slot for when there are no matches). */
6232 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6233 if (wi::geu_p (ni, wi::to_widest (max_index)))
6235 if (dump_enabled_p ())
6236 dump_printf_loc (MSG_NOTE, vect_location,
6237 "loop size is greater than data size.\n");
6238 return false;
6242 /* In case the vectorization factor (VF) is bigger than the number
6243 of elements that we can fit in a vectype (nunits), we have to generate
6244 more than one vector stmt, i.e. we need to "unroll" the
6245 vector stmt by a factor VF/nunits. For more details see documentation
6246 in vectorizable_operation. */
6248 /* If the reduction is used in an outer loop we need to generate
6249 VF intermediate results, like so (e.g. for ncopies=2):
6250 r0 = phi (init, r0)
6251 r1 = phi (init, r1)
6252 r0 = x0 + r0;
6253 r1 = x1 + r1;
6254 (i.e. we generate VF results in 2 registers).
6255 In this case we have a separate def-use cycle for each copy, and therefore
6256 for each copy we get the vector def for the reduction variable from the
6257 respective phi node created for this copy.
6259 Otherwise (the reduction is unused in the loop nest), we can combine
6260 together intermediate results, like so (e.g. for ncopies=2):
6261 r = phi (init, r)
6262 r = x0 + r;
6263 r = x1 + r;
6264 (i.e. we generate VF/2 results in a single register).
6265 In this case for each copy we get the vector def for the reduction variable
6266 from the vectorized reduction operation generated in the previous iteration.
6268 This only works when we see both the reduction PHI and its only consumer
6269 in vectorizable_reduction and there are no intermediate stmts
6270 participating. */
6271 use_operand_p use_p;
6272 gimple *use_stmt;
6273 if (ncopies > 1
6274 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6275 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6276 && (use_stmt == stmt
6277 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6279 single_defuse_cycle = true;
6280 epilog_copies = 1;
6282 else
6283 epilog_copies = ncopies;
6285 /* If the reduction stmt is one of the patterns that have lane
6286 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6287 if ((ncopies > 1
6288 && ! single_defuse_cycle)
6289 && (code == DOT_PROD_EXPR
6290 || code == WIDEN_SUM_EXPR
6291 || code == SAD_EXPR))
6293 if (dump_enabled_p ())
6294 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6295 "multi def-use cycle not possible for lane-reducing "
6296 "reduction operation\n");
6297 return false;
6300 if (!vec_stmt) /* transformation not required. */
6302 if (first_p)
6303 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
6304 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6305 return true;
6308 /* Transform. */
6310 if (dump_enabled_p ())
6311 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6313 /* FORNOW: Multiple types are not supported for condition. */
6314 if (code == COND_EXPR)
6315 gcc_assert (ncopies == 1);
6317 /* Create the destination vector */
6318 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6320 prev_stmt_info = NULL;
6321 prev_phi_info = NULL;
6322 if (slp_node)
6323 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6324 else
6326 vec_num = 1;
6327 vec_oprnds0.create (1);
6328 vec_oprnds1.create (1);
6329 if (op_type == ternary_op)
6330 vec_oprnds2.create (1);
6333 phis.create (vec_num);
6334 vect_defs.create (vec_num);
6335 if (!slp_node)
6336 vect_defs.quick_push (NULL_TREE);
6338 if (slp_node)
6339 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6340 else
6341 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6343 for (j = 0; j < ncopies; j++)
6345 if (code == COND_EXPR)
6347 gcc_assert (!slp_node);
6348 vectorizable_condition (stmt, gsi, vec_stmt,
6349 PHI_RESULT (phis[0]),
6350 reduc_index, NULL);
6351 /* Multiple types are not supported for condition. */
6352 break;
6355 /* Handle uses. */
6356 if (j == 0)
6358 if (slp_node)
6360 /* Get vec defs for all the operands except the reduction index,
6361 ensuring the ordering of the ops in the vector is kept. */
6362 auto_vec<tree, 3> slp_ops;
6363 auto_vec<vec<tree>, 3> vec_defs;
6365 slp_ops.quick_push (ops[0]);
6366 slp_ops.quick_push (ops[1]);
6367 if (op_type == ternary_op)
6368 slp_ops.quick_push (ops[2]);
6370 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6372 vec_oprnds0.safe_splice (vec_defs[0]);
6373 vec_defs[0].release ();
6374 vec_oprnds1.safe_splice (vec_defs[1]);
6375 vec_defs[1].release ();
6376 if (op_type == ternary_op)
6378 vec_oprnds2.safe_splice (vec_defs[2]);
6379 vec_defs[2].release ();
6382 else
6384 vec_oprnds0.quick_push
6385 (vect_get_vec_def_for_operand (ops[0], stmt));
6386 vec_oprnds1.quick_push
6387 (vect_get_vec_def_for_operand (ops[1], stmt));
6388 if (op_type == ternary_op)
6389 vec_oprnds2.quick_push
6390 (vect_get_vec_def_for_operand (ops[2], stmt));
6393 else
6395 if (!slp_node)
6397 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6399 if (single_defuse_cycle && reduc_index == 0)
6400 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6401 else
6402 vec_oprnds0[0]
6403 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6404 if (single_defuse_cycle && reduc_index == 1)
6405 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6406 else
6407 vec_oprnds1[0]
6408 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6409 if (op_type == ternary_op)
6411 if (single_defuse_cycle && reduc_index == 2)
6412 vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6413 else
6414 vec_oprnds2[0]
6415 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6420 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6422 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6423 if (op_type == ternary_op)
6424 vop[2] = vec_oprnds2[i];
6426 new_temp = make_ssa_name (vec_dest, new_stmt);
6427 new_stmt = gimple_build_assign (new_temp, code,
6428 vop[0], vop[1], vop[2]);
6429 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6431 if (slp_node)
6433 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6434 vect_defs.quick_push (new_temp);
6436 else
6437 vect_defs[0] = new_temp;
6440 if (slp_node)
6441 continue;
6443 if (j == 0)
6444 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6445 else
6446 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6448 prev_stmt_info = vinfo_for_stmt (new_stmt);
6451 /* Finalize the reduction-phi (set its arguments) and create the
6452 epilog reduction code. */
6453 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6454 vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6456 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6457 epilog_copies, reduc_fn, phis,
6458 double_reduc, slp_node, slp_node_instance);
6460 return true;
6463 /* Function vect_min_worthwhile_factor.
6465 For a loop where we could vectorize the operation indicated by CODE,
6466 return the minimum vectorization factor that makes it worthwhile
6467 to use generic vectors. */
6469 vect_min_worthwhile_factor (enum tree_code code)
6471 switch (code)
6473 case PLUS_EXPR:
6474 case MINUS_EXPR:
6475 case NEGATE_EXPR:
6476 return 4;
6478 case BIT_AND_EXPR:
6479 case BIT_IOR_EXPR:
6480 case BIT_XOR_EXPR:
6481 case BIT_NOT_EXPR:
6482 return 2;
6484 default:
6485 return INT_MAX;
6489 /* Return true if VINFO indicates we are doing loop vectorization and if
6490 it is worth decomposing CODE operations into scalar operations for
6491 that loop's vectorization factor. */
6493 bool
6494 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6496 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6497 return (loop_vinfo
6498 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6499 >= vect_min_worthwhile_factor (code)));
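/* For instance, assuming a loop with vectorization factor 4: PLUS_EXPR
   (minimum worthwhile factor 4) is still considered worth emulating with
   word-mode "generic vectors", whereas any code not listed in
   vect_min_worthwhile_factor (minimum factor INT_MAX) is not.  */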
6502 /* Function vectorizable_induction
6504 Check if PHI performs an induction computation that can be vectorized.
6505 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6506 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6507 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6509 bool
6510 vectorizable_induction (gimple *phi,
6511 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6512 gimple **vec_stmt, slp_tree slp_node)
6514 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6515 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6516 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6517 unsigned ncopies;
6518 bool nested_in_vect_loop = false;
6519 struct loop *iv_loop;
6520 tree vec_def;
6521 edge pe = loop_preheader_edge (loop);
6522 basic_block new_bb;
6523 tree new_vec, vec_init, vec_step, t;
6524 tree new_name;
6525 gimple *new_stmt;
6526 gphi *induction_phi;
6527 tree induc_def, vec_dest;
6528 tree init_expr, step_expr;
6529 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6530 unsigned i;
6531 tree expr;
6532 gimple_seq stmts;
6533 imm_use_iterator imm_iter;
6534 use_operand_p use_p;
6535 gimple *exit_phi;
6536 edge latch_e;
6537 tree loop_arg;
6538 gimple_stmt_iterator si;
6539 basic_block bb = gimple_bb (phi);
6541 if (gimple_code (phi) != GIMPLE_PHI)
6542 return false;
6544 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6545 return false;
6547 /* Make sure it was recognized as induction computation. */
6548 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6549 return false;
6551 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6552 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6554 if (slp_node)
6555 ncopies = 1;
6556 else
6557 ncopies = vect_get_num_copies (loop_vinfo, vectype);
6558 gcc_assert (ncopies >= 1);
6560 /* FORNOW. These restrictions should be relaxed. */
6561 if (nested_in_vect_loop_p (loop, phi))
6563 imm_use_iterator imm_iter;
6564 use_operand_p use_p;
6565 gimple *exit_phi;
6566 edge latch_e;
6567 tree loop_arg;
6569 if (ncopies > 1)
6571 if (dump_enabled_p ())
6572 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6573 "multiple types in nested loop.\n");
6574 return false;
6577 /* FORNOW: outer loop induction with SLP not supported. */
6578 if (STMT_SLP_TYPE (stmt_info))
6579 return false;
6581 exit_phi = NULL;
6582 latch_e = loop_latch_edge (loop->inner);
6583 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6584 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6586 gimple *use_stmt = USE_STMT (use_p);
6587 if (is_gimple_debug (use_stmt))
6588 continue;
6590 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6592 exit_phi = use_stmt;
6593 break;
6596 if (exit_phi)
6598 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
6599 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6600 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6602 if (dump_enabled_p ())
6603 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6604 "inner-loop induction only used outside "
6605 "of the outer vectorized loop.\n");
6606 return false;
6610 nested_in_vect_loop = true;
6611 iv_loop = loop->inner;
6613 else
6614 iv_loop = loop;
6615 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6617 if (!vec_stmt) /* transformation not required. */
6619 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6620 if (dump_enabled_p ())
6621 dump_printf_loc (MSG_NOTE, vect_location,
6622 "=== vectorizable_induction ===\n");
6623 vect_model_induction_cost (stmt_info, ncopies);
6624 return true;
6627 /* Transform. */
6629 /* Compute a vector variable, initialized with the first VF values of
6630 the induction variable. E.g., for an iv with IV_PHI='X' and
6631 evolution S, for a vector of 4 units, we want to compute:
6632 [X, X + S, X + 2*S, X + 3*S]. */
6634 if (dump_enabled_p ())
6635 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6637 latch_e = loop_latch_edge (iv_loop);
6638 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6640 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6641 gcc_assert (step_expr != NULL_TREE);
6643 pe = loop_preheader_edge (iv_loop);
6644 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6645 loop_preheader_edge (iv_loop));
6647 /* Convert the step to the desired type. */
6648 stmts = NULL;
6649 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6650 if (stmts)
6652 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6653 gcc_assert (!new_bb);
6656 /* Find the first insertion point in the BB. */
6657 si = gsi_after_labels (bb);
6659 /* For SLP induction we have to generate several IVs; for example,
6660 with group size 3 we need [i, i, i, i + S], [i + S, i + S, i + 2*S, i + 2*S],
6661 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
6662 [VF*S, VF*S, VF*S, VF*S] for all. */
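/* Continuing the group-size-3 sketch above, assuming nunits == 4:
   least_common_multiple (3, 4) == 12, so nivs == 12 / 4 == 3 distinct
   initial vectors are built below; any further copies (when nvects > nivs)
   reuse them by adding a uniform step vector.  */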
6663 if (slp_node)
6665 /* Convert the init to the desired type. */
6666 stmts = NULL;
6667 init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6668 if (stmts)
6670 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6671 gcc_assert (!new_bb);
6674 /* Generate [VF*S, VF*S, ... ]. */
6675 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6677 expr = build_int_cst (integer_type_node, vf);
6678 expr = fold_convert (TREE_TYPE (step_expr), expr);
6680 else
6681 expr = build_int_cst (TREE_TYPE (step_expr), vf);
6682 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6683 expr, step_expr);
6684 if (! CONSTANT_CLASS_P (new_name))
6685 new_name = vect_init_vector (phi, new_name,
6686 TREE_TYPE (step_expr), NULL);
6687 new_vec = build_vector_from_val (vectype, new_name);
6688 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6690 /* Now generate the IVs. */
6691 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6692 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6693 unsigned elts = nunits * nvects;
6694 unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6695 gcc_assert (elts % group_size == 0);
6696 tree elt = init_expr;
6697 unsigned ivn;
6698 for (ivn = 0; ivn < nivs; ++ivn)
6700 auto_vec<tree, 32> elts (nunits);
6701 stmts = NULL;
6702 for (unsigned eltn = 0; eltn < nunits; ++eltn)
6704 if (ivn*nunits + eltn >= group_size
6705 && (ivn*nunits + eltn) % group_size == 0)
6706 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6707 elt, step_expr);
6708 elts.quick_push (elt);
6710 vec_init = gimple_build_vector (&stmts, vectype, elts);
6711 if (stmts)
6713 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6714 gcc_assert (!new_bb);
6717 /* Create the induction-phi that defines the induction-operand. */
6718 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6719 induction_phi = create_phi_node (vec_dest, iv_loop->header);
6720 set_vinfo_for_stmt (induction_phi,
6721 new_stmt_vec_info (induction_phi, loop_vinfo));
6722 induc_def = PHI_RESULT (induction_phi);
6724 /* Create the iv update inside the loop */
6725 vec_def = make_ssa_name (vec_dest);
6726 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6727 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6728 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6730 /* Set the arguments of the phi node: */
6731 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6732 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6733 UNKNOWN_LOCATION);
6735 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6738 /* Re-use IVs when we can. */
6739 if (ivn < nvects)
6741 unsigned vfp
6742 = least_common_multiple (group_size, nunits) / group_size;
6743 /* Generate [VF'*S, VF'*S, ... ]. */
6744 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6746 expr = build_int_cst (integer_type_node, vfp);
6747 expr = fold_convert (TREE_TYPE (step_expr), expr);
6749 else
6750 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6751 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6752 expr, step_expr);
6753 if (! CONSTANT_CLASS_P (new_name))
6754 new_name = vect_init_vector (phi, new_name,
6755 TREE_TYPE (step_expr), NULL);
6756 new_vec = build_vector_from_val (vectype, new_name);
6757 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6758 for (; ivn < nvects; ++ivn)
6760 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6761 tree def;
6762 if (gimple_code (iv) == GIMPLE_PHI)
6763 def = gimple_phi_result (iv);
6764 else
6765 def = gimple_assign_lhs (iv);
6766 new_stmt = gimple_build_assign (make_ssa_name (vectype),
6767 PLUS_EXPR,
6768 def, vec_step);
6769 if (gimple_code (iv) == GIMPLE_PHI)
6770 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6771 else
6773 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6774 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6776 set_vinfo_for_stmt (new_stmt,
6777 new_stmt_vec_info (new_stmt, loop_vinfo));
6778 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6782 return true;
6785 /* Create the vector that holds the initial_value of the induction. */
6786 if (nested_in_vect_loop)
6788 /* iv_loop is nested in the loop to be vectorized. init_expr had already
6789 been created during vectorization of previous stmts. We obtain it
6790 from the STMT_VINFO_VEC_STMT of the defining stmt. */
6791 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6792 /* If the initial value is not of proper type, convert it. */
6793 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6795 new_stmt
6796 = gimple_build_assign (vect_get_new_ssa_name (vectype,
6797 vect_simple_var,
6798 "vec_iv_"),
6799 VIEW_CONVERT_EXPR,
6800 build1 (VIEW_CONVERT_EXPR, vectype,
6801 vec_init));
6802 vec_init = gimple_assign_lhs (new_stmt);
6803 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6804 new_stmt);
6805 gcc_assert (!new_bb);
6806 set_vinfo_for_stmt (new_stmt,
6807 new_stmt_vec_info (new_stmt, loop_vinfo));
6810 else
6812 /* iv_loop is the loop to be vectorized. Create:
6813 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
6814 stmts = NULL;
6815 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6817 auto_vec<tree, 32> elts (nunits);
6818 elts.quick_push (new_name);
6819 for (i = 1; i < nunits; i++)
6821 /* Create: new_name_i = new_name + step_expr */
6822 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6823 new_name, step_expr);
6824 elts.quick_push (new_name);
6826 /* Create a vector from [new_name_0, new_name_1, ...,
6827 new_name_nunits-1] */
6828 vec_init = gimple_build_vector (&stmts, vectype, elts);
6829 if (stmts)
6831 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6832 gcc_assert (!new_bb);
6837 /* Create the vector that holds the step of the induction. */
6838 if (nested_in_vect_loop)
6839 /* iv_loop is nested in the loop to be vectorized. Generate:
6840 vec_step = [S, S, S, S] */
6841 new_name = step_expr;
6842 else
6844 /* iv_loop is the loop to be vectorized. Generate:
6845 vec_step = [VF*S, VF*S, VF*S, VF*S] */
6846 gimple_seq seq = NULL;
6847 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6849 expr = build_int_cst (integer_type_node, vf);
6850 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6852 else
6853 expr = build_int_cst (TREE_TYPE (step_expr), vf);
6854 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6855 expr, step_expr);
6856 if (seq)
6858 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6859 gcc_assert (!new_bb);
6863 t = unshare_expr (new_name);
6864 gcc_assert (CONSTANT_CLASS_P (new_name)
6865 || TREE_CODE (new_name) == SSA_NAME);
6866 new_vec = build_vector_from_val (vectype, t);
6867 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6870 /* Create the following def-use cycle:
6871 loop prolog:
6872 vec_init = ...
6873 vec_step = ...
6874 loop:
6875 vec_iv = PHI <vec_init, vec_loop>
6877 STMT
6879 vec_loop = vec_iv + vec_step; */
6881 /* Create the induction-phi that defines the induction-operand. */
6882 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6883 induction_phi = create_phi_node (vec_dest, iv_loop->header);
6884 set_vinfo_for_stmt (induction_phi,
6885 new_stmt_vec_info (induction_phi, loop_vinfo));
6886 induc_def = PHI_RESULT (induction_phi);
6888 /* Create the iv update inside the loop */
6889 vec_def = make_ssa_name (vec_dest);
6890 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6891 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6892 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6894 /* Set the arguments of the phi node: */
6895 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6896 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6897 UNKNOWN_LOCATION);
6899 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
6901 /* In case that vectorization factor (VF) is bigger than the number
6902 of elements that we can fit in a vectype (nunits), we have to generate
6903 more than one vector stmt, i.e. we need to "unroll" the
6904 vector stmt by a factor VF/nunits. For more details see documentation
6905 in vectorizable_operation. */
6907 if (ncopies > 1)
6909 gimple_seq seq = NULL;
6910 stmt_vec_info prev_stmt_vinfo;
6911 /* FORNOW. This restriction should be relaxed. */
6912 gcc_assert (!nested_in_vect_loop);
6914 /* Create the vector that holds the step of the induction. */
6915 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6917 expr = build_int_cst (integer_type_node, nunits);
6918 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6920 else
6921 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
6922 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6923 expr, step_expr);
6924 if (seq)
6926 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6927 gcc_assert (!new_bb);
6930 t = unshare_expr (new_name);
6931 gcc_assert (CONSTANT_CLASS_P (new_name)
6932 || TREE_CODE (new_name) == SSA_NAME);
6933 new_vec = build_vector_from_val (vectype, t);
6934 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6936 vec_def = induc_def;
6937 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
6938 for (i = 1; i < ncopies; i++)
6940 /* vec_i = vec_prev + vec_step */
6941 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
6942 vec_def, vec_step);
6943 vec_def = make_ssa_name (vec_dest, new_stmt);
6944 gimple_assign_set_lhs (new_stmt, vec_def);
6946 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6947 set_vinfo_for_stmt (new_stmt,
6948 new_stmt_vec_info (new_stmt, loop_vinfo));
6949 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
6950 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
6954 if (nested_in_vect_loop)
6956 /* Find the loop-closed exit-phi of the induction, and record
6957 the final vector of induction results: */
6958 exit_phi = NULL;
6959 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6961 gimple *use_stmt = USE_STMT (use_p);
6962 if (is_gimple_debug (use_stmt))
6963 continue;
6965 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
6967 exit_phi = use_stmt;
6968 break;
6971 if (exit_phi)
6973 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
6974 /* FORNOW. Currently not supporting the case that an inner-loop induction
6975 is not used in the outer-loop (i.e. only outside the outer-loop). */
6976 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
6977 && !STMT_VINFO_LIVE_P (stmt_vinfo));
6979 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
6980 if (dump_enabled_p ())
6982 dump_printf_loc (MSG_NOTE, vect_location,
6983 "vector of inductions after inner-loop:");
6984 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
6990 if (dump_enabled_p ())
6992 dump_printf_loc (MSG_NOTE, vect_location,
6993 "transform induction: created def-use cycle: ");
6994 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
6995 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6996 SSA_NAME_DEF_STMT (vec_def), 0);
6999 return true;
7002 /* Function vectorizable_live_operation.
7004 STMT computes a value that is used outside the loop. Check if
7005 it can be supported. */
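/* For example (a sketch), the value of last left live after this loop:

     for (i = 0; i < N; i++)
       last = a[i];
     use (last);

   is produced after vectorization by extracting the last lane of the last
   vector of loaded values, using the BIT_FIELD_REF built below.  */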
7007 bool
7008 vectorizable_live_operation (gimple *stmt,
7009 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7010 slp_tree slp_node, int slp_index,
7011 gimple **vec_stmt)
7013 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7014 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7015 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7016 imm_use_iterator imm_iter;
7017 tree lhs, lhs_type, bitsize, vec_bitsize;
7018 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7019 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
7020 int ncopies;
7021 gimple *use_stmt;
7022 auto_vec<tree> vec_oprnds;
7024 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7026 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7027 return false;
7029 /* FORNOW. CHECKME. */
7030 if (nested_in_vect_loop_p (loop, stmt))
7031 return false;
7033 /* If STMT is not relevant and it is a simple assignment and its inputs are
7034 invariant then it can remain in place, unvectorized. The original last
7035 scalar value that it computes will be used. */
7036 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7038 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7039 if (dump_enabled_p ())
7040 dump_printf_loc (MSG_NOTE, vect_location,
7041 "statement is simple and uses invariant. Leaving in "
7042 "place.\n");
7043 return true;
7046 if (slp_node)
7047 ncopies = 1;
7048 else
7049 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7051 if (!vec_stmt)
7052 /* No transformation required. */
7053 return true;
7055 /* If stmt has a related stmt, then use that for getting the lhs. */
7056 if (is_pattern_stmt_p (stmt_info))
7057 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7059 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7060 : gimple_get_lhs (stmt);
7061 lhs_type = TREE_TYPE (lhs);
7063 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7064 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7065 : TYPE_SIZE (TREE_TYPE (vectype)));
7066 vec_bitsize = TYPE_SIZE (vectype);
7068 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7069 tree vec_lhs, bitstart;
7070 if (slp_node)
7072 gcc_assert (slp_index >= 0);
7074 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7075 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7077 /* Get the last occurrence of the scalar index from the concatenation of
7078 all the slp vectors. Calculate which slp vector it is and the index
7079 within. */
7080 int pos = (num_vec * nunits) - num_scalar + slp_index;
7081 int vec_entry = pos / nunits;
7082 int vec_index = pos % nunits;
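/* For instance, assuming num_scalar == 2, nunits == 4, num_vec == 2
   and slp_index == 1: pos == 2 * 4 - 2 + 1 == 7, so the live value
   sits in lane 3 (vec_index) of vectorized stmt 1 (vec_entry).  */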
7084 /* Get the correct slp vectorized stmt. */
7085 vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7087 /* Get entry to use. */
7088 bitstart = bitsize_int (vec_index);
7089 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7091 else
7093 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7094 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7096 /* For multiple copies, get the last copy. */
7097 for (int i = 1; i < ncopies; ++i)
7098 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7099 vec_lhs);
7101 /* Get the last lane in the vector. */
7102 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7105 /* Create a new vectorized stmt for the uses of STMT and insert outside the
7106 loop. */
7107 gimple_seq stmts = NULL;
7108 tree bftype = TREE_TYPE (vectype);
7109 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7110 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7111 tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7112 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7113 true, NULL_TREE);
7114 if (stmts)
7115 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7117 /* Replace the use of lhs with the newly computed result. If the use stmt
7118 is a single-arg PHI, just replace all uses of the PHI result. This is
7119 necessary because the lcssa PHI defining lhs may precede the new stmt. */
7120 use_operand_p use_p;
7121 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7122 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7123 && !is_gimple_debug (use_stmt))
7125 if (gimple_code (use_stmt) == GIMPLE_PHI
7126 && gimple_phi_num_args (use_stmt) == 1)
7128 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7130 else
7132 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7133 SET_USE (use_p, new_tree);
7135 update_stmt (use_stmt);
7138 return true;
7141 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
7143 static void
7144 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7146 ssa_op_iter op_iter;
7147 imm_use_iterator imm_iter;
7148 def_operand_p def_p;
7149 gimple *ustmt;
7151 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7153 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7155 basic_block bb;
7157 if (!is_gimple_debug (ustmt))
7158 continue;
7160 bb = gimple_bb (ustmt);
7162 if (!flow_bb_inside_loop_p (loop, bb))
7164 if (gimple_debug_bind_p (ustmt))
7166 if (dump_enabled_p ())
7167 dump_printf_loc (MSG_NOTE, vect_location,
7168 "killing debug use\n");
7170 gimple_debug_bind_reset_value (ustmt);
7171 update_stmt (ustmt);
7173 else
7174 gcc_unreachable ();
7180 /* Given loop represented by LOOP_VINFO, return true if computation of
7181 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7182 otherwise. */
7184 static bool
7185 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7187 /* Constant case. */
7188 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7190 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7191 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7193 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7194 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7195 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7196 return true;
7199 widest_int max;
7200 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7201 /* Check the upper bound of loop niters. */
7202 if (get_max_loop_iterations (loop, &max))
7204 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7205 signop sgn = TYPE_SIGN (type);
7206 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7207 if (max < type_max)
7208 return true;
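  /* Otherwise NITERSM1 + 1 may wrap around.  Hypothetical example: if the
     niters type is an unsigned 32-bit type and the latch can execute up to
     0xffffffff times, NITERSM1 + 1 overflows to 0, so neither check above
     succeeds and we conservatively return false.  */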
7210 return false;
7213 /* Scale the profile counters of LOOP, which is vectorized by factor VF,
7214    according to the estimated iteration count of the vectorized loop. */
7216 static void
7217 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7219 edge preheader = loop_preheader_edge (loop);
7220 /* Reduce loop iterations by the vectorization factor. */
7221 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7222 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7224 if (freq_h.nonzero_p ())
7226 profile_probability p;
7228 /* Avoid dropping loop body profile counter to 0 because of zero count
7229 in loop's preheader. */
7230 if (!(freq_e == profile_count::zero ()))
7231 freq_e = freq_e.force_nonzero ();
7232 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7233 scale_loop_frequencies (loop, p);
7236 edge exit_e = single_exit (loop);
7237 exit_e->probability = profile_probability::always ()
7238 .apply_scale (1, new_est_niter + 1);
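  /* Illustration (hypothetical count): if new_est_niter is 3, the exit edge
     gets probability 1/4, i.e. the vectorized loop is expected to iterate
     about four times per entry.  */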
7240 edge exit_l = single_pred_edge (loop->latch);
7241 profile_probability prob = exit_l->probability;
7242 exit_l->probability = exit_e->probability.invert ();
7243 if (prob.initialized_p () && exit_l->probability.initialized_p ())
7244 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7247 /* Function vect_transform_loop.
7249 The analysis phase has determined that the loop is vectorizable.
7250    Vectorize the loop - create vectorized stmts to replace the scalar
7251    stmts in the loop, and update the loop exit condition.
7252    Returns the scalar epilogue loop, if any. */
7254 struct loop *
7255 vect_transform_loop (loop_vec_info loop_vinfo)
7257 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7258 struct loop *epilogue = NULL;
7259 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7260 int nbbs = loop->num_nodes;
7261 int i;
7262 tree niters_vector = NULL;
7263 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7264 bool grouped_store;
7265 bool slp_scheduled = false;
7266 gimple *stmt, *pattern_stmt;
7267 gimple_seq pattern_def_seq = NULL;
7268 gimple_stmt_iterator pattern_def_si = gsi_none ();
7269 bool transform_pattern_stmt = false;
7270 bool check_profitability = false;
7271 int th;
7273 if (dump_enabled_p ())
7274 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
7276   /* Use the more conservative vectorization threshold.  If the number
7277      of iterations is constant, assume the cost check has been performed
7278      by our caller.  If the threshold makes all loops profitable that
7279      run at least the vectorization factor number of times, checking
7280      is pointless, too. */
7281 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7282 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7283 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7285 if (dump_enabled_p ())
7286 dump_printf_loc (MSG_NOTE, vect_location,
7287 "Profitability threshold is %d loop iterations.\n",
7288 th);
7289 check_profitability = true;
7292 /* Make sure there exists a single-predecessor exit bb. Do this before
7293 versioning. */
7294 edge e = single_exit (loop);
7295 if (! single_pred_p (e->dest))
7297 split_loop_exit_edge (e);
7298 if (dump_enabled_p ())
7299 dump_printf (MSG_NOTE, "split exit edge\n");
7302 /* Version the loop first, if required, so the profitability check
7303 comes first. */
7305 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7307 vect_loop_versioning (loop_vinfo, th, check_profitability);
7308 check_profitability = false;
7311   /* Make sure a single-predecessor exit bb also exists on the scalar
7312      loop copy.  Do this after versioning but before peeling so the CFG
7313      structure is the same for both the scalar and the if-converted loop,
7314      which lets slpeel_duplicate_current_defs_from_edges see matching
7315      loop-closed PHI nodes on the exit. */
7316 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7318 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7319 if (! single_pred_p (e->dest))
7321 split_loop_exit_edge (e);
7322 if (dump_enabled_p ())
7323 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7327 tree niters = vect_build_loop_niters (loop_vinfo);
7328 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7329 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7330 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7331 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
7332 check_profitability, niters_no_overflow);
7333 if (niters_vector == NULL_TREE)
7335 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7336 niters_vector
7337 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7338 LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
7339 else
7340 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7341 niters_no_overflow);
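      /* Illustrative numbers only: with a known NITERS of 17 and vf = 4, the
	 branch above builds niters_vector as the constant 4; the leftover
	 scalar iteration is handled by the epilogue created by
	 vect_do_peeling.  */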
7344 /* 1) Make sure the loop header has exactly two entries
7345 2) Make sure we have a preheader basic block. */
7347 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7349 split_edge (loop_preheader_edge (loop));
7351   /* FORNOW: the vectorizer supports only loops whose body consists
7352      of one basic block (header + empty latch).  When the vectorizer
7353      supports more involved loop forms, the order in which the BBs are
7354      traversed will need to be reconsidered. */
7356 for (i = 0; i < nbbs; i++)
7358 basic_block bb = bbs[i];
7359 stmt_vec_info stmt_info;
7361 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7362 gsi_next (&si))
7364 gphi *phi = si.phi ();
7365 if (dump_enabled_p ())
7367 dump_printf_loc (MSG_NOTE, vect_location,
7368 "------>vectorizing phi: ");
7369 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7371 stmt_info = vinfo_for_stmt (phi);
7372 if (!stmt_info)
7373 continue;
7375 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7376 vect_loop_kill_debug_uses (loop, phi);
7378 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7379 && !STMT_VINFO_LIVE_P (stmt_info))
7380 continue;
7382 if (STMT_VINFO_VECTYPE (stmt_info)
7383 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7384 != (unsigned HOST_WIDE_INT) vf)
7385 && dump_enabled_p ())
7386 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7388 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7389 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7390 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7391 && ! PURE_SLP_STMT (stmt_info))
7393 if (dump_enabled_p ())
7394 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7395 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7399 pattern_stmt = NULL;
7400 for (gimple_stmt_iterator si = gsi_start_bb (bb);
7401 !gsi_end_p (si) || transform_pattern_stmt;)
7403 bool is_store;
7405 if (transform_pattern_stmt)
7406 stmt = pattern_stmt;
7407 else
7409 stmt = gsi_stmt (si);
7410 /* During vectorization remove existing clobber stmts. */
7411 if (gimple_clobber_p (stmt))
7413 unlink_stmt_vdef (stmt);
7414 gsi_remove (&si, true);
7415 release_defs (stmt);
7416 continue;
7420 if (dump_enabled_p ())
7422 dump_printf_loc (MSG_NOTE, vect_location,
7423 "------>vectorizing statement: ");
7424 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7427 stmt_info = vinfo_for_stmt (stmt);
7429 /* vector stmts created in the outer-loop during vectorization of
7430 stmts in an inner-loop may not have a stmt_info, and do not
7431 need to be vectorized. */
7432 if (!stmt_info)
7434 gsi_next (&si);
7435 continue;
7438 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7439 vect_loop_kill_debug_uses (loop, stmt);
7441 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7442 && !STMT_VINFO_LIVE_P (stmt_info))
7444 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7445 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7446 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7447 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7449 stmt = pattern_stmt;
7450 stmt_info = vinfo_for_stmt (stmt);
7452 else
7454 gsi_next (&si);
7455 continue;
7458 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7459 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7460 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7461 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7462 transform_pattern_stmt = true;
7464 /* If pattern statement has def stmts, vectorize them too. */
7465 if (is_pattern_stmt_p (stmt_info))
7467 if (pattern_def_seq == NULL)
7469 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7470 pattern_def_si = gsi_start (pattern_def_seq);
7472 else if (!gsi_end_p (pattern_def_si))
7473 gsi_next (&pattern_def_si);
7474 if (pattern_def_seq != NULL)
7476 gimple *pattern_def_stmt = NULL;
7477 stmt_vec_info pattern_def_stmt_info = NULL;
7479 while (!gsi_end_p (pattern_def_si))
7481 pattern_def_stmt = gsi_stmt (pattern_def_si);
7482 pattern_def_stmt_info
7483 = vinfo_for_stmt (pattern_def_stmt);
7484 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7485 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7486 break;
7487 gsi_next (&pattern_def_si);
7490 if (!gsi_end_p (pattern_def_si))
7492 if (dump_enabled_p ())
7494 dump_printf_loc (MSG_NOTE, vect_location,
7495 "==> vectorizing pattern def "
7496 "stmt: ");
7497 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7498 pattern_def_stmt, 0);
7501 stmt = pattern_def_stmt;
7502 stmt_info = pattern_def_stmt_info;
7504 else
7506 pattern_def_si = gsi_none ();
7507 transform_pattern_stmt = false;
7510 else
7511 transform_pattern_stmt = false;
7514 if (STMT_VINFO_VECTYPE (stmt_info))
7516 unsigned int nunits
7517 = (unsigned int)
7518 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7519 if (!STMT_SLP_TYPE (stmt_info)
7520 && nunits != (unsigned int) vf
7521 && dump_enabled_p ())
7522 	    /* For SLP, VF is set according to the unrolling factor and not
7523 	       the vector size, hence this diagnostic is not valid for SLP. */
7524 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7527 /* SLP. Schedule all the SLP instances when the first SLP stmt is
7528 reached. */
7529 if (STMT_SLP_TYPE (stmt_info))
7531 if (!slp_scheduled)
7533 slp_scheduled = true;
7535 if (dump_enabled_p ())
7536 dump_printf_loc (MSG_NOTE, vect_location,
7537 "=== scheduling SLP instances ===\n");
7539 vect_schedule_slp (loop_vinfo);
7542 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7543 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7545 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7547 pattern_def_seq = NULL;
7548 gsi_next (&si);
7550 continue;
7554 /* -------- vectorize statement ------------ */
7555 if (dump_enabled_p ())
7556 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7558 grouped_store = false;
7559 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7560 if (is_store)
7562 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7564 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7565 interleaving chain was completed - free all the stores in
7566 the chain. */
7567 gsi_next (&si);
7568 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7570 else
7572 /* Free the attached stmt_vec_info and remove the stmt. */
7573 gimple *store = gsi_stmt (si);
7574 free_stmt_vec_info (store);
7575 unlink_stmt_vdef (store);
7576 gsi_remove (&si, true);
7577 release_defs (store);
7580 /* Stores can only appear at the end of pattern statements. */
7581 gcc_assert (!transform_pattern_stmt);
7582 pattern_def_seq = NULL;
7584 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7586 pattern_def_seq = NULL;
7587 gsi_next (&si);
7589 } /* stmts in BB */
7590 } /* BBs in loop */
7592 slpeel_make_loop_iterate_ntimes (loop, niters_vector);
7594 scale_profile_for_vect_loop (loop, vf);
7596 /* The minimum number of iterations performed by the epilogue. This
7597 is 1 when peeling for gaps because we always need a final scalar
7598 iteration. */
7599 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7600 /* +1 to convert latch counts to loop iteration counts,
7601 -min_epilogue_iters to remove iterations that cannot be performed
7602 by the vector code. */
7603 int bias = 1 - min_epilogue_iters;
7604 /* In these calculations the "- 1" converts loop iteration counts
7605 back to latch counts. */
7606 if (loop->any_upper_bound)
7607 loop->nb_iterations_upper_bound
7608 = wi::udiv_floor (loop->nb_iterations_upper_bound + bias, vf) - 1;
7609 if (loop->any_likely_upper_bound)
7610 loop->nb_iterations_likely_upper_bound
7611 = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, vf) - 1;
7612 if (loop->any_estimate)
7613 loop->nb_iterations_estimate
7614 = wi::udiv_floor (loop->nb_iterations_estimate + bias, vf) - 1;
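  /* Worked example with made-up numbers: without peeling for gaps, bias = 1;
     if the latch-count upper bound was 10 (at most 11 iterations) and vf = 4,
     the new bound is 11 / 4 - 1 = 1, i.e. at most two iterations of the
     vectorized loop.  */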
7616 if (dump_enabled_p ())
7618 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7620 dump_printf_loc (MSG_NOTE, vect_location,
7621 "LOOP VECTORIZED\n");
7622 if (loop->inner)
7623 dump_printf_loc (MSG_NOTE, vect_location,
7624 "OUTER LOOP VECTORIZED\n");
7625 dump_printf (MSG_NOTE, "\n");
7627 else
7628 dump_printf_loc (MSG_NOTE, vect_location,
7629 "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
7630 current_vector_size);
7633 /* Free SLP instances here because otherwise stmt reference counting
7634 won't work. */
7635 slp_instance instance;
7636 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7637 vect_free_slp_instance (instance);
7638 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7639   /* Clear the safelen field since its value is invalid after vectorization:
7640      the vectorized loop can have loop-carried dependencies. */
7641 loop->safelen = 0;
7643   /* Don't vectorize the epilogue of an epilogue loop. */
7644 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7645 epilogue = NULL;
7647 if (epilogue)
7649 unsigned int vector_sizes
7650 = targetm.vectorize.autovectorize_vector_sizes ();
7651 vector_sizes &= current_vector_size - 1;
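      /* This keeps only the vector sizes strictly smaller than the size just
	 used; those are the candidates for vectorizing the epilogue.  */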
7653 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7654 epilogue = NULL;
7655 else if (!vector_sizes)
7656 epilogue = NULL;
7657 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7658 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
7660 int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
7661 int ratio = current_vector_size / smallest_vec_size;
7662 int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
7663 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
7664 eiters = eiters % vf;
7666 epilogue->nb_iterations_upper_bound = eiters - 1;
7668 if (eiters < vf / ratio)
7669 epilogue = NULL;
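	  /* Sketch with hypothetical numbers: if current_vector_size is 32,
	     a 16-byte size is also supported (ratio = 2) and vf = 8, an
	     epilogue with fewer than 8 / 2 = 4 remaining iterations is not
	     worth vectorizing and is dropped here.  */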
7673 if (epilogue)
7675 epilogue->force_vectorize = loop->force_vectorize;
7676 epilogue->safelen = loop->safelen;
7677 epilogue->dont_vectorize = false;
7679 /* We may need to if-convert epilogue to vectorize it. */
7680 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7681 tree_if_conversion (epilogue);
7684 return epilogue;
7687 /* The code below performs a simple optimization: it reverts if-conversion
7688    for masked stores.  If the mask of a store is zero, do not perform the
7689    store, nor, where possible, the statements producing the stored values.
7690 For example,
7691 for (i=0; i<n; i++)
7692 if (c[i])
7694 p1[i] += 1;
7695 p2[i] = p3[i] +2;
7697 this transformation will produce the following semi-hammock:
7699 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7701 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7702 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7703 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7704 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7705 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7706 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7710 void
7711 optimize_mask_stores (struct loop *loop)
7713 basic_block *bbs = get_loop_body (loop);
7714 unsigned nbbs = loop->num_nodes;
7715 unsigned i;
7716 basic_block bb;
7717 struct loop *bb_loop;
7718 gimple_stmt_iterator gsi;
7719 gimple *stmt;
7720 auto_vec<gimple *> worklist;
7722 vect_location = find_loop_location (loop);
7723 /* Pick up all masked stores in loop if any. */
7724 for (i = 0; i < nbbs; i++)
7726 bb = bbs[i];
7727 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7728 gsi_next (&gsi))
7730 stmt = gsi_stmt (gsi);
7731 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7732 worklist.safe_push (stmt);
7736 free (bbs);
7737 if (worklist.is_empty ())
7738 return;
7740 /* Loop has masked stores. */
7741 while (!worklist.is_empty ())
7743 gimple *last, *last_store;
7744 edge e, efalse;
7745 tree mask;
7746 basic_block store_bb, join_bb;
7747 gimple_stmt_iterator gsi_to;
7748 tree vdef, new_vdef;
7749 gphi *phi;
7750 tree vectype;
7751 tree zero;
7753 last = worklist.pop ();
7754 mask = gimple_call_arg (last, 2);
7755 bb = gimple_bb (last);
7756       /* Create then_bb and an if-then structure in the CFG; then_bb belongs
7757 	 to the same loop as if_bb.  That loop can differ from LOOP when a
7758 	 two-level loop nest is vectorized and the mask store belongs to the
7759 	 inner loop. */
7760 e = split_block (bb, last);
7761 bb_loop = bb->loop_father;
7762 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7763 join_bb = e->dest;
7764 store_bb = create_empty_bb (bb);
7765 add_bb_to_loop (store_bb, bb_loop);
7766 e->flags = EDGE_TRUE_VALUE;
7767 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7768       /* Make the edge into STORE_BB the unlikely one. */
7769 efalse->probability = profile_probability::unlikely ();
7770 store_bb->count = efalse->count ();
7771 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7772 if (dom_info_available_p (CDI_DOMINATORS))
7773 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7774 if (dump_enabled_p ())
7775 dump_printf_loc (MSG_NOTE, vect_location,
7776 "Create new block %d to sink mask stores.",
7777 store_bb->index);
7778 /* Create vector comparison with boolean result. */
7779 vectype = TREE_TYPE (mask);
7780 zero = build_zero_cst (vectype);
7781 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7782 gsi = gsi_last_bb (bb);
7783 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7784       /* Create a new PHI node for the vdef of the last masked store:
7785 	 .MEM_2 = VDEF <.MEM_1>
7786 	 will be converted to
7787 	 .MEM_3 = VDEF <.MEM_1>
7788 	 and a new PHI node will be created in the join bb
7789 	 .MEM_2 = PHI <.MEM_1, .MEM_3>
7791 vdef = gimple_vdef (last);
7792 new_vdef = make_ssa_name (gimple_vop (cfun), last);
7793 gimple_set_vdef (last, new_vdef);
7794 phi = create_phi_node (vdef, join_bb);
7795 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7797       /* Move all masked stores with the same mask to STORE_BB if possible. */
7798 while (true)
7800 gimple_stmt_iterator gsi_from;
7801 gimple *stmt1 = NULL;
7803 /* Move masked store to STORE_BB. */
7804 last_store = last;
7805 gsi = gsi_for_stmt (last);
7806 gsi_from = gsi;
7807 /* Shift GSI to the previous stmt for further traversal. */
7808 gsi_prev (&gsi);
7809 gsi_to = gsi_start_bb (store_bb);
7810 gsi_move_before (&gsi_from, &gsi_to);
7811 	  /* Set GSI_TO to the start of the now non-empty block. */
7812 gsi_to = gsi_start_bb (store_bb);
7813 if (dump_enabled_p ())
7815 dump_printf_loc (MSG_NOTE, vect_location,
7816 "Move stmt to created bb\n");
7817 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7819 /* Move all stored value producers if possible. */
7820 while (!gsi_end_p (gsi))
7822 tree lhs;
7823 imm_use_iterator imm_iter;
7824 use_operand_p use_p;
7825 bool res;
7827 /* Skip debug statements. */
7828 if (is_gimple_debug (gsi_stmt (gsi)))
7830 gsi_prev (&gsi);
7831 continue;
7833 stmt1 = gsi_stmt (gsi);
7834 	      /* Do not consider statements writing to memory or having a
7835 		 volatile operand. */
7836 if (gimple_vdef (stmt1)
7837 || gimple_has_volatile_ops (stmt1))
7838 break;
7839 gsi_from = gsi;
7840 gsi_prev (&gsi);
7841 lhs = gimple_get_lhs (stmt1);
7842 if (!lhs)
7843 break;
7845 /* LHS of vectorized stmt must be SSA_NAME. */
7846 if (TREE_CODE (lhs) != SSA_NAME)
7847 break;
7849 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7851 /* Remove dead scalar statement. */
7852 if (has_zero_uses (lhs))
7854 gsi_remove (&gsi_from, true);
7855 continue;
7859 /* Check that LHS does not have uses outside of STORE_BB. */
7860 res = true;
7861 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7863 gimple *use_stmt;
7864 use_stmt = USE_STMT (use_p);
7865 if (is_gimple_debug (use_stmt))
7866 continue;
7867 if (gimple_bb (use_stmt) != store_bb)
7869 res = false;
7870 break;
7873 if (!res)
7874 break;
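	      /* Presumably STMT1 can only be moved when it reads from the
		 same memory state as the store being sunk; a different VUSE
		 would mean another memory access sits in between.  */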
7876 if (gimple_vuse (stmt1)
7877 && gimple_vuse (stmt1) != gimple_vuse (last_store))
7878 break;
7880 /* Can move STMT1 to STORE_BB. */
7881 if (dump_enabled_p ())
7883 dump_printf_loc (MSG_NOTE, vect_location,
7884 "Move stmt to created bb\n");
7885 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7887 gsi_move_before (&gsi_from, &gsi_to);
7888 /* Shift GSI_TO for further insertion. */
7889 gsi_prev (&gsi_to);
7891 /* Put other masked stores with the same mask to STORE_BB. */
7892 if (worklist.is_empty ()
7893 || gimple_call_arg (worklist.last (), 2) != mask
7894 || worklist.last () != stmt1)
7895 break;
7896 last = worklist.pop ();
7898 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);