gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57
  58 /* Loop Vectorization Pass.
  59
  60    This pass tries to vectorize loops.
  61
  62    For example, the vectorizer transforms the following simple loop:
  63
  64         short a[N]; short b[N]; short c[N]; int i;
  65
  66         for (i=0; i<N; i++){
  67           a[i] = b[i] + c[i];
  68         }
  69
  70    as if it was manually vectorized by rewriting the source code into:
  71
  72         typedef int __attribute__((mode(V8HI))) v8hi;
  73         short a[N];  short b[N]; short c[N];   int i;
  74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  75         v8hi va, vb, vc;
  76
  77         for (i=0; i<N/8; i++){
  78           vb = pb[i];
  79           vc = pc[i];
  80           va = vb + vc;
  81           pa[i] = va;
  82         }
  83
  84         The main entry to this pass is vectorize_loops(), in which
  85    the vectorizer applies a set of analyses on a given set of loops,
  86    followed by the actual vectorization transformation for the loops that
  87    had successfully passed the analysis phase.
  88         Throughout this pass we make a distinction between two types of
  89    data: scalars (which are represented by SSA_NAMES), and memory references
  90    ("data-refs").  These two types of data require different handling both
  91    during analysis and transformation. The types of data-refs that the
   92    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  93    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  94    accesses are required to have a simple (consecutive) access pattern.
  95
  96    Analysis phase:
  97    ===============
  98         The driver for the analysis phase is vect_analyze_loop().
  99    It applies a set of analyses, some of which rely on the scalar evolution
 100    analyzer (scev) developed by Sebastian Pop.
 101
 102         During the analysis phase the vectorizer records some information
 103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 104    loop, as well as general information about the loop as a whole, which is
 105    recorded in a "loop_vec_info" struct attached to each loop.
 106
 107    Transformation phase:
 108    =====================
 109         The loop transformation phase scans all the stmts in the loop, and
 110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 111    the loop that needs to be vectorized.  It inserts the vector code sequence
 112    just before the scalar stmt S, and records a pointer to the vector code
 113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 114    attached to S).  This pointer will be used for the vectorization of following
 115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 116    otherwise, we rely on dead code elimination for removing it.
 117
 118         For example, say stmt S1 was vectorized into stmt VS1:
 119
 120    VS1: vb = px[i];
 121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 122    S2:  a = b;
 123
 124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 127    resulting sequence would be:
 128
 129    VS1: vb = px[i];
 130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 131    VS2: va = vb;
 132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 133
 134         Operands that are not SSA_NAMEs, are data-refs that appear in
 135    load/store operations (like 'x[i]' in S1), and are handled differently.
 136
 137    Target modeling:
 138    =================
 139         Currently the only target specific information that is used is the
 140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141    Targets that can support different sizes of vectors, for now will need
 142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 143    flexibility will be added in the future.
 144
  145         Since we only vectorize operations whose vector form can be
  146    expressed using existing tree codes, to verify that an operation is
  147    supported, the vectorizer checks the relevant optab at the relevant
  148    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 149    the value found is CODE_FOR_nothing, then there's no target support, and
 150    we can't vectorize the stmt.
 151
 152    For additional information on this project see:
 153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 154 */
 155
 156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 157
 158 /* Function vect_determine_vectorization_factor
 159
 160    Determine the vectorization factor (VF).  VF is the number of data elements
 161    that are operated upon in parallel in a single iteration of the vectorized
 162    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 163    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 164    elements can fit in a single vector register.
 165
 166    We currently support vectorization of loops in which all types operated upon
 167    are of the same size.  Therefore this function currently sets VF according to
 168    the size of the types operated upon, and fails if there are multiple sizes
 169    in the loop.
 170
 171    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 172    original loop:
 173         for (i=0; i<N; i++){
 174           a[i] = b[i] + c[i];
 175         }
 176
 177    vectorized loop:
 178         for (i=0; i<N; i+=VF){
 179           a[i:VF] = b[i:VF] + c[i:VF];
 180         }
 181 */
 182
  183 static bool
  184 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
  185 {
  186   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  187   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  188   unsigned nbbs = loop->num_nodes;
  189   poly_uint64 vectorization_factor = 1;
  190   tree scalar_type = NULL_TREE;
  191   gphi *phi;
  192   tree vectype;
  193   stmt_vec_info stmt_info;
  194   unsigned i;
  195   HOST_WIDE_INT dummy;
  196   gimple *stmt, *pattern_stmt = NULL;
  197   gimple_seq pattern_def_seq = NULL;
  198   gimple_stmt_iterator pattern_def_si = gsi_none ();
  199   bool analyze_pattern_stmt = false;
  200   bool bool_result;
  201   auto_vec<stmt_vec_info> mask_producers;
  202
  203   if (dump_enabled_p ())
  204     dump_printf_loc (MSG_NOTE, vect_location,
  205                      "=== vect_determine_vectorization_factor ===\n");
  206
        /* First pass: visit every basic block of the loop and pick a vectype
           for each relevant or live PHI and statement.  Each chosen type is
           passed to vect_update_max_nunits to update VECTORIZATION_FACTOR.  */
  207   for (i = 0; i < nbbs; i++)
  208     {
  209       basic_block bb = bbs[i];
  210
            /* PHIs: the vectype comes from the scalar type of the PHI result.  */
  211       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
  212            gsi_next (&si))
  213         {
  214           phi = si.phi ();
  215           stmt_info = vinfo_for_stmt (phi);
  216           if (dump_enabled_p ())
  217             {
  218               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
  219               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
  220             }
  221
  222           gcc_assert (stmt_info);
  223
  224           if (STMT_VINFO_RELEVANT_P (stmt_info)
  225               || STMT_VINFO_LIVE_P (stmt_info))
  226             {
  227               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
  228               scalar_type = TREE_TYPE (PHI_RESULT (phi));
  229
  230               if (dump_enabled_p ())
  231                 {
  232                   dump_printf_loc (MSG_NOTE, vect_location,
  233                                    "get vectype for scalar type:  ");
  234                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  235                   dump_printf (MSG_NOTE, "\n");
  236                 }
  237
  238               vectype = get_vectype_for_scalar_type (scalar_type);
  239               if (!vectype)
  240                 {
  241                   if (dump_enabled_p ())
  242                     {
  243                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  244                                        "not vectorized: unsupported "
  245                                        "data-type ");
  246                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  247                                          scalar_type);
  248                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  249                     }
  250                   return false;
  251                 }
  252               STMT_VINFO_VECTYPE (stmt_info) = vectype;
  253
  254               if (dump_enabled_p ())
  255                 {
  256                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  257                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
  258                   dump_printf (MSG_NOTE, "\n");
  259                 }
  260
  261               if (dump_enabled_p ())
  262                 {
  263                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
  264                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
  265                   dump_printf (MSG_NOTE, "\n");
  266                 }
  267
  268               vect_update_max_nunits (&vectorization_factor, vectype);
  269             }
  270         }
  271
            /* Statements.  The loop header has no increment: SI is advanced
               explicitly, and only once the current statement, any relevant
               statements in its pattern def sequence and its pattern statement
               have been handled.  ANALYZE_PATTERN_STMT keeps the loop running
               when a pattern statement is still pending at the end of BB.  */
  272       for (gimple_stmt_iterator si = gsi_start_bb (bb);
  273            !gsi_end_p (si) || analyze_pattern_stmt;)
  274         {
  275           tree vf_vectype;
  276
                /* A pattern statement queued by the previous iteration takes
                   priority over the statement at SI.  */
  277           if (analyze_pattern_stmt)
  278             stmt = pattern_stmt;
  279           else
  280             stmt = gsi_stmt (si);
  281
  282           stmt_info = vinfo_for_stmt (stmt);
  283
  284           if (dump_enabled_p ())
  285             {
  286               dump_printf_loc (MSG_NOTE, vect_location,
  287                                "==> examining statement: ");
  288               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
  289             }
  290
  291           gcc_assert (stmt_info);
  292
  293           /* Skip stmts which do not need to be vectorized.  */
  294           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
  295                && !STMT_VINFO_LIVE_P (stmt_info))
  296               || gimple_clobber_p (stmt))
  297             {
                    /* The stmt itself is not needed, but its pattern statement
                       may be; if so, analyze that one instead.  */
  298               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
  299                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
  300                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
  301                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
  302                 {
  303                   stmt = pattern_stmt;
  304                   stmt_info = vinfo_for_stmt (pattern_stmt);
  305                   if (dump_enabled_p ())
  306                     {
  307                       dump_printf_loc (MSG_NOTE, vect_location,
  308                                        "==> examining pattern statement: ");
  309                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
  310                     }
  311                 }
  312               else
  313                 {
  314                   if (dump_enabled_p ())
  315                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
  316                   gsi_next (&si);
  317                   continue;
  318                 }
  319             }
                /* Both the stmt and its pattern statement are needed: analyze the
                   stmt now and the pattern statement on the next iteration.  */
  320           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
  321                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
  322                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
  323                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
  324             analyze_pattern_stmt = true;
  325
  326           /* If a pattern statement has def stmts, analyze them too.  */
  327           if (is_pattern_stmt_p (stmt_info))
  328             {
                    /* On the first visit of this pattern statement start at the
                       head of its def sequence; on later visits resume after the
                       def stmt that was handled last time.  */
  329               if (pattern_def_seq == NULL)
  330                 {
  331                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
  332                   pattern_def_si = gsi_start (pattern_def_seq);
  333                 }
  334               else if (!gsi_end_p (pattern_def_si))
  335                 gsi_next (&pattern_def_si);
  336               if (pattern_def_seq != NULL)
  337                 {
  338                   gimple *pattern_def_stmt = NULL;
  339                   stmt_vec_info pattern_def_stmt_info = NULL;
  340
                        /* Skip def stmts that are neither relevant nor live.  */
  341                   while (!gsi_end_p (pattern_def_si))
  342                     {
  343                       pattern_def_stmt = gsi_stmt (pattern_def_si);
  344                       pattern_def_stmt_info
  345                         = vinfo_for_stmt (pattern_def_stmt);
  346                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
  347                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
  348                         break;
  349                       gsi_next (&pattern_def_si);
  350                     }
  351
  352                   if (!gsi_end_p (pattern_def_si))
  353                     {
  354                       if (dump_enabled_p ())
  355                         {
  356                           dump_printf_loc (MSG_NOTE, vect_location,
  357                                            "==> examining pattern def stmt: ");
  358                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
  359                                             pattern_def_stmt, 0);
  360                         }
  361
                            /* Analyze this def stmt in place of the pattern
                               statement for the rest of this iteration.  */
  362                       stmt = pattern_def_stmt;
  363                       stmt_info = pattern_def_stmt_info;
  364                     }
  365                   else
  366                     {
                            /* Def sequence exhausted: go on with the pattern
                               statement itself.  */
  367                       pattern_def_si = gsi_none ();
  368                       analyze_pattern_stmt = false;
  369                     }
  370                 }
  371               else
  372                 analyze_pattern_stmt = false;
  373             }
  374
  375           if (gimple_get_lhs (stmt) == NULL_TREE
  376               /* MASK_STORE has no lhs, but is ok.  */
  377               && (!is_gimple_call (stmt)
  378                   || !gimple_call_internal_p (stmt)
  379                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
  380             {
  381               if (is_gimple_call (stmt))
  382                 {
  383                   /* Ignore calls with no lhs.  These must be calls to
  384                      #pragma omp simd functions, and what vectorization factor
  385                      it really needs can't be determined until
  386                      vectorizable_simd_clone_call.  */
  387                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
  388                     {
  389                       pattern_def_seq = NULL;
  390                       gsi_next (&si);
  391                     }
  392                   continue;
  393                 }
  394               if (dump_enabled_p ())
  395                 {
  396                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  397                                    "not vectorized: irregular stmt.");
  398                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
  399                                     0);
  400                 }
  401               return false;
  402             }
  403
                /* A statement whose type already has a vector mode is rejected.  */
  404           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
  405             {
  406               if (dump_enabled_p ())
  407                 {
  408                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  409                                    "not vectorized: vector stmt in loop:");
  410                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
  411                 }
  412               return false;
  413             }
  414
                /* Set below when STMT produces a scalar boolean.  Such stmts do
                   not get their vectype here; relevant or live ones are queued in
                   MASK_PRODUCERS and typed in the second pass after this loop.  */
  415           bool_result = false;
  416
  417           if (STMT_VINFO_VECTYPE (stmt_info))
  418             {
  419               /* The only case when a vectype had been already set is for stmts
  420                  that contain a dataref, or for "pattern-stmts" (stmts
  421                  generated by the vectorizer to represent/replace a certain
  422                  idiom).  */
  423               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
  424                           || is_pattern_stmt_p (stmt_info)
  425                           || !gsi_end_p (pattern_def_si));
  426               vectype = STMT_VINFO_VECTYPE (stmt_info);
  427             }
  428           else
  429             {
  430               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
  431               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
  432                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
  433               else
  434                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
  435
  436               /* Bool ops don't participate in vectorization factor
  437                  computation.  For comparison use compared types to
  438                  compute a factor.  */
  439               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
  440                   && is_gimple_assign (stmt)
  441                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
  442                 {
  443                   if (STMT_VINFO_RELEVANT_P (stmt_info)
  444                       || STMT_VINFO_LIVE_P (stmt_info))
  445                     mask_producers.safe_push (stmt_info);
  446                   bool_result = true;
  447
  448                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
  449                       == tcc_comparison
  450                       && !VECT_SCALAR_BOOLEAN_TYPE_P
  451                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
  452                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
  453                   else
  454                     {
  455                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
  456                         {
  457                           pattern_def_seq = NULL;
  458                           gsi_next (&si);
  459                         }
  460                       continue;
  461                     }
  462                 }
  463
  464               if (dump_enabled_p ())
  465                 {
  466                   dump_printf_loc (MSG_NOTE, vect_location,
  467                                    "get vectype for scalar type:  ");
  468                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  469                   dump_printf (MSG_NOTE, "\n");
  470                 }
  471               vectype = get_vectype_for_scalar_type (scalar_type);
  472               if (!vectype)
  473                 {
  474                   if (dump_enabled_p ())
  475                     {
  476                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  477                                        "not vectorized: unsupported "
  478                                        "data-type ");
  479                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  480                                          scalar_type);
  481                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  482                     }
  483                   return false;
  484                 }
  485
  486               if (!bool_result)
  487                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
  488
  489               if (dump_enabled_p ())
  490                 {
  491                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  492                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
  493                   dump_printf (MSG_NOTE, "\n");
  494                 }
  495             }
  496
  497           /* Don't try to compute VF from scalar types if the stmt
  498              produces a boolean vector.  Use the result vectype instead.  */
  499           if (VECTOR_BOOLEAN_TYPE_P (vectype))
  500             vf_vectype = vectype;
  501           else
  502             {
  503               /* The vectorization factor is according to the smallest
  504                  scalar type (or the largest vector size, but we only
  505                  support one vector size per loop).  */
  506               if (!bool_result)
  507                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
  508                                                              &dummy);
  509               if (dump_enabled_p ())
  510                 {
  511                   dump_printf_loc (MSG_NOTE, vect_location,
  512                                    "get vectype for scalar type:  ");
  513                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  514                   dump_printf (MSG_NOTE, "\n");
  515                 }
  516               vf_vectype = get_vectype_for_scalar_type (scalar_type);
  517             }
  518           if (!vf_vectype)
  519             {
  520               if (dump_enabled_p ())
  521                 {
  522                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  523                                    "not vectorized: unsupported data-type ");
  524                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  525                                      scalar_type);
  526                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  527                 }
  528               return false;
  529             }
  530
                /* The stmt's own vectype and the type used for the VF must have
                   the same mode size.  */
  531           if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
  532                         GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
  533             {
  534               if (dump_enabled_p ())
  535                 {
  536                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  537                                    "not vectorized: different sized vector "
  538                                    "types in statement, ");
  539                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  540                                      vectype);
  541                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
  542                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  543                                      vf_vectype);
  544                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  545                 }
  546               return false;
  547             }
  548
  549           if (dump_enabled_p ())
  550             {
  551               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  552               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
  553               dump_printf (MSG_NOTE, "\n");
  554             }
  555
  556           if (dump_enabled_p ())
  557             {
  558               dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
  559               dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
  560               dump_printf (MSG_NOTE, "\n");
  561             }
  562
  563           vect_update_max_nunits (&vectorization_factor, vf_vectype);
  564
                /* Move on to the next statement only when no pattern statement
                   and no pattern def stmt is still pending for this one.  */
  565           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
  566             {
  567               pattern_def_seq = NULL;
  568               gsi_next (&si);
  569             }
  570         }
  571     }
  572
  573   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
  574   if (dump_enabled_p ())
  575     {
  576       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
  577       dump_dec (MSG_NOTE, vectorization_factor);
  578       dump_printf (MSG_NOTE, "\n");
  579     }
  580
        /* The factor started out as 1; give up if it is still known to be no
           larger than that.  */
  581   if (known_le (vectorization_factor, 1U))
  582     {
  583       if (dump_enabled_p ())
  584         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  585                          "not vectorized: unsupported data-type\n");
  586       return false;
  587     }
  588   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  589
        /* Second pass: assign a vectype (mask type) to each boolean-producing
           statement that was queued in MASK_PRODUCERS by the first pass.  */
  590   for (i = 0; i < mask_producers.length (); i++)
  591     {
  592       tree mask_type = NULL;
  593
  594       stmt = STMT_VINFO_STMT (mask_producers[i]);
  595
            /* A comparison of non-boolean operands: the mask type is obtained
               from the scalar type of the compared operands.  */
  596       if (is_gimple_assign (stmt)
  597           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
  598           && !VECT_SCALAR_BOOLEAN_TYPE_P
  599                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
  600         {
  601           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
  602           mask_type = get_mask_type_for_scalar_type (scalar_type);
  603
  604           if (!mask_type)
  605             {
  606               if (dump_enabled_p ())
  607                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  608                                  "not vectorized: unsupported mask\n");
  609               return false;
  610             }
  611         }
  612       else
  613         {
  614           tree rhs;
  615           ssa_op_iter iter;
  616           gimple *def_stmt;
  617           enum vect_def_type dt;
  618
                /* Otherwise take the mask type from the vectypes reported for
                   the SSA operands.  All operands that have a vectype must agree
                   on the number of elements and on being a mask type or not.  */
  619           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
  620             {
  621               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
  622                                        &def_stmt, &dt, &vectype))
  623                 {
  624                   if (dump_enabled_p ())
  625                     {
  626                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  627                                        "not vectorized: can't compute mask type "
  628                                        "for statement, ");
  629                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
  630                                         0);
  631                     }
  632                   return false;
  633                 }
  634
  635               /* No vectype probably means external definition.
  636                  Allow it in case there is another operand from which
  637                  the mask type can be determined.  */
  638               if (!vectype)
  639                 continue;
  640
  641               if (!mask_type)
  642                 mask_type = vectype;
  643               else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
  644                                  TYPE_VECTOR_SUBPARTS (vectype)))
  645                 {
  646                   if (dump_enabled_p ())
  647                     {
  648                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  649                                        "not vectorized: different sized masks "
  650                                        "types in statement, ");
  651                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  652                                          mask_type);
  653                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
  654                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  655                                          vectype);
  656                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  657                     }
  658                   return false;
  659                 }
  660               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
  661                        != VECTOR_BOOLEAN_TYPE_P (vectype))
  662                 {
  663                   if (dump_enabled_p ())
  664                     {
  665                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  666                                        "not vectorized: mixed mask and "
  667                                        "nonmask vector types in statement, ");
  668                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  669                                          mask_type);
  670                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
  671                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  672                                          vectype);
  673                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  674                     }
  675                   return false;
  676                 }
  677             }
  678
  679           /* We may compare boolean value loaded as vector of integers.
  680              Fix mask_type in such case.  */
  681           if (mask_type
  682               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
  683               && gimple_code (stmt) == GIMPLE_ASSIGN
  684               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
  685             mask_type = build_same_sized_truth_vector_type (mask_type);
  686         }
  687
  688       /* No mask_type should mean loop invariant predicate.
  689          This is probably a subject for optimization in
  690          if-conversion.  */
  691       if (!mask_type)
  692         {
  693           if (dump_enabled_p ())
  694             {
  695               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  696                                "not vectorized: can't compute mask type "
  697                                "for statement, ");
  698               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
  699                                 0);
  700             }
  701           return false;
  702         }
  703
  704       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
  705     }
  706
  707   return true;
  708 }
 709
 710
 711 /* Function vect_is_simple_iv_evolution.
 712
  713    FORNOW: A simple evolution of an induction variable in the loop is
  714    considered a polynomial evolution.  */
 715
 716 static bool
 717 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 718                              tree * step)
 719 {
 720   tree init_expr;
 721   tree step_expr;
 722   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 723   basic_block bb;
 724
 725   /* When there is no evolution in this loop, the evolution function
 726      is not "simple".  */
 727   if (evolution_part == NULL_TREE)
 728     return false;
 729
 730   /* When the evolution is a polynomial of degree >= 2
 731      the evolution function is not "simple".  */
 732   if (tree_is_chrec (evolution_part))
 733     return false;
 734
 735   step_expr = evolution_part;
 736   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 737
 738   if (dump_enabled_p ())
 739     {
 740       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 741       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 742       dump_printf (MSG_NOTE, ",  init: ");
 743       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 744       dump_printf (MSG_NOTE, "\n");
 745     }
 746
 747   *init = init_expr;
 748   *step = step_expr;
 749
 750   if (TREE_CODE (step_expr) != INTEGER_CST
 751       && (TREE_CODE (step_expr) != SSA_NAME
 752           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 753               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 754           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 755               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 756                   || !flag_associative_math)))
 757       && (TREE_CODE (step_expr) != REAL_CST
 758           || !flag_associative_math))
 759     {
 760       if (dump_enabled_p ())
 761         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 762                          "step unknown.\n");
 763       return false;
 764     }
 765
 766   return true;
 767 }
 768
 769 /* Function vect_analyze_scalar_cycles_1.
 770
 771    Examine the cross iteration def-use cycles of scalar variables
 772    in LOOP.  LOOP_VINFO represents the loop that is now being
 773    considered for vectorization (can be LOOP, or an outer-loop
 774    enclosing LOOP).  */
 775
 776 static void
 777 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 778 {
 779   basic_block bb = loop->header;
 780   tree init, step;
 781   auto_vec<gimple *, 64> worklist;
 782   gphi_iterator gsi;
 783   bool double_reduc;
 784
 785   if (dump_enabled_p ())
 786     dump_printf_loc (MSG_NOTE, vect_location,
 787                      "=== vect_analyze_scalar_cycles ===\n");
 788
 789   /* First - identify all inductions.  Reduction detection assumes that all the
 790      inductions have been identified, therefore, this order must not be
 791      changed.  */
 792   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 793     {
 794       gphi *phi = gsi.phi ();
 795       tree access_fn = NULL;
 796       tree def = PHI_RESULT (phi);
 797       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 798
 799       if (dump_enabled_p ())
 800         {
 801           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 802           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 803         }
 804
 805       /* Skip virtual phi's.  The data dependences that are associated with
 806          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 807       if (virtual_operand_p (def))
 808         continue;
 809
 810       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 811
 812       /* Analyze the evolution function.  */
 813       access_fn = analyze_scalar_evolution (loop, def);
 814       if (access_fn)
 815         {
 816           STRIP_NOPS (access_fn);
 817           if (dump_enabled_p ())
 818             {
 819               dump_printf_loc (MSG_NOTE, vect_location,
 820                                "Access function of PHI: ");
 821               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 822               dump_printf (MSG_NOTE, "\n");
 823             }
 824           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 825             = initial_condition_in_loop_num (access_fn, loop->num);
 826           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 827             = evolution_part_in_loop_num (access_fn, loop->num);
 828         }
 829
 830       if (!access_fn
 831           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 832           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 833               && TREE_CODE (step) != INTEGER_CST))
 834         {
 835           worklist.safe_push (phi);
 836           continue;
 837         }
 838
 839       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 840                   != NULL_TREE);
 841       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 842
 843       if (dump_enabled_p ())
 844         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 845       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 846     }
 847
 848
 849   /* Second - identify all reductions and nested cycles.  */
 850   while (worklist.length () > 0)
 851     {
 852       gimple *phi = worklist.pop ();
 853       tree def = PHI_RESULT (phi);
 854       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 855       gimple *reduc_stmt;
 856
 857       if (dump_enabled_p ())
 858         {
 859           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 860           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 861         }
 862
 863       gcc_assert (!virtual_operand_p (def)
 864                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 865
 866       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 867                                                 &double_reduc, false);
 868       if (reduc_stmt)
 869         {
 870           if (double_reduc)
 871             {
 872               if (dump_enabled_p ())
 873                 dump_printf_loc (MSG_NOTE, vect_location,
 874                                  "Detected double reduction.\n");
 875
 876               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 877               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 878                                                     vect_double_reduction_def;
 879             }
 880           else
 881             {
 882               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 883                 {
 884                   if (dump_enabled_p ())
 885                     dump_printf_loc (MSG_NOTE, vect_location,
 886                                      "Detected vectorizable nested cycle.\n");
 887
 888                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 889                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 890                                                              vect_nested_cycle;
 891                 }
 892               else
 893                 {
 894                   if (dump_enabled_p ())
 895                     dump_printf_loc (MSG_NOTE, vect_location,
 896                                      "Detected reduction.\n");
 897
 898                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 899                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 900                                                            vect_reduction_def;
 901                   /* Store the reduction cycles for possible vectorization in
 902                      loop-aware SLP if it was not detected as reduction
 903                      chain.  */
 904                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 905                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 906                 }
 907             }
 908         }
 909       else
 910         if (dump_enabled_p ())
 911           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 912                            "Unknown def-use cycle pattern.\n");
 913     }
 914 }
 915
 916
 917 /* Function vect_analyze_scalar_cycles.
 918
 919    Examine the cross iteration def-use cycles of scalar variables, by
 920    analyzing the loop-header PHIs of scalar variables.  Classify each
 921    cycle as one of the following: invariant, induction, reduction, unknown.
 922    We do that for the loop represented by LOOP_VINFO, and also to its
 923    inner-loop, if exists.
 924    Examples for scalar cycles:
 925
 926    Example1: reduction:
 927
 928               loop1:
 929               for (i=0; i<N; i++)
 930                  sum += a[i];
 931
 932    Example2: induction:
 933
 934               loop2:
 935               for (i=0; i<N; i++)
 936                  a[i] = i;  */
 937
 938 static void
 939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 940 {
 941   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 942
 943   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 944
 945   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 946      Reductions in such inner-loop therefore have different properties than
 947      the reductions in the nest that gets vectorized:
 948      1. When vectorized, they are executed in the same order as in the original
 949         scalar loop, so we can't change the order of computation when
 950         vectorizing them.
 951      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 952         current checks are too strict.  */
 953
 954   if (loop->inner)
 955     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 956 }
 957
 958 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 959
 960 static void
 961 vect_fixup_reduc_chain (gimple *stmt)
 962 {
 963   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 964   gimple *stmtp;
 965   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 966               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 967   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 968   do
 969     {
 970       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 971       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 972       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 973       if (stmt)
 974         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 975           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 976     }
 977   while (stmt);
 978   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 979 }
 980
 981 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 982
 983 static void
 984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 985 {
 986   gimple *first;
 987   unsigned i;
 988
 989   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 990     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 991       {
 992         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 993         while (next)
 994           {
 995             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 996               break;
 997             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 998           }
 999         /* If not all stmt in the chain are patterns try to handle
1000            the chain without patterns.  */
1001         if (! next)
1002           {
1003             vect_fixup_reduc_chain (first);
1004             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1005               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1006           }
1007       }
1008 }
1009
1010 /* Function vect_get_loop_niters.
1011
1012    Determine how many iterations the loop is executed and place it
1013    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1014    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1015    niter information holds in ASSUMPTIONS.
1016
1017    Return the loop exit condition.  */
1018
1019
1020 static gcond *
1021 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1022                       tree *number_of_iterations, tree *number_of_iterationsm1)
1023 {
1024   edge exit = single_exit (loop);
1025   struct tree_niter_desc niter_desc;
1026   tree niter_assumptions, niter, may_be_zero;
1027   gcond *cond = get_loop_exit_condition (loop);
1028
1029   *assumptions = boolean_true_node;
1030   *number_of_iterationsm1 = chrec_dont_know;
1031   *number_of_iterations = chrec_dont_know;
1032   if (dump_enabled_p ())
1033     dump_printf_loc (MSG_NOTE, vect_location,
1034                      "=== get_loop_niters ===\n");
1035
1036   if (!exit)
1037     return cond;
1038
1039   niter = chrec_dont_know;
1040   may_be_zero = NULL_TREE;
1041   niter_assumptions = boolean_true_node;
1042   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1043       || chrec_contains_undetermined (niter_desc.niter))
1044     return cond;
1045
1046   niter_assumptions = niter_desc.assumptions;
1047   may_be_zero = niter_desc.may_be_zero;
1048   niter = niter_desc.niter;
1049
1050   if (may_be_zero && integer_zerop (may_be_zero))
1051     may_be_zero = NULL_TREE;
1052
1053   if (may_be_zero)
1054     {
1055       if (COMPARISON_CLASS_P (may_be_zero))
1056         {
1057           /* Try to combine may_be_zero with assumptions, this can simplify
1058              computation of niter expression.  */
1059           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1060             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1061                                              niter_assumptions,
1062                                              fold_build1 (TRUTH_NOT_EXPR,
1063                                                           boolean_type_node,
1064                                                           may_be_zero));
1065           else
1066             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1067                                  build_int_cst (TREE_TYPE (niter), 0),
1068                                  rewrite_to_non_trapping_overflow (niter));
1069
1070           may_be_zero = NULL_TREE;
1071         }
1072       else if (integer_nonzerop (may_be_zero))
1073         {
1074           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1075           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1076           return cond;
1077         }
1078       else
1079         return cond;
1080     }
1081
1082   *assumptions = niter_assumptions;
1083   *number_of_iterationsm1 = niter;
1084
1085   /* We want the number of loop header executions which is the number
1086      of latch executions plus one.
1087      ???  For UINT_MAX latch executions this number overflows to zero
1088      for loops like do { n++; } while (n != 0);  */
1089   if (niter && !chrec_contains_undetermined (niter))
1090     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1091                           build_int_cst (TREE_TYPE (niter), 1));
1092   *number_of_iterations = niter;
1093
1094   return cond;
1095 }
1096
1097 /* Function bb_in_loop_p
1098
1099    Used as predicate for dfs order traversal of the loop bbs.  */
1100
1101 static bool
1102 bb_in_loop_p (const_basic_block bb, const void *data)
1103 {
1104   const struct loop *const loop = (const struct loop *)data;
1105   if (flow_bb_inside_loop_p (loop, bb))
1106     return true;
1107   return false;
1108 }
1109
1110
1111 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1112    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1113
1114 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1115   : vec_info (vec_info::loop, init_cost (loop_in)),
1116     loop (loop_in),
1117     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1118     num_itersm1 (NULL_TREE),
1119     num_iters (NULL_TREE),
1120     num_iters_unchanged (NULL_TREE),
1121     num_iters_assumptions (NULL_TREE),
1122     th (0),
1123     versioning_threshold (0),
1124     vectorization_factor (0),
1125     max_vectorization_factor (0),
1126     mask_skip_niters (NULL_TREE),
1127     mask_compare_type (NULL_TREE),
1128     unaligned_dr (NULL),
1129     peeling_for_alignment (0),
1130     ptr_mask (0),
1131     ivexpr_map (NULL),
1132     slp_unrolling_factor (1),
1133     single_scalar_iteration_cost (0),
1134     vectorizable (false),
1135     can_fully_mask_p (true),
1136     fully_masked_p (false),
1137     peeling_for_gaps (false),
1138     peeling_for_niter (false),
1139     operands_swapped (false),
1140     no_data_dependencies (false),
1141     has_mask_store (false),
1142     scalar_loop (NULL),
1143     orig_loop_info (NULL)
1144 {
1145   /* Create/Update stmt_info for all stmts in the loop.  */
1146   basic_block *body = get_loop_body (loop);
1147   for (unsigned int i = 0; i < loop->num_nodes; i++)
1148     {
1149       basic_block bb = body[i];
1150       gimple_stmt_iterator si;
1151
1152       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1153         {
1154           gimple *phi = gsi_stmt (si);
1155           gimple_set_uid (phi, 0);
1156           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1157         }
1158
1159       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1160         {
1161           gimple *stmt = gsi_stmt (si);
1162           gimple_set_uid (stmt, 0);
1163           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1164         }
1165     }
1166   free (body);
1167
1168   /* CHECKME: We want to visit all BBs before their successors (except for
1169      latch blocks, for which this assertion wouldn't hold).  In the simple
1170      case of the loop forms we allow, a dfs order of the BBs would the same
1171      as reversed postorder traversal, so we are safe.  */
1172
1173   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1174                                           bbs, loop->num_nodes, loop);
1175   gcc_assert (nbbs == loop->num_nodes);
1176 }
1177
1178 /* Free all levels of MASKS.  */
1179
1180 void
1181 release_vec_loop_masks (vec_loop_masks *masks)
1182 {
1183   rgroup_masks *rgm;
1184   unsigned int i;
1185   FOR_EACH_VEC_ELT (*masks, i, rgm)
1186     rgm->masks.release ();
1187   masks->release ();
1188 }
1189
1190 /* Free all memory used by the _loop_vec_info, as well as all the
1191    stmt_vec_info structs of all the stmts in the loop.  */
1192
1193 _loop_vec_info::~_loop_vec_info ()
1194 {
1195   int nbbs;
1196   gimple_stmt_iterator si;
1197   int j;
1198
1199   nbbs = loop->num_nodes;
1200   for (j = 0; j < nbbs; j++)
1201     {
1202       basic_block bb = bbs[j];
1203       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1204         free_stmt_vec_info (gsi_stmt (si));
1205
1206       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1207         {
1208           gimple *stmt = gsi_stmt (si);
1209
1210           /* We may have broken canonical form by moving a constant
1211              into RHS1 of a commutative op.  Fix such occurrences.  */
1212           if (operands_swapped && is_gimple_assign (stmt))
1213             {
1214               enum tree_code code = gimple_assign_rhs_code (stmt);
1215
1216               if ((code == PLUS_EXPR
1217                    || code == POINTER_PLUS_EXPR
1218                    || code == MULT_EXPR)
1219                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1220                 swap_ssa_operands (stmt,
1221                                    gimple_assign_rhs1_ptr (stmt),
1222                                    gimple_assign_rhs2_ptr (stmt));
1223               else if (code == COND_EXPR
1224                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1225                 {
1226                   tree cond_expr = gimple_assign_rhs1 (stmt);
1227                   enum tree_code cond_code = TREE_CODE (cond_expr);
1228
1229                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1230                     {
1231                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1232                                                                   0));
1233                       cond_code = invert_tree_comparison (cond_code,
1234                                                           honor_nans);
1235                       if (cond_code != ERROR_MARK)
1236                         {
1237                           TREE_SET_CODE (cond_expr, cond_code);
1238                           swap_ssa_operands (stmt,
1239                                              gimple_assign_rhs2_ptr (stmt),
1240                                              gimple_assign_rhs3_ptr (stmt));
1241                         }
1242                     }
1243                 }
1244             }
1245
1246           /* Free stmt_vec_info.  */
1247           free_stmt_vec_info (stmt);
1248           gsi_next (&si);
1249         }
1250     }
1251
1252   free (bbs);
1253
1254   release_vec_loop_masks (&masks);
1255   delete ivexpr_map;
1256
1257   loop->aux = NULL;
1258 }
1259
1260 /* Return an invariant or register for EXPR and emit necessary
1261    computations in the LOOP_VINFO loop preheader.  */
1262
1263 tree
1264 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1265 {
1266   if (is_gimple_reg (expr)
1267       || is_gimple_min_invariant (expr))
1268     return expr;
1269
1270   if (! loop_vinfo->ivexpr_map)
1271     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1272   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1273   if (! cached)
1274     {
1275       gimple_seq stmts = NULL;
1276       cached = force_gimple_operand (unshare_expr (expr),
1277                                      &stmts, true, NULL_TREE);
1278       if (stmts)
1279         {
1280           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1281           gsi_insert_seq_on_edge_immediate (e, stmts);
1282         }
1283     }
1284   return cached;
1285 }
1286
1287 /* Return true if we can use CMP_TYPE as the comparison type to produce
1288    all masks required to mask LOOP_VINFO.  */
1289
1290 static bool
1291 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1292 {
1293   rgroup_masks *rgm;
1294   unsigned int i;
1295   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1296     if (rgm->mask_type != NULL_TREE
1297         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1298                                             cmp_type, rgm->mask_type,
1299                                             OPTIMIZE_FOR_SPEED))
1300       return false;
1301   return true;
1302 }
1303
1304 /* Calculate the maximum number of scalars per iteration for every
1305    rgroup in LOOP_VINFO.  */
1306
1307 static unsigned int
1308 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1309 {
1310   unsigned int res = 1;
1311   unsigned int i;
1312   rgroup_masks *rgm;
1313   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1314     res = MAX (res, rgm->max_nscalars_per_iter);
1315   return res;
1316 }
1317
1318 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1319    whether we can actually generate the masks required.  Return true if so,
1320    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
1321
1322 static bool
1323 vect_verify_full_masking (loop_vec_info loop_vinfo)
1324 {
1325   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1326   unsigned int min_ni_width;
1327
1328   /* Use a normal loop if there are no statements that need masking.
1329      This only happens in rare degenerate cases: it means that the loop
1330      has no loads, no stores, and no live-out values.  */
1331   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1332     return false;
1333
1334   /* Get the maximum number of iterations that is representable
1335      in the counter type.  */
1336   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1337   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1338
1339   /* Get a more refined estimate for the number of iterations.  */
1340   widest_int max_back_edges;
1341   if (max_loop_iterations (loop, &max_back_edges))
1342     max_ni = wi::smin (max_ni, max_back_edges + 1);
1343
1344   /* Account for rgroup masks, in which each bit is replicated N times.  */
1345   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1346
1347   /* Work out how many bits we need to represent the limit.  */
1348   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1349
1350   /* Find a scalar mode for which WHILE_ULT is supported.  */
1351   opt_scalar_int_mode cmp_mode_iter;
1352   tree cmp_type = NULL_TREE;
1353   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1354     {
1355       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1356       if (cmp_bits >= min_ni_width
1357           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1358         {
1359           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1360           if (this_type
1361               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1362             {
1363               /* Although we could stop as soon as we find a valid mode,
1364                  it's often better to continue until we hit Pmode, since the
1365                  operands to the WHILE are more likely to be reusable in
1366                  address calculations.  */
1367               cmp_type = this_type;
1368               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1369                 break;
1370             }
1371         }
1372     }
1373
1374   if (!cmp_type)
1375     return false;
1376
1377   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1378   return true;
1379 }
1380
1381 /* Calculate the cost of one scalar iteration of the loop.  */
1382 static void
1383 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1384 {
1385   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1386   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1387   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1388   int innerloop_iters, i;
1389
1390   /* Count statements in scalar loop.  Using this as scalar cost for a single
1391      iteration for now.
1392
1393      TODO: Add outer loop support.
1394
1395      TODO: Consider assigning different costs to different scalar
1396      statements.  */
1397
1398   /* FORNOW.  */
1399   innerloop_iters = 1;
1400   if (loop->inner)
1401     innerloop_iters = 50; /* FIXME */
1402
1403   for (i = 0; i < nbbs; i++)
1404     {
1405       gimple_stmt_iterator si;
1406       basic_block bb = bbs[i];
1407
1408       if (bb->loop_father == loop->inner)
1409         factor = innerloop_iters;
1410       else
1411         factor = 1;
1412
1413       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1414         {
1415           gimple *stmt = gsi_stmt (si);
1416           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1417
1418           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1419             continue;
1420
1421           /* Skip stmts that are not vectorized inside the loop.  */
1422           if (stmt_info
1423               && !STMT_VINFO_RELEVANT_P (stmt_info)
1424               && (!STMT_VINFO_LIVE_P (stmt_info)
1425                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1426               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1427             continue;
1428
1429           vect_cost_for_stmt kind;
1430           if (STMT_VINFO_DATA_REF (stmt_info))
1431             {
1432               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1433                kind = scalar_load;
1434              else
1435                kind = scalar_store;
1436             }
1437           else
1438             kind = scalar_stmt;
1439
1440           scalar_single_iter_cost
1441             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1442                                  factor, kind, stmt_info, 0, vect_prologue);
1443         }
1444     }
1445   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1446     = scalar_single_iter_cost;
1447 }
1448
1449
1450 /* Function vect_analyze_loop_form_1.
1451
1452    Verify that certain CFG restrictions hold, including:
1453    - the loop has a pre-header
1454    - the loop has a single entry and exit
1455    - the loop exit condition is simple enough
1456    - the number of iterations can be analyzed, i.e, a countable loop.  The
1457      niter could be analyzed under some assumptions.  */
1458
1459 bool
1460 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1461                           tree *assumptions, tree *number_of_iterationsm1,
1462                           tree *number_of_iterations, gcond **inner_loop_cond)
1463 {
1464   if (dump_enabled_p ())
1465     dump_printf_loc (MSG_NOTE, vect_location,
1466                      "=== vect_analyze_loop_form ===\n");
1467
1468   /* Different restrictions apply when we are considering an inner-most loop,
1469      vs. an outer (nested) loop.
1470      (FORNOW. May want to relax some of these restrictions in the future).  */
1471
1472   if (!loop->inner)
1473     {
1474       /* Inner-most loop.  We currently require that the number of BBs is
1475          exactly 2 (the header and latch).  Vectorizable inner-most loops
1476          look like this:
1477
1478                         (pre-header)
1479                            |
1480                           header <--------+
1481                            | |            |
1482                            | +--> latch --+
1483                            |
1484                         (exit-bb)  */
1485
1486       if (loop->num_nodes != 2)
1487         {
1488           if (dump_enabled_p ())
1489             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1490                              "not vectorized: control flow in loop.\n");
1491           return false;
1492         }
1493
1494       if (empty_block_p (loop->header))
1495         {
1496           if (dump_enabled_p ())
1497             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1498                              "not vectorized: empty loop.\n");
1499           return false;
1500         }
1501     }
1502   else
1503     {
1504       struct loop *innerloop = loop->inner;
1505       edge entryedge;
1506
1507       /* Nested loop. We currently require that the loop is doubly-nested,
1508          contains a single inner loop, and the number of BBs is exactly 5.
1509          Vectorizable outer-loops look like this:
1510
1511                         (pre-header)
1512                            |
1513                           header <---+
1514                            |         |
1515                           inner-loop |
1516                            |         |
1517                           tail ------+
1518                            |
1519                         (exit-bb)
1520
1521          The inner-loop has the properties expected of inner-most loops
1522          as described above.  */
1523
1524       if ((loop->inner)->inner || (loop->inner)->next)
1525         {
1526           if (dump_enabled_p ())
1527             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1528                              "not vectorized: multiple nested loops.\n");
1529           return false;
1530         }
1531
1532       if (loop->num_nodes != 5)
1533         {
1534           if (dump_enabled_p ())
1535             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1536                              "not vectorized: control flow in loop.\n");
1537           return false;
1538         }
1539
1540       entryedge = loop_preheader_edge (innerloop);
1541       if (entryedge->src != loop->header
1542           || !single_exit (innerloop)
1543           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1544         {
1545           if (dump_enabled_p ())
1546             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1547                              "not vectorized: unsupported outerloop form.\n");
1548           return false;
1549         }
1550
1551       /* Analyze the inner-loop.  */
1552       tree inner_niterm1, inner_niter, inner_assumptions;
1553       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1554                                       &inner_assumptions, &inner_niterm1,
1555                                       &inner_niter, NULL)
1556           /* Don't support analyzing niter under assumptions for inner
1557              loop.  */
1558           || !integer_onep (inner_assumptions))
1559         {
1560           if (dump_enabled_p ())
1561             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1562                              "not vectorized: Bad inner loop.\n");
1563           return false;
1564         }
1565
1566       if (!expr_invariant_in_loop_p (loop, inner_niter))
1567         {
1568           if (dump_enabled_p ())
1569             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1570                              "not vectorized: inner-loop count not"
1571                              " invariant.\n");
1572           return false;
1573         }
1574
1575       if (dump_enabled_p ())
1576         dump_printf_loc (MSG_NOTE, vect_location,
1577                          "Considering outer-loop vectorization.\n");
1578     }
1579
1580   if (!single_exit (loop)
1581       || EDGE_COUNT (loop->header->preds) != 2)
1582     {
1583       if (dump_enabled_p ())
1584         {
1585           if (!single_exit (loop))
1586             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1587                              "not vectorized: multiple exits.\n");
1588           else if (EDGE_COUNT (loop->header->preds) != 2)
1589             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1590                              "not vectorized: too many incoming edges.\n");
1591         }
1592       return false;
1593     }
1594
1595   /* We assume that the loop exit condition is at the end of the loop. i.e,
1596      that the loop is represented as a do-while (with a proper if-guard
1597      before the loop if needed), where the loop header contains all the
1598      executable statements, and the latch is empty.  */
1599   if (!empty_block_p (loop->latch)
1600       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1601     {
1602       if (dump_enabled_p ())
1603         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1604                          "not vectorized: latch block not empty.\n");
1605       return false;
1606     }
1607
1608   /* Make sure the exit is not abnormal.  */
1609   edge e = single_exit (loop);
1610   if (e->flags & EDGE_ABNORMAL)
1611     {
1612       if (dump_enabled_p ())
1613         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1614                          "not vectorized: abnormal loop exit edge.\n");
1615       return false;
1616     }
1617
1618   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1619                                      number_of_iterationsm1);
1620   if (!*loop_cond)
1621     {
1622       if (dump_enabled_p ())
1623         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1624                          "not vectorized: complicated exit condition.\n");
1625       return false;
1626     }
1627
1628   if (integer_zerop (*assumptions)
1629       || !*number_of_iterations
1630       || chrec_contains_undetermined (*number_of_iterations))
1631     {
1632       if (dump_enabled_p ())
1633         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1634                          "not vectorized: number of iterations cannot be "
1635                          "computed.\n");
1636       return false;
1637     }
1638
1639   if (integer_zerop (*number_of_iterations))
1640     {
1641       if (dump_enabled_p ())
1642         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643                          "not vectorized: number of iterations = 0.\n");
1644       return false;
1645     }
1646
1647   return true;
1648 }
1649
1650 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1651
1652 loop_vec_info
1653 vect_analyze_loop_form (struct loop *loop)
1654 {
1655   tree assumptions, number_of_iterations, number_of_iterationsm1;
1656   gcond *loop_cond, *inner_loop_cond = NULL;
1657
1658   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1659                                   &assumptions, &number_of_iterationsm1,
1660                                   &number_of_iterations, &inner_loop_cond))
1661     return NULL;
1662
1663   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1664   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1665   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1666   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1667   if (!integer_onep (assumptions))
1668     {
1669       /* We consider to vectorize this loop by versioning it under
1670          some assumptions.  In order to do this, we need to clear
1671          existing information computed by scev and niter analyzer.  */
1672       scev_reset_htab ();
1673       free_numbers_of_iterations_estimates (loop);
1674       /* Also set flag for this loop so that following scev and niter
1675          analysis are done under the assumptions.  */
1676       loop_constraint_set (loop, LOOP_C_FINITE);
1677       /* Also record the assumptions for versioning.  */
1678       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1679     }
1680
1681   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1682     {
1683       if (dump_enabled_p ())
1684         {
1685           dump_printf_loc (MSG_NOTE, vect_location,
1686                            "Symbolic number of iterations is ");
1687           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1688           dump_printf (MSG_NOTE, "\n");
1689         }
1690     }
1691
1692   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1693   if (inner_loop_cond)
1694     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1695       = loop_exit_ctrl_vec_info_type;
1696
1697   gcc_assert (!loop->aux);
1698   loop->aux = loop_vinfo;
1699   return loop_vinfo;
1700 }
1701
1702
1703
1704 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1705    statements update the vectorization factor.  */
1706
1707 static void
1708 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1709 {
1710   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1711   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1712   int nbbs = loop->num_nodes;
1713   poly_uint64 vectorization_factor;
1714   int i;
1715
1716   if (dump_enabled_p ())
1717     dump_printf_loc (MSG_NOTE, vect_location,
1718                      "=== vect_update_vf_for_slp ===\n");
1719
1720   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1721   gcc_assert (known_ne (vectorization_factor, 0U));
1722
1723   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1724      vectorization factor of the loop is the unrolling factor required by
1725      the SLP instances.  If that unrolling factor is 1, we say, that we
1726      perform pure SLP on loop - cross iteration parallelism is not
1727      exploited.  */
1728   bool only_slp_in_loop = true;
1729   for (i = 0; i < nbbs; i++)
1730     {
1731       basic_block bb = bbs[i];
1732       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1733            gsi_next (&si))
1734         {
1735           gimple *stmt = gsi_stmt (si);
1736           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1737           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1738               && STMT_VINFO_RELATED_STMT (stmt_info))
1739             {
1740               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1741               stmt_info = vinfo_for_stmt (stmt);
1742             }
1743           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1744                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1745               && !PURE_SLP_STMT (stmt_info))
1746             /* STMT needs both SLP and loop-based vectorization.  */
1747             only_slp_in_loop = false;
1748         }
1749     }
1750
1751   if (only_slp_in_loop)
1752     {
1753       dump_printf_loc (MSG_NOTE, vect_location,
1754                        "Loop contains only SLP stmts\n");
1755       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1756     }
1757   else
1758     {
1759       dump_printf_loc (MSG_NOTE, vect_location,
1760                        "Loop contains SLP and non-SLP stmts\n");
1761       /* Both the vectorization factor and unroll factor have the form
1762          current_vector_size * X for some rational X, so they must have
1763          a common multiple.  */
1764       vectorization_factor
1765         = force_common_multiple (vectorization_factor,
1766                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1767     }
1768
1769   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1770   if (dump_enabled_p ())
1771     {
1772       dump_printf_loc (MSG_NOTE, vect_location,
1773                        "Updating vectorization factor to ");
1774       dump_dec (MSG_NOTE, vectorization_factor);
1775       dump_printf (MSG_NOTE, ".\n");
1776     }
1777 }
1778
1779 /* Return true if STMT_INFO describes a double reduction phi and if
1780    the other phi in the reduction is also relevant for vectorization.
1781    This rejects cases such as:
1782
1783       outer1:
1784         x_1 = PHI <x_3(outer2), ...>;
1785         ...
1786
1787       inner:
1788         x_2 = ...;
1789         ...
1790
1791       outer2:
1792         x_3 = PHI <x_2(inner)>;
1793
1794    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1795
1796 static bool
1797 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1798 {
1799   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1800     return false;
1801
1802   gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1803   return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1804 }
1805
1806 /* Function vect_analyze_loop_operations.
1807
1808    Scan the loop stmts and make sure they are all vectorizable.  */
1809
1810 static bool
1811 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1812 {
1813   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1814   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1815   int nbbs = loop->num_nodes;
1816   int i;
1817   stmt_vec_info stmt_info;
1818   bool need_to_vectorize = false;
1819   bool ok;
1820
1821   if (dump_enabled_p ())
1822     dump_printf_loc (MSG_NOTE, vect_location,
1823                      "=== vect_analyze_loop_operations ===\n");
1824
1825   for (i = 0; i < nbbs; i++)
1826     {
1827       basic_block bb = bbs[i];
1828
1829       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1830            gsi_next (&si))
1831         {
1832           gphi *phi = si.phi ();
1833           ok = true;
1834
1835           stmt_info = vinfo_for_stmt (phi);
1836           if (dump_enabled_p ())
1837             {
1838               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1839               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1840             }
1841           if (virtual_operand_p (gimple_phi_result (phi)))
1842             continue;
1843
1844           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1845              (i.e., a phi in the tail of the outer-loop).  */
1846           if (! is_loop_header_bb_p (bb))
1847             {
1848               /* FORNOW: we currently don't support the case that these phis
1849                  are not used in the outerloop (unless it is double reduction,
1850                  i.e., this phi is vect_reduction_def), cause this case
1851                  requires to actually do something here.  */
1852               if (STMT_VINFO_LIVE_P (stmt_info)
1853                   && !vect_active_double_reduction_p (stmt_info))
1854                 {
1855                   if (dump_enabled_p ())
1856                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1857                                      "Unsupported loop-closed phi in "
1858                                      "outer-loop.\n");
1859                   return false;
1860                 }
1861
1862               /* If PHI is used in the outer loop, we check that its operand
1863                  is defined in the inner loop.  */
1864               if (STMT_VINFO_RELEVANT_P (stmt_info))
1865                 {
1866                   tree phi_op;
1867                   gimple *op_def_stmt;
1868
1869                   if (gimple_phi_num_args (phi) != 1)
1870                     return false;
1871
1872                   phi_op = PHI_ARG_DEF (phi, 0);
1873                   if (TREE_CODE (phi_op) != SSA_NAME)
1874                     return false;
1875
1876                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1877                   if (gimple_nop_p (op_def_stmt)
1878                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1879                       || !vinfo_for_stmt (op_def_stmt))
1880                     return false;
1881
1882                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1883                         != vect_used_in_outer
1884                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1885                            != vect_used_in_outer_by_reduction)
1886                     return false;
1887                 }
1888
1889               continue;
1890             }
1891
1892           gcc_assert (stmt_info);
1893
1894           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1895                || STMT_VINFO_LIVE_P (stmt_info))
1896               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1897             {
1898               /* A scalar-dependence cycle that we don't support.  */
1899               if (dump_enabled_p ())
1900                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1901                                  "not vectorized: scalar dependence cycle.\n");
1902               return false;
1903             }
1904
1905           if (STMT_VINFO_RELEVANT_P (stmt_info))
1906             {
1907               need_to_vectorize = true;
1908               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1909                   && ! PURE_SLP_STMT (stmt_info))
1910                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1911               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1912                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1913                        && ! PURE_SLP_STMT (stmt_info))
1914                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1915             }
1916
1917           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1918           if (ok
1919               && STMT_VINFO_LIVE_P (stmt_info)
1920               && !PURE_SLP_STMT (stmt_info))
1921             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1922
1923           if (!ok)
1924             {
1925               if (dump_enabled_p ())
1926                 {
1927                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1928                                    "not vectorized: relevant phi not "
1929                                    "supported: ");
1930                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1931                 }
1932               return false;
1933             }
1934         }
1935
1936       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1937            gsi_next (&si))
1938         {
1939           gimple *stmt = gsi_stmt (si);
1940           if (!gimple_clobber_p (stmt)
1941               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1942             return false;
1943         }
1944     } /* bbs */
1945
1946   /* All operations in the loop are either irrelevant (deal with loop
1947      control, or dead), or only used outside the loop and can be moved
1948      out of the loop (e.g. invariants, inductions).  The loop can be
1949      optimized away by scalar optimizations.  We're better off not
1950      touching this loop.  */
1951   if (!need_to_vectorize)
1952     {
1953       if (dump_enabled_p ())
1954         dump_printf_loc (MSG_NOTE, vect_location,
1955                          "All the computation can be taken out of the loop.\n");
1956       if (dump_enabled_p ())
1957         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1958                          "not vectorized: redundant loop. no profit to "
1959                          "vectorize.\n");
1960       return false;
1961     }
1962
1963   return true;
1964 }
1965
1966 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1967    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1968    definitely no, or -1 if it's worth retrying.  */
1969
1970 static int
1971 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1972 {
1973   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1974   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1975
1976   /* Only fully-masked loops can have iteration counts less than the
1977      vectorization factor.  */
1978   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1979     {
1980       HOST_WIDE_INT max_niter;
1981
1982       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1983         max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1984       else
1985         max_niter = max_stmt_executions_int (loop);
1986
1987       if (max_niter != -1
1988           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1989         {
1990           if (dump_enabled_p ())
1991             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1992                              "not vectorized: iteration count smaller than "
1993                              "vectorization factor.\n");
1994           return 0;
1995         }
1996     }
1997
1998   int min_profitable_iters, min_profitable_estimate;
1999   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2000                                       &min_profitable_estimate);
2001
2002   if (min_profitable_iters < 0)
2003     {
2004       if (dump_enabled_p ())
2005         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2006                          "not vectorized: vectorization not profitable.\n");
2007       if (dump_enabled_p ())
2008         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2009                          "not vectorized: vector version will never be "
2010                          "profitable.\n");
2011       return -1;
2012     }
2013
2014   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2015                                * assumed_vf);
2016
2017   /* Use the cost model only if it is more conservative than user specified
2018      threshold.  */
2019   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2020                                     min_profitable_iters);
2021
2022   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2023
2024   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2025       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2026     {
2027       if (dump_enabled_p ())
2028         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2029                          "not vectorized: vectorization not profitable.\n");
2030       if (dump_enabled_p ())
2031         dump_printf_loc (MSG_NOTE, vect_location,
2032                          "not vectorized: iteration count smaller than user "
2033                          "specified loop bound parameter or minimum profitable "
2034                          "iterations (whichever is more conservative).\n");
2035       return 0;
2036     }
2037
2038   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
2039   if (estimated_niter == -1)
2040     estimated_niter = likely_max_stmt_executions_int (loop);
2041   if (estimated_niter != -1
2042       && ((unsigned HOST_WIDE_INT) estimated_niter
2043           < MAX (th, (unsigned) min_profitable_estimate)))
2044     {
2045       if (dump_enabled_p ())
2046         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2047                          "not vectorized: estimated iteration count too "
2048                          "small.\n");
2049       if (dump_enabled_p ())
2050         dump_printf_loc (MSG_NOTE, vect_location,
2051                          "not vectorized: estimated iteration count smaller "
2052                          "than specified loop bound parameter or minimum "
2053                          "profitable iterations (whichever is more "
2054                          "conservative).\n");
2055       return -1;
2056     }
2057
2058   return 1;
2059 }
2060
2061
2062 /* Function vect_analyze_loop_2.
2063
2064    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2065    for it.  The different analyses will record information in the
2066    loop_vec_info struct.  */
2067 static bool
2068 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2069 {
2070   bool ok;
2071   int res;
2072   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2073   poly_uint64 min_vf = 2;
2074   unsigned int n_stmts = 0;
2075
2076   /* The first group of checks is independent of the vector size.  */
2077   fatal = true;
2078
2079   /* Find all data references in the loop (which correspond to vdefs/vuses)
2080      and analyze their evolution in the loop.  */
2081
2082   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2083
2084   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2085   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2086     {
2087       if (dump_enabled_p ())
2088         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2089                          "not vectorized: loop nest containing two "
2090                          "or more consecutive inner loops cannot be "
2091                          "vectorized\n");
2092       return false;
2093     }
2094
2095   for (unsigned i = 0; i < loop->num_nodes; i++)
2096     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2097          !gsi_end_p (gsi); gsi_next (&gsi))
2098       {
2099         gimple *stmt = gsi_stmt (gsi);
2100         if (is_gimple_debug (stmt))
2101           continue;
2102         ++n_stmts;
2103         if (!find_data_references_in_stmt (loop, stmt,
2104                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
2105           {
2106             if (is_gimple_call (stmt) && loop->safelen)
2107               {
2108                 tree fndecl = gimple_call_fndecl (stmt), op;
2109                 if (fndecl != NULL_TREE)
2110                   {
2111                     cgraph_node *node = cgraph_node::get (fndecl);
2112                     if (node != NULL && node->simd_clones != NULL)
2113                       {
2114                         unsigned int j, n = gimple_call_num_args (stmt);
2115                         for (j = 0; j < n; j++)
2116                           {
2117                             op = gimple_call_arg (stmt, j);
2118                             if (DECL_P (op)
2119                                 || (REFERENCE_CLASS_P (op)
2120                                     && get_base_address (op)))
2121                               break;
2122                           }
2123                         op = gimple_call_lhs (stmt);
2124                         /* Ignore #pragma omp declare simd functions
2125                            if they don't have data references in the
2126                            call stmt itself.  */
2127                         if (j == n
2128                             && !(op
2129                                  && (DECL_P (op)
2130                                      || (REFERENCE_CLASS_P (op)
2131                                          && get_base_address (op)))))
2132                           continue;
2133                       }
2134                   }
2135               }
2136             if (dump_enabled_p ())
2137               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2138                                "not vectorized: loop contains function "
2139                                "calls or data references that cannot "
2140                                "be analyzed\n");
2141             return false;
2142           }
2143       }
2144
2145   /* Analyze the data references and also adjust the minimal
2146      vectorization factor according to the loads and stores.  */
2147
2148   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2149   if (!ok)
2150     {
2151       if (dump_enabled_p ())
2152         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2153                          "bad data references.\n");
2154       return false;
2155     }
2156
2157   /* Classify all cross-iteration scalar data-flow cycles.
2158      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2159   vect_analyze_scalar_cycles (loop_vinfo);
2160
2161   vect_pattern_recog (loop_vinfo);
2162
2163   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2164
2165   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2166      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2167
2168   ok = vect_analyze_data_ref_accesses (loop_vinfo);
2169   if (!ok)
2170     {
2171       if (dump_enabled_p ())
2172         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2173                          "bad data access.\n");
2174       return false;
2175     }
2176
2177   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2178
2179   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2180   if (!ok)
2181     {
2182       if (dump_enabled_p ())
2183         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184                          "unexpected pattern.\n");
2185       return false;
2186     }
2187
2188   /* While the rest of the analysis below depends on it in some way.  */
2189   fatal = false;
2190
2191   /* Analyze data dependences between the data-refs in the loop
2192      and adjust the maximum vectorization factor according to
2193      the dependences.
2194      FORNOW: fail at the first data dependence that we encounter.  */
2195
2196   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2197   if (!ok
2198       || (max_vf != MAX_VECTORIZATION_FACTOR
2199           && maybe_lt (max_vf, min_vf)))
2200     {
2201       if (dump_enabled_p ())
2202             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2203                              "bad data dependence.\n");
2204       return false;
2205     }
2206   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2207
2208   ok = vect_determine_vectorization_factor (loop_vinfo);
2209   if (!ok)
2210     {
2211       if (dump_enabled_p ())
2212         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2213                          "can't determine vectorization factor.\n");
2214       return false;
2215     }
2216   if (max_vf != MAX_VECTORIZATION_FACTOR
2217       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2218     {
2219       if (dump_enabled_p ())
2220         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2221                          "bad data dependence.\n");
2222       return false;
2223     }
2224
2225   /* Compute the scalar iteration cost.  */
2226   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2227
2228   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2229   unsigned th;
2230
2231   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2232   ok = vect_analyze_slp (loop_vinfo, n_stmts);
2233   if (!ok)
2234     return false;
2235
2236   /* If there are any SLP instances mark them as pure_slp.  */
2237   bool slp = vect_make_slp_decision (loop_vinfo);
2238   if (slp)
2239     {
2240       /* Find stmts that need to be both vectorized and SLPed.  */
2241       vect_detect_hybrid_slp (loop_vinfo);
2242
2243       /* Update the vectorization factor based on the SLP decision.  */
2244       vect_update_vf_for_slp (loop_vinfo);
2245     }
2246
2247   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2248
2249   /* We don't expect to have to roll back to anything other than an empty
2250      set of rgroups.  */
2251   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2252
2253   /* This is the point where we can re-start analysis with SLP forced off.  */
2254 start_over:
2255
2256   /* Now the vectorization factor is final.  */
2257   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2258   gcc_assert (known_ne (vectorization_factor, 0U));
2259
2260   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2261     {
2262       dump_printf_loc (MSG_NOTE, vect_location,
2263                        "vectorization_factor = ");
2264       dump_dec (MSG_NOTE, vectorization_factor);
2265       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2266                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2267     }
2268
2269   HOST_WIDE_INT max_niter
2270     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2271
2272   /* Analyze the alignment of the data-refs in the loop.
2273      Fail if a data reference is found that cannot be vectorized.  */
2274
2275   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2276   if (!ok)
2277     {
2278       if (dump_enabled_p ())
2279         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2280                          "bad data alignment.\n");
2281       return false;
2282     }
2283
2284   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2285      It is important to call pruning after vect_analyze_data_ref_accesses,
2286      since we use grouping information gathered by interleaving analysis.  */
2287   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2288   if (!ok)
2289     return false;
2290
2291   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2292      vectorization.  */
2293   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2294     {
2295     /* This pass will decide on using loop versioning and/or loop peeling in
2296        order to enhance the alignment of data references in the loop.  */
2297     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2298     if (!ok)
2299       {
2300         if (dump_enabled_p ())
2301           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2302                            "bad data alignment.\n");
2303         return false;
2304       }
2305     }
2306
2307   if (slp)
2308     {
2309       /* Analyze operations in the SLP instances.  Note this may
2310          remove unsupported SLP instances which makes the above
2311          SLP kind detection invalid.  */
2312       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2313       vect_slp_analyze_operations (loop_vinfo);
2314       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2315         goto again;
2316     }
2317
2318   /* Scan all the remaining operations in the loop that are not subject
2319      to SLP and make sure they are vectorizable.  */
2320   ok = vect_analyze_loop_operations (loop_vinfo);
2321   if (!ok)
2322     {
2323       if (dump_enabled_p ())
2324         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2325                          "bad operation or unsupported loop bound.\n");
2326       return false;
2327     }
2328
2329   /* Decide whether to use a fully-masked loop for this vectorization
2330      factor.  */
2331   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2332     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2333        && vect_verify_full_masking (loop_vinfo));
2334   if (dump_enabled_p ())
2335     {
2336       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2337         dump_printf_loc (MSG_NOTE, vect_location,
2338                          "using a fully-masked loop.\n");
2339       else
2340         dump_printf_loc (MSG_NOTE, vect_location,
2341                          "not using a fully-masked loop.\n");
2342     }
2343
2344   /* If epilog loop is required because of data accesses with gaps,
2345      one additional iteration needs to be peeled.  Check if there is
2346      enough iterations for vectorization.  */
2347   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2348       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2349       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2350     {
2351       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2352       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2353
2354       if (known_lt (wi::to_widest (scalar_niters), vf))
2355         {
2356           if (dump_enabled_p ())
2357             dump_printf_loc (MSG_NOTE, vect_location,
2358                              "loop has no enough iterations to support"
2359                              " peeling for gaps.\n");
2360           return false;
2361         }
2362     }
2363
2364   /* Check the costings of the loop make vectorizing worthwhile.  */
2365   res = vect_analyze_loop_costing (loop_vinfo);
2366   if (res < 0)
2367     goto again;
2368   if (!res)
2369     {
2370       if (dump_enabled_p ())
2371         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2372                          "Loop costings not worthwhile.\n");
2373       return false;
2374     }
2375
2376   /* Decide whether we need to create an epilogue loop to handle
2377      remaining scalar iterations.  */
2378   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2379
2380   unsigned HOST_WIDE_INT const_vf;
2381   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2382     /* The main loop handles all iterations.  */
2383     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2384   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2385            && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2386     {
2387       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2388                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2389                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2390         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2391     }
2392   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2393            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2394            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2395                 < (unsigned) exact_log2 (const_vf))
2396                /* In case of versioning, check if the maximum number of
2397                   iterations is greater than th.  If they are identical,
2398                   the epilogue is unnecessary.  */
2399                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2400                    || ((unsigned HOST_WIDE_INT) max_niter
2401                        > (th / const_vf) * const_vf))))
2402     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2403
2404   /* If an epilogue loop is required make sure we can create one.  */
2405   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2406       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2407     {
2408       if (dump_enabled_p ())
2409         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2410       if (!vect_can_advance_ivs_p (loop_vinfo)
2411           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2412                                            single_exit (LOOP_VINFO_LOOP
2413                                                          (loop_vinfo))))
2414         {
2415           if (dump_enabled_p ())
2416             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2417                              "not vectorized: can't create required "
2418                              "epilog loop\n");
2419           goto again;
2420         }
2421     }
2422
2423   /* During peeling, we need to check if number of loop iterations is
2424      enough for both peeled prolog loop and vector loop.  This check
2425      can be merged along with threshold check of loop versioning, so
2426      increase threshold for this case if necessary.  */
2427   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2428     {
2429       poly_uint64 niters_th = 0;
2430
2431       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2432         {
2433           /* Niters for peeled prolog loop.  */
2434           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2435             {
2436               struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2437               tree vectype
2438                 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2439               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2440             }
2441           else
2442             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2443         }
2444
2445       /* Niters for at least one iteration of vectorized loop.  */
2446       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2447         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2448       /* One additional iteration because of peeling for gap.  */
2449       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2450         niters_th += 1;
2451       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2452     }
2453
2454   gcc_assert (known_eq (vectorization_factor,
2455                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2456
2457   /* Ok to vectorize!  */
2458   return true;
2459
2460 again:
2461   /* Try again with SLP forced off but if we didn't do any SLP there is
2462      no point in re-trying.  */
2463   if (!slp)
2464     return false;
2465
2466   /* If there are reduction chains re-trying will fail anyway.  */
2467   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2468     return false;
2469
2470   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2471      via interleaving or lane instructions.  */
2472   slp_instance instance;
2473   slp_tree node;
2474   unsigned i, j;
2475   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2476     {
2477       stmt_vec_info vinfo;
2478       vinfo = vinfo_for_stmt
2479           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2480       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2481         continue;
2482       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2483       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2484       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2485       if (! vect_store_lanes_supported (vectype, size, false)
2486           && ! vect_grouped_store_supported (vectype, size))
2487         return false;
2488       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2489         {
2490           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2491           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2492           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2493           size = STMT_VINFO_GROUP_SIZE (vinfo);
2494           vectype = STMT_VINFO_VECTYPE (vinfo);
2495           if (! vect_load_lanes_supported (vectype, size, false)
2496               && ! vect_grouped_load_supported (vectype, single_element_p,
2497                                                 size))
2498             return false;
2499         }
2500     }
2501
2502   if (dump_enabled_p ())
2503     dump_printf_loc (MSG_NOTE, vect_location,
2504                      "re-trying with SLP disabled\n");
2505
2506   /* Roll back state appropriately.  No SLP this time.  */
2507   slp = false;
2508   /* Restore vectorization factor as it were without SLP.  */
2509   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2510   /* Free the SLP instances.  */
2511   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2512     vect_free_slp_instance (instance);
2513   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2514   /* Reset SLP type to loop_vect on all stmts.  */
2515   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2516     {
2517       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2518       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2519            !gsi_end_p (si); gsi_next (&si))
2520         {
2521           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2522           STMT_SLP_TYPE (stmt_info) = loop_vect;
2523         }
2524       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2525            !gsi_end_p (si); gsi_next (&si))
2526         {
2527           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2528           STMT_SLP_TYPE (stmt_info) = loop_vect;
2529           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2530             {
2531               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2532               STMT_SLP_TYPE (stmt_info) = loop_vect;
2533               for (gimple_stmt_iterator pi
2534                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2535                    !gsi_end_p (pi); gsi_next (&pi))
2536                 {
2537                   gimple *pstmt = gsi_stmt (pi);
2538                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2539                 }
2540             }
2541         }
2542     }
2543   /* Free optimized alias test DDRS.  */
2544   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2545   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2546   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2547   /* Reset target cost data.  */
2548   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2549   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2550     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2551   /* Reset accumulated rgroup information.  */
2552   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2553   /* Reset assorted flags.  */
2554   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2555   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2556   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2557   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2558   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2559
2560   goto start_over;
2561 }
2562
2563 /* Function vect_analyze_loop.
2564
2565    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2566    for it.  The different analyses will record information in the
2567    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2568    be vectorized.  */
2569 loop_vec_info
2570 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2571 {
2572   loop_vec_info loop_vinfo;
2573   auto_vector_sizes vector_sizes;
2574
2575   /* Autodetect first vector size we try.  */
2576   current_vector_size = 0;
2577   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2578   unsigned int next_size = 0;
2579
2580   if (dump_enabled_p ())
2581     dump_printf_loc (MSG_NOTE, vect_location,
2582                      "===== analyze_loop_nest =====\n");
2583
2584   if (loop_outer (loop)
2585       && loop_vec_info_for_loop (loop_outer (loop))
2586       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2587     {
2588       if (dump_enabled_p ())
2589         dump_printf_loc (MSG_NOTE, vect_location,
2590                          "outer-loop already vectorized.\n");
2591       return NULL;
2592     }
2593
2594   poly_uint64 autodetected_vector_size = 0;
2595   while (1)
2596     {
2597       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2598       loop_vinfo = vect_analyze_loop_form (loop);
2599       if (!loop_vinfo)
2600         {
2601           if (dump_enabled_p ())
2602             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2603                              "bad loop form.\n");
2604           return NULL;
2605         }
2606
2607       bool fatal = false;
2608
2609       if (orig_loop_vinfo)
2610         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2611
2612       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2613         {
2614           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2615
2616           return loop_vinfo;
2617         }
2618
2619       delete loop_vinfo;
2620
2621       if (next_size == 0)
2622         autodetected_vector_size = current_vector_size;
2623
2624       if (next_size < vector_sizes.length ()
2625           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2626         next_size += 1;
2627
2628       if (fatal
2629           || next_size == vector_sizes.length ()
2630           || known_eq (current_vector_size, 0U))
2631         return NULL;
2632
2633       /* Try the next biggest vector size.  */
2634       current_vector_size = vector_sizes[next_size++];
2635       if (dump_enabled_p ())
2636         {
2637           dump_printf_loc (MSG_NOTE, vect_location,
2638                            "***** Re-trying analysis with "
2639                            "vector size ");
2640           dump_dec (MSG_NOTE, current_vector_size);
2641           dump_printf (MSG_NOTE, "\n");
2642         }
2643     }
2644 }
2645
2646 /* Return true if there is an in-order reduction function for CODE, storing
2647    it in *REDUC_FN if so.  */
2648
2649 static bool
2650 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2651 {
2652   switch (code)
2653     {
2654     case PLUS_EXPR:
2655       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2656       return true;
2657
2658     default:
2659       return false;
2660     }
2661 }
2662
2663 /* Function reduction_fn_for_scalar_code
2664
2665    Input:
2666    CODE - tree_code of a reduction operations.
2667
2668    Output:
2669    REDUC_FN - the corresponding internal function to be used to reduce the
2670       vector of partial results into a single scalar result, or IFN_LAST
2671       if the operation is a supported reduction operation, but does not have
2672       such an internal function.
2673
2674    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2675
2676 static bool
2677 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2678 {
2679   switch (code)
2680     {
2681       case MAX_EXPR:
2682         *reduc_fn = IFN_REDUC_MAX;
2683         return true;
2684
2685       case MIN_EXPR:
2686         *reduc_fn = IFN_REDUC_MIN;
2687         return true;
2688
2689       case PLUS_EXPR:
2690         *reduc_fn = IFN_REDUC_PLUS;
2691         return true;
2692
2693       case BIT_AND_EXPR:
2694         *reduc_fn = IFN_REDUC_AND;
2695         return true;
2696
2697       case BIT_IOR_EXPR:
2698         *reduc_fn = IFN_REDUC_IOR;
2699         return true;
2700
2701       case BIT_XOR_EXPR:
2702         *reduc_fn = IFN_REDUC_XOR;
2703         return true;
2704
2705       case MULT_EXPR:
2706       case MINUS_EXPR:
2707         *reduc_fn = IFN_LAST;
2708         return true;
2709
2710       default:
2711        return false;
2712     }
2713 }
2714
2715 /* If there is a neutral value X such that SLP reduction NODE would not
2716    be affected by the introduction of additional X elements, return that X,
2717    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2718    is true if the SLP statements perform a single reduction, false if each
2719    statement performs an independent reduction.  */
2720
2721 static tree
2722 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2723                               bool reduc_chain)
2724 {
2725   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2726   gimple *stmt = stmts[0];
2727   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2728   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2729   tree scalar_type = TREE_TYPE (vector_type);
2730   struct loop *loop = gimple_bb (stmt)->loop_father;
2731   gcc_assert (loop);
2732
2733   switch (code)
2734     {
2735     case WIDEN_SUM_EXPR:
2736     case DOT_PROD_EXPR:
2737     case SAD_EXPR:
2738     case PLUS_EXPR:
2739     case MINUS_EXPR:
2740     case BIT_IOR_EXPR:
2741     case BIT_XOR_EXPR:
2742       return build_zero_cst (scalar_type);
2743
2744     case MULT_EXPR:
2745       return build_one_cst (scalar_type);
2746
2747     case BIT_AND_EXPR:
2748       return build_all_ones_cst (scalar_type);
2749
2750     case MAX_EXPR:
2751     case MIN_EXPR:
2752       /* For MIN/MAX the initial values are neutral.  A reduction chain
2753          has only a single initial value, so that value is neutral for
2754          all statements.  */
2755       if (reduc_chain)
2756         return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2757       return NULL_TREE;
2758
2759     default:
2760       return NULL_TREE;
2761     }
2762 }
2763
2764 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2765    STMT is printed with a message MSG. */
2766
2767 static void
2768 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2769 {
2770   dump_printf_loc (msg_type, vect_location, "%s", msg);
2771   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2772 }
2773
2774
2775 /* Detect SLP reduction of the form:
2776
2777    #a1 = phi <a5, a0>
2778    a2 = operation (a1)
2779    a3 = operation (a2)
2780    a4 = operation (a3)
2781    a5 = operation (a4)
2782
2783    #a = phi <a5>
2784
2785    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2786    FIRST_STMT is the first reduction stmt in the chain
2787    (a2 = operation (a1)).
2788
2789    Return TRUE if a reduction chain was detected.  */
2790
2791 static bool
2792 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2793                        gimple *first_stmt)
2794 {
2795   struct loop *loop = (gimple_bb (phi))->loop_father;
2796   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2797   enum tree_code code;
2798   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2799   stmt_vec_info use_stmt_info, current_stmt_info;
2800   tree lhs;
2801   imm_use_iterator imm_iter;
2802   use_operand_p use_p;
2803   int nloop_uses, size = 0, n_out_of_loop_uses;
2804   bool found = false;
2805
2806   if (loop != vect_loop)
2807     return false;
2808
2809   lhs = PHI_RESULT (phi);
2810   code = gimple_assign_rhs_code (first_stmt);
2811   while (1)
2812     {
2813       nloop_uses = 0;
2814       n_out_of_loop_uses = 0;
2815       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2816         {
2817           gimple *use_stmt = USE_STMT (use_p);
2818           if (is_gimple_debug (use_stmt))
2819             continue;
2820
2821           /* Check if we got back to the reduction phi.  */
2822           if (use_stmt == phi)
2823             {
2824               loop_use_stmt = use_stmt;
2825               found = true;
2826               break;
2827             }
2828
2829           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2830             {
2831               loop_use_stmt = use_stmt;
2832               nloop_uses++;
2833             }
2834            else
2835              n_out_of_loop_uses++;
2836
2837            /* There are can be either a single use in the loop or two uses in
2838               phi nodes.  */
2839            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2840              return false;
2841         }
2842
2843       if (found)
2844         break;
2845
2846       /* We reached a statement with no loop uses.  */
2847       if (nloop_uses == 0)
2848         return false;
2849
2850       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2851       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2852         return false;
2853
2854       if (!is_gimple_assign (loop_use_stmt)
2855           || code != gimple_assign_rhs_code (loop_use_stmt)
2856           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2857         return false;
2858
2859       /* Insert USE_STMT into reduction chain.  */
2860       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2861       if (current_stmt)
2862         {
2863           current_stmt_info = vinfo_for_stmt (current_stmt);
2864           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2865           GROUP_FIRST_ELEMENT (use_stmt_info)
2866             = GROUP_FIRST_ELEMENT (current_stmt_info);
2867         }
2868       else
2869         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2870
2871       lhs = gimple_assign_lhs (loop_use_stmt);
2872       current_stmt = loop_use_stmt;
2873       size++;
2874    }
2875
2876   if (!found || loop_use_stmt != phi || size < 2)
2877     return false;
2878
2879   /* Swap the operands, if needed, to make the reduction operand be the second
2880      operand.  */
2881   lhs = PHI_RESULT (phi);
2882   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2883   while (next_stmt)
2884     {
2885       if (gimple_assign_rhs2 (next_stmt) == lhs)
2886         {
2887           tree op = gimple_assign_rhs1 (next_stmt);
2888           gimple *def_stmt = NULL;
2889
2890           if (TREE_CODE (op) == SSA_NAME)
2891             def_stmt = SSA_NAME_DEF_STMT (op);
2892
2893           /* Check that the other def is either defined in the loop
2894              ("vect_internal_def"), or it's an induction (defined by a
2895              loop-header phi-node).  */
2896           if (def_stmt
2897               && gimple_bb (def_stmt)
2898               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2899               && (is_gimple_assign (def_stmt)
2900                   || is_gimple_call (def_stmt)
2901                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2902                            == vect_induction_def
2903                   || (gimple_code (def_stmt) == GIMPLE_PHI
2904                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2905                                   == vect_internal_def
2906                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2907             {
2908               lhs = gimple_assign_lhs (next_stmt);
2909               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2910               continue;
2911             }
2912
2913           return false;
2914         }
2915       else
2916         {
2917           tree op = gimple_assign_rhs2 (next_stmt);
2918           gimple *def_stmt = NULL;
2919
2920           if (TREE_CODE (op) == SSA_NAME)
2921             def_stmt = SSA_NAME_DEF_STMT (op);
2922
2923           /* Check that the other def is either defined in the loop
2924             ("vect_internal_def"), or it's an induction (defined by a
2925             loop-header phi-node).  */
2926           if (def_stmt
2927               && gimple_bb (def_stmt)
2928               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2929               && (is_gimple_assign (def_stmt)
2930                   || is_gimple_call (def_stmt)
2931                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2932                               == vect_induction_def
2933                   || (gimple_code (def_stmt) == GIMPLE_PHI
2934                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2935                                   == vect_internal_def
2936                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2937             {
2938               if (dump_enabled_p ())
2939                 {
2940                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2941                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2942                 }
2943
2944               swap_ssa_operands (next_stmt,
2945                                  gimple_assign_rhs1_ptr (next_stmt),
2946                                  gimple_assign_rhs2_ptr (next_stmt));
2947               update_stmt (next_stmt);
2948
2949               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2950                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2951             }
2952           else
2953             return false;
2954         }
2955
2956       lhs = gimple_assign_lhs (next_stmt);
2957       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2958     }
2959
2960   /* Save the chain for further analysis in SLP detection.  */
2961   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2962   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2963   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2964
2965   return true;
2966 }
2967
2968 /* Return true if we need an in-order reduction for operation CODE
2969    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2970    overflow must wrap.  */
2971
2972 static bool
2973 needs_fold_left_reduction_p (tree type, tree_code code,
2974                              bool need_wrapping_integral_overflow)
2975 {
2976   /* CHECKME: check for !flag_finite_math_only too?  */
2977   if (SCALAR_FLOAT_TYPE_P (type))
2978     switch (code)
2979       {
2980       case MIN_EXPR:
2981       case MAX_EXPR:
2982         return false;
2983
2984       default:
2985         return !flag_associative_math;
2986       }
2987
2988   if (INTEGRAL_TYPE_P (type))
2989     {
2990       if (!operation_no_trapping_overflow (type, code))
2991         return true;
2992       if (need_wrapping_integral_overflow
2993           && !TYPE_OVERFLOW_WRAPS (type)
2994           && operation_can_overflow (code))
2995         return true;
2996       return false;
2997     }
2998
2999   if (SAT_FIXED_POINT_TYPE_P (type))
3000     return true;
3001
3002   return false;
3003 }
3004
3005 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3006    reduction operation CODE has a handled computation expression.  */
3007
3008 bool
3009 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
3010                       enum tree_code code)
3011 {
3012   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3013   auto_bitmap visited;
3014   tree lookfor = PHI_RESULT (phi);
3015   ssa_op_iter curri;
3016   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3017   while (USE_FROM_PTR (curr) != loop_arg)
3018     curr = op_iter_next_use (&curri);
3019   curri.i = curri.numops;
3020   do
3021     {
3022       path.safe_push (std::make_pair (curri, curr));
3023       tree use = USE_FROM_PTR (curr);
3024       if (use == lookfor)
3025         break;
3026       gimple *def = SSA_NAME_DEF_STMT (use);
3027       if (gimple_nop_p (def)
3028           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3029         {
3030 pop:
3031           do
3032             {
3033               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3034               curri = x.first;
3035               curr = x.second;
3036               do
3037                 curr = op_iter_next_use (&curri);
3038               /* Skip already visited or non-SSA operands (from iterating
3039                  over PHI args).  */
3040               while (curr != NULL_USE_OPERAND_P
3041                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3042                          || ! bitmap_set_bit (visited,
3043                                               SSA_NAME_VERSION
3044                                                 (USE_FROM_PTR (curr)))));
3045             }
3046           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3047           if (curr == NULL_USE_OPERAND_P)
3048             break;
3049         }
3050       else
3051         {
3052           if (gimple_code (def) == GIMPLE_PHI)
3053             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3054           else
3055             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3056           while (curr != NULL_USE_OPERAND_P
3057                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3058                      || ! bitmap_set_bit (visited,
3059                                           SSA_NAME_VERSION
3060                                             (USE_FROM_PTR (curr)))))
3061             curr = op_iter_next_use (&curri);
3062           if (curr == NULL_USE_OPERAND_P)
3063             goto pop;
3064         }
3065     }
3066   while (1);
3067   if (dump_file && (dump_flags & TDF_DETAILS))
3068     {
3069       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3070       unsigned i;
3071       std::pair<ssa_op_iter, use_operand_p> *x;
3072       FOR_EACH_VEC_ELT (path, i, x)
3073         {
3074           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3075           dump_printf (MSG_NOTE, " ");
3076         }
3077       dump_printf (MSG_NOTE, "\n");
3078     }
3079
3080   /* Check whether the reduction path detected is valid.  */
3081   bool fail = path.length () == 0;
3082   bool neg = false;
3083   for (unsigned i = 1; i < path.length (); ++i)
3084     {
3085       gimple *use_stmt = USE_STMT (path[i].second);
3086       tree op = USE_FROM_PTR (path[i].second);
3087       if (! has_single_use (op)
3088           || ! is_gimple_assign (use_stmt))
3089         {
3090           fail = true;
3091           break;
3092         }
3093       if (gimple_assign_rhs_code (use_stmt) != code)
3094         {
3095           if (code == PLUS_EXPR
3096               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3097             {
3098               /* Track whether we negate the reduction value each iteration.  */
3099               if (gimple_assign_rhs2 (use_stmt) == op)
3100                 neg = ! neg;
3101             }
3102           else
3103             {
3104               fail = true;
3105               break;
3106             }
3107         }
3108     }
3109   return ! fail && ! neg;
3110 }
3111
3112
3113 /* Function vect_is_simple_reduction
3114
3115    (1) Detect a cross-iteration def-use cycle that represents a simple
3116    reduction computation.  We look for the following pattern:
3117
3118    loop_header:
3119      a1 = phi < a0, a2 >
3120      a3 = ...
3121      a2 = operation (a3, a1)
3122
3123    or
3124
3125    a3 = ...
3126    loop_header:
3127      a1 = phi < a0, a2 >
3128      a2 = operation (a3, a1)
3129
3130    such that:
3131    1. operation is commutative and associative and it is safe to
3132       change the order of the computation
3133    2. no uses for a2 in the loop (a2 is used out of the loop)
3134    3. no uses of a1 in the loop besides the reduction operation
3135    4. no uses of a1 outside the loop.
3136
3137    Conditions 1,4 are tested here.
3138    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3139
3140    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3141    nested cycles.
3142
3143    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3144    reductions:
3145
3146      a1 = phi < a0, a2 >
3147      inner loop (def of a3)
3148      a2 = phi < a3 >
3149
3150    (4) Detect condition expressions, ie:
3151      for (int i = 0; i < N; i++)
3152        if (a[i] < val)
3153         ret_val = a[i];
3154
3155 */
3156
3157 static gimple *
3158 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3159                           bool *double_reduc,
3160                           bool need_wrapping_integral_overflow,
3161                           enum vect_reduction_type *v_reduc_type)
3162 {
3163   struct loop *loop = (gimple_bb (phi))->loop_father;
3164   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3165   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3166   enum tree_code orig_code, code;
3167   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3168   tree type;
3169   int nloop_uses;
3170   tree name;
3171   imm_use_iterator imm_iter;
3172   use_operand_p use_p;
3173   bool phi_def;
3174
       /* Default outputs; they are refined below when a double reduction or a
          non-default reduction kind is recognized.  */
3175   *double_reduc = false;
3176   *v_reduc_type = TREE_CODE_REDUCTION;
3177
3178   tree phi_name = PHI_RESULT (phi);
3179   /* ???  If there are no uses of the PHI result the inner loop reduction
3180      won't be detected as possibly double-reduction by vectorizable_reduction
3181      because that tries to walk the PHI arg from the preheader edge which
3182      can be constant.  See PR60382.  */
3183   if (has_zero_uses (phi_name))
3184     return NULL;
       /* Every non-debug use of the PHI result must be inside LOOP, and only one
          such use is accepted; it is remembered in PHI_USE_STMT.  */
3185   nloop_uses = 0;
3186   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3187     {
3188       gimple *use_stmt = USE_STMT (use_p);
3189       if (is_gimple_debug (use_stmt))
3190         continue;
3191
3192       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3193         {
3194           if (dump_enabled_p ())
3195             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3196                              "intermediate value used outside loop.\n");
3197
3198           return NULL;
3199         }
3200
3201       nloop_uses++;
3202       if (nloop_uses > 1)
3203         {
3204           if (dump_enabled_p ())
3205             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3206                              "reduction value used in loop.\n");
3207           return NULL;
3208         }
3209
3210       phi_use_stmt = use_stmt;
3211     }
3212
       /* The value arriving over the latch edge must be an SSA name.  */
3213   edge latch_e = loop_latch_edge (loop);
3214   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3215   if (TREE_CODE (loop_arg) != SSA_NAME)
3216     {
3217       if (dump_enabled_p ())
3218         {
3219           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3220                            "reduction: not ssa_name: ");
3221           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3222           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3223         }
3224       return NULL;
3225     }
3226
       /* That value must be defined by an assignment or by a PHI; NAME is the
          SSA name the definition produces.  */
3227   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3228   if (is_gimple_assign (def_stmt))
3229     {
3230       name = gimple_assign_lhs (def_stmt);
3231       phi_def = false;
3232     }
3233   else if (gimple_code (def_stmt) == GIMPLE_PHI)
3234     {
3235       name = PHI_RESULT (def_stmt);
3236       phi_def = true;
3237     }
3238   else
3239     {
3240       if (dump_enabled_p ())
3241         {
3242           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3243                            "reduction: unhandled reduction operation: ");
3244           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3245         }
3246       return NULL;
3247     }
3248
       /* The definition itself has to be located inside LOOP.  */
3249   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3250     return NULL;
3251
       /* Allow at most one non-debug use of NAME inside LOOP.  Uses outside LOOP
          are collected in LCPHIS; the checked cast expects them to be PHIs.  */
3252   nloop_uses = 0;
3253   auto_vec<gphi *, 3> lcphis;
3254   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3255     {
3256       gimple *use_stmt = USE_STMT (use_p);
3257       if (is_gimple_debug (use_stmt))
3258         continue;
3259       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3260         nloop_uses++;
3261       else
3262         /* We can have more than one loop-closed PHI.  */
3263         lcphis.safe_push (as_a <gphi *> (use_stmt));
3264       if (nloop_uses > 1)
3265         {
3266           if (dump_enabled_p ())
3267             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3268                              "reduction used in loop.\n");
3269           return NULL;
3270         }
3271     }
3272
3273   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3274      defined in the inner loop.  */
3275   if (phi_def)
3276     {
3277       op1 = PHI_ARG_DEF (def_stmt, 0);
3278
3279       if (gimple_phi_num_args (def_stmt) != 1
3280           || TREE_CODE (op1) != SSA_NAME)
3281         {
3282           if (dump_enabled_p ())
3283             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3284                              "unsupported phi node definition.\n");
3285
3286           return NULL;
3287         }
3288
3289       def1 = SSA_NAME_DEF_STMT (op1);
3290       if (gimple_bb (def1)
3291           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3292           && loop->inner
3293           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3294           && is_gimple_assign (def1)
3295           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3296         {
3297           if (dump_enabled_p ())
3298             report_vect_op (MSG_NOTE, def_stmt,
3299                             "detected double reduction: ");
3300
3301           *double_reduc = true;
3302           return def_stmt;
3303         }
3304
3305       return NULL;
3306     }
3307
3308   /* If we are vectorizing an inner reduction we are executing that
3309      in the original order only in case we are not dealing with a
3310      double reduction.  */
3311   bool check_reduction = true;
3312   if (flow_loop_nested_p (vect_loop, loop))
3313     {
3314       gphi *lcphi;
3315       unsigned i;
3316       check_reduction = false;
3317       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3318         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3319           {
3320             gimple *use_stmt = USE_STMT (use_p);
3321             if (is_gimple_debug (use_stmt))
3322               continue;
3323             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3324               check_reduction = true;
3325           }
3326     }
3327
       /* CODE may be canonicalized below; ORIG_CODE keeps the statement's
          original operation.  */
3328   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3329   code = orig_code = gimple_assign_rhs_code (def_stmt);
3330
3331   /* We can handle "res -= x[i]", which is non-associative by
3332      simply rewriting this into "res += -x[i]".  Avoid changing
3333      gimple instruction for the first simple tests and only do this
3334      if we're allowed to change code at all.  */
3335   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3336     code = PLUS_EXPR;
3337
       /* For a COND_EXPR, OP3 and OP4 receive the operands of a comparison
          condition (OP3 alone the whole condition otherwise) and OP1 and OP2
          the two value operands.  Any other code must be a commutative and
          associative binary operation, with OP1 and OP2 its operands.  */
3338   if (code == COND_EXPR)
3339     {
3340       if (! nested_in_vect_loop)
3341         *v_reduc_type = COND_REDUCTION;
3342
3343       op3 = gimple_assign_rhs1 (def_stmt);
3344       if (COMPARISON_CLASS_P (op3))
3345         {
3346           op4 = TREE_OPERAND (op3, 1);
3347           op3 = TREE_OPERAND (op3, 0);
3348         }
3349       if (op3 == phi_name || op4 == phi_name)
3350         {
3351           if (dump_enabled_p ())
3352             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3353                             "reduction: condition depends on previous"
3354                             " iteration: ");
3355           return NULL;
3356         }
3357
3358       op1 = gimple_assign_rhs2 (def_stmt);
3359       op2 = gimple_assign_rhs3 (def_stmt);
3360     }
3361   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3362     {
3363       if (dump_enabled_p ())
3364         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3365                         "reduction: not commutative/associative: ");
3366       return NULL;
3367     }
3368   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3369     {
3370       op1 = gimple_assign_rhs1 (def_stmt);
3371       op2 = gimple_assign_rhs2 (def_stmt);
3372     }
3373   else
3374     {
3375       if (dump_enabled_p ())
3376         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3377                         "reduction: not handled operation: ");
3378       return NULL;
3379     }
3380
       /* At least one of the two operands has to be an SSA name.  */
3381   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3382     {
3383       if (dump_enabled_p ())
3384         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3385                         "reduction: both uses not ssa_names: ");
3386
3387       return NULL;
3388     }
3389
       /* Every SSA-name operand must have a type compatible with the result.  */
3390   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3391   if ((TREE_CODE (op1) == SSA_NAME
3392        && !types_compatible_p (type,TREE_TYPE (op1)))
3393       || (TREE_CODE (op2) == SSA_NAME
3394           && !types_compatible_p (type, TREE_TYPE (op2)))
3395       || (op3 && TREE_CODE (op3) == SSA_NAME
3396           && !types_compatible_p (type, TREE_TYPE (op3)))
3397       || (op4 && TREE_CODE (op4) == SSA_NAME
3398           && !types_compatible_p (type, TREE_TYPE (op4))))
3399     {
3400       if (dump_enabled_p ())
3401         {
3402           dump_printf_loc (MSG_NOTE, vect_location,
3403                            "reduction: multiple types: operation type: ");
3404           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3405           dump_printf (MSG_NOTE, ", operands types: ");
3406           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3407                              TREE_TYPE (op1));
3408           dump_printf (MSG_NOTE, ",");
3409           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3410                              TREE_TYPE (op2));
3411           if (op3)
3412             {
3413               dump_printf (MSG_NOTE, ",");
3414               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3415                                  TREE_TYPE (op3));
3416             }
3417
3418           if (op4)
3419             {
3420               dump_printf (MSG_NOTE, ",");
3421               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3422                                  TREE_TYPE (op4));
3423             }
3424           dump_printf (MSG_NOTE, "\n");
3425         }
3426
3427       return NULL;
3428     }
3429
3430   /* Check whether it's ok to change the order of the computation.
3431      Generally, when vectorizing a reduction we change the order of the
3432      computation.  This may change the behavior of the program in some
3433      cases, so we need to check that this is ok.  One exception is when
3434      vectorizing an outer-loop: the inner-loop is executed sequentially,
3435      and therefore vectorizing reductions in the inner-loop during
3436      outer-loop vectorization is safe.  */
3437   if (check_reduction
3438       && *v_reduc_type == TREE_CODE_REDUCTION
3439       && needs_fold_left_reduction_p (type, code,
3440                                       need_wrapping_integral_overflow))
3441     *v_reduc_type = FOLD_LEFT_REDUCTION;
3442
3443   /* Reduction is safe. We're dealing with one of the following:
3444      1) integer arithmetic and no trapv
3445      2) floating point arithmetic, and special flags permit this optimization
3446      3) nested cycle (i.e., outer loop vectorization).  */
3447   if (TREE_CODE (op1) == SSA_NAME)
3448     def1 = SSA_NAME_DEF_STMT (op1);
3449
3450   if (TREE_CODE (op2) == SSA_NAME)
3451     def2 = SSA_NAME_DEF_STMT (op2);
3452
       /* Except for COND_EXPR, give up when neither operand has a defining
          statement other than a GIMPLE_NOP.  */
3453   if (code != COND_EXPR
3454       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3455     {
3456       if (dump_enabled_p ())
3457         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3458       return NULL;
3459     }
3460
3461   /* Check that one def is the reduction def, defined by PHI,
3462      the other def is either defined in the loop ("vect_internal_def"),
3463      or it's an induction (defined by a loop-header phi-node).  */
3464
       /* First form: the second operand is defined by the reduction PHI.  */
3465   if (def2 && def2 == phi
3466       && (code == COND_EXPR
3467           || !def1 || gimple_nop_p (def1)
3468           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3469           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3470               && (is_gimple_assign (def1)
3471                   || is_gimple_call (def1)
3472                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3473                       == vect_induction_def
3474                   || (gimple_code (def1) == GIMPLE_PHI
3475                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3476                           == vect_internal_def
3477                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3478     {
3479       if (dump_enabled_p ())
3480         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3481       return def_stmt;
3482     }
3483
       /* Second form: the first operand is defined by the reduction PHI.  */
3484   if (def1 && def1 == phi
3485       && (code == COND_EXPR
3486           || !def2 || gimple_nop_p (def2)
3487           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3488           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3489               && (is_gimple_assign (def2)
3490                   || is_gimple_call (def2)
3491                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3492                        == vect_induction_def
3493                   || (gimple_code (def2) == GIMPLE_PHI
3494                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3495                            == vect_internal_def
3496                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3497     {
3498       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3499         {
3500           /* Check if we can swap operands (just for simplicity - so that
3501              the rest of the code can assume that the reduction variable
3502              is always the last (second) argument).  */
3503           if (code == COND_EXPR)
3504             {
3505               /* Swap cond_expr by inverting the condition.  */
3506               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3507               enum tree_code invert_code = ERROR_MARK;
3508               enum tree_code cond_code = TREE_CODE (cond_expr);
3509
3510               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3511                 {
3512                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3513                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3514                 }
3515               if (invert_code != ERROR_MARK)
3516                 {
3517                   TREE_SET_CODE (cond_expr, invert_code);
3518                   swap_ssa_operands (def_stmt,
3519                                      gimple_assign_rhs2_ptr (def_stmt),
3520                                      gimple_assign_rhs3_ptr (def_stmt));
3521                 }
3522               else
3523                 {
3524                   if (dump_enabled_p ())
3525                     report_vect_op (MSG_NOTE, def_stmt,
3526                                     "detected reduction: cannot swap operands "
3527                                     "for cond_expr");
3528                   return NULL;
3529                 }
3530             }
3531           else
3532             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3533                                gimple_assign_rhs2_ptr (def_stmt));
3534
3535           if (dump_enabled_p ())
3536             report_vect_op (MSG_NOTE, def_stmt,
3537                             "detected reduction: need to swap operands: ");
3538
3539           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3540             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3541         }
3542       else
3543         {
3544           if (dump_enabled_p ())
3545             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3546         }
3547
3548       return def_stmt;
3549     }
3550
3551   /* Try to find SLP reduction chain.  */
3552   if (! nested_in_vect_loop
3553       && code != COND_EXPR
3554       && orig_code != MINUS_EXPR
3555       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3556     {
3557       if (dump_enabled_p ())
3558         report_vect_op (MSG_NOTE, def_stmt,
3559                         "reduction: detected reduction chain: ");
3560
3561       return def_stmt;
3562     }
3563
3564   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3565   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3566   while (first)
3567     {
3568       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3569       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3570       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3571       first = next;
3572     }
3573
3574   /* Look for the expression computing loop_arg from loop PHI result.  */
3575   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3576                             code))
3577     return def_stmt;
3578
3579   if (dump_enabled_p ())
3580     {
3581       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3582                       "reduction: unknown pattern: ");
3583     }
3584
3585   return NULL;
3586 }
3587
3588 /* Entry point for reduction detection that may rewrite the reduction
3589    statement in place when doing so exposes more reductions.  The
3590    arguments have the same meaning as for vect_is_simple_reduction.  */
3591
3592 gimple *
3593 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3594                              bool *double_reduc,
3595                              bool need_wrapping_integral_overflow)
3596 {
3597   enum vect_reduction_type kind;
3598   gimple *reduc_stmt
3599     = vect_is_simple_reduction (loop_info, phi, double_reduc,
3600                                 need_wrapping_integral_overflow, &kind);
3601   if (!reduc_stmt)
3602     return NULL;
3603   /* Link PHI and the reduction statement to each other and tag both.  */
3604   stmt_vec_info phi_info = vinfo_for_stmt (phi);
3605   STMT_VINFO_REDUC_TYPE (phi_info) = kind;
3606   STMT_VINFO_REDUC_DEF (phi_info) = reduc_stmt;
3607   stmt_vec_info stmt_info = vinfo_for_stmt (reduc_stmt);
3608   STMT_VINFO_REDUC_TYPE (stmt_info) = kind;
3609   STMT_VINFO_REDUC_DEF (stmt_info) = phi;
3610   return reduc_stmt;
3611 }
3612
3613 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.
        The resulting number of epilogue iterations is stored in
        *PEEL_ITERS_EPILOGUE.  Each entry of SCALAR_COST_VEC is recorded in
        PROLOGUE_COST_VEC scaled by the prologue iteration count and in
        EPILOGUE_COST_VEC scaled by the epilogue iteration count.  Returns
        the accumulated result of the record_stmt_cost calls.  */
3614 int
3615 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3616                              int *peel_iters_epilogue,
3617                              stmt_vector_for_cost *scalar_cost_vec,
3618                              stmt_vector_for_cost *prologue_cost_vec,
3619                              stmt_vector_for_cost *epilogue_cost_vec)
3620 {
3621   int retval = 0;
3622   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3623
3624   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3625     {
3626       *peel_iters_epilogue = assumed_vf / 2;
3627       if (dump_enabled_p ())
3628         dump_printf_loc (MSG_NOTE, vect_location,
3629                          "cost model: epilogue peel iters set to vf/2 "
3630                          "because loop iterations are unknown .\n");
3631
3632       /* If peeled iterations are known but number of scalar loop
3633          iterations are unknown, count a taken branch per peeled loop.
                The epilogue branch is recorded in EPILOGUE_COST_VEC and its
                cost is added to, rather than replacing, the prologue branch
                cost.  */
3634       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3635                                  NULL, 0, vect_prologue);
3636       retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3637                                   NULL, 0, vect_epilogue);
3638     }
3639   else
3640     {
           /* The iteration count is known: do not peel more prologue iterations
              than the loop executes; the epilogue receives the remainder left
              after dividing the rest by the vectorization factor.  */
3641       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3642       peel_iters_prologue = niters < peel_iters_prologue ?
3643                             niters : peel_iters_prologue;
3644       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3645       /* If we need to peel for gaps, but no peeling is required, we have to
3646          peel VF iterations.  */
3647       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3648         *peel_iters_epilogue = assumed_vf;
3649     }
3650
       /* Record every SCALAR_COST_VEC entry once per peeled iteration, for the
          prologue and for the epilogue separately.  */
3651   stmt_info_for_cost *si;
3652   int j;
3653   if (peel_iters_prologue)
3654     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3655         {
3656           stmt_vec_info stmt_info
3657             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3658           retval += record_stmt_cost (prologue_cost_vec,
3659                                       si->count * peel_iters_prologue,
3660                                       si->kind, stmt_info, si->misalign,
3661                                       vect_prologue);
3662         }
3663   if (*peel_iters_epilogue)
3664     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3665         {
3666           stmt_vec_info stmt_info
3667             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3668           retval += record_stmt_cost (epilogue_cost_vec,
3669                                       si->count * *peel_iters_epilogue,
3670                                       si->kind, stmt_info, si->misalign,
3671                                       vect_epilogue);
3672         }
3673
3674   return retval;
3675 }
3676
3677 /* Function vect_estimate_min_profitable_iters
3678
3679    Return the number of iterations required for the vector version of the
3680    loop to be profitable relative to the cost of the scalar version of the
3681    loop.
3682
3683    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3684    of iterations for vectorization.  -1 value means loop vectorization
3685    is not profitable.  This returned value may be used for dynamic
3686    profitability check.
3687
3688    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3689    for static check against estimated number of iterations.  */
3690
3691 static void
3692 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3693                                     int *ret_min_profitable_niters,
3694                                     int *ret_min_profitable_estimate)
3695 {
3696   int min_profitable_iters;
3697   int min_profitable_estimate;
3698   int peel_iters_prologue;
3699   int peel_iters_epilogue;
3700   unsigned vec_inside_cost = 0;
3701   int vec_outside_cost = 0;
3702   unsigned vec_prologue_cost = 0;
3703   unsigned vec_epilogue_cost = 0;
3704   int scalar_single_iter_cost = 0;
3705   int scalar_outside_cost = 0;
3706   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3707   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3708   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3709
3710   /* Cost model disabled.  */
3711   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3712     {
3713       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3714       *ret_min_profitable_niters = 0;
3715       *ret_min_profitable_estimate = 0;
3716       return;
3717     }
3718
3719   /* Requires loop versioning tests to handle misalignment.  */
3720   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3721     {
3722       /*  FIXME: Make cost depend on complexity of individual check.  */
3723       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3724       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3725                             vect_prologue);
3726       dump_printf (MSG_NOTE,
3727                    "cost model: Adding cost of checks for loop "
3728                    "versioning to treat misalignment.\n");
3729     }
3730
3731   /* Requires loop versioning with alias checks.  */
3732   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3733     {
3734       /*  FIXME: Make cost depend on complexity of individual check.  */
3735       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3736       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3737                             vect_prologue);
3738       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3739       if (len)
3740         /* Count LEN - 1 ANDs and LEN comparisons.  */
3741         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3742                               NULL, 0, vect_prologue);
3743       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3744       if (len)
3745         {
3746           /* Count LEN - 1 ANDs and LEN comparisons.  */
3747           unsigned int nstmts = len * 2 - 1;
3748           /* +1 for each bias that needs adding.  */
3749           for (unsigned int i = 0; i < len; ++i)
3750             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3751               nstmts += 1;
3752           (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3753                                 NULL, 0, vect_prologue);
3754         }
3755       dump_printf (MSG_NOTE,
3756                    "cost model: Adding cost of checks for loop "
3757                    "versioning aliasing.\n");
3758     }
3759
3760   /* Requires loop versioning with niter checks.  */
3761   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3762     {
3763       /*  FIXME: Make cost depend on complexity of individual check.  */
3764       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3765                             vect_prologue);
3766       dump_printf (MSG_NOTE,
3767                    "cost model: Adding cost of checks for loop "
3768                    "versioning niters.\n");
3769     }
3770
3771   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3772     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3773                           vect_prologue);
3774
3775   /* Count statements in scalar loop.  Using this as scalar cost for a single
3776      iteration for now.
3777
3778      TODO: Add outer loop support.
3779
3780      TODO: Consider assigning different costs to different scalar
3781      statements.  */
3782
3783   scalar_single_iter_cost
3784     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3785
3786   /* Add additional cost for the peeled instructions in prologue and epilogue
3787      loop.  (For fully-masked loops there will be no peeling.)
3788
3789      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3790      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3791
3792      TODO: Build an expression that represents peel_iters for prologue and
3793      epilogue to be used in a run-time test.  */
3794
3795   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3796     {
3797       peel_iters_prologue = 0;
3798       peel_iters_epilogue = 0;
3799
3800       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3801         {
3802           /* We need to peel exactly one iteration.  */
3803           peel_iters_epilogue += 1;
3804           stmt_info_for_cost *si;
3805           int j;
3806           FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3807                             j, si)
3808             {
3809               struct _stmt_vec_info *stmt_info
3810                 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3811               (void) add_stmt_cost (target_cost_data, si->count,
3812                                     si->kind, stmt_info, si->misalign,
3813                                     vect_epilogue);
3814             }
3815         }
3816     }
3817   else if (npeel < 0)
3818     {
3819       peel_iters_prologue = assumed_vf / 2;
3820       dump_printf (MSG_NOTE, "cost model: "
3821                    "prologue peel iters set to vf/2.\n");
3822
3823       /* If peeling for alignment is unknown, loop bound of main loop becomes
3824          unknown.  */
3825       peel_iters_epilogue = assumed_vf / 2;
3826       dump_printf (MSG_NOTE, "cost model: "
3827                    "epilogue peel iters set to vf/2 because "
3828                    "peeling for alignment is unknown.\n");
3829
3830       /* If peeled iterations are unknown, count a taken branch and a not taken
3831          branch per peeled loop. Even if scalar loop iterations are known,
3832          vector iterations are not known since peeled prologue iterations are
3833          not known. Hence guards remain the same.  */
3834       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3835                             NULL, 0, vect_prologue);
3836       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3837                             NULL, 0, vect_prologue);
3838       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3839                             NULL, 0, vect_epilogue);
3840       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3841                             NULL, 0, vect_epilogue);
3842       stmt_info_for_cost *si;
3843       int j;
3844       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3845         {
3846           struct _stmt_vec_info *stmt_info
3847             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3848           (void) add_stmt_cost (target_cost_data,
3849                                 si->count * peel_iters_prologue,
3850                                 si->kind, stmt_info, si->misalign,
3851                                 vect_prologue);
3852           (void) add_stmt_cost (target_cost_data,
3853                                 si->count * peel_iters_epilogue,
3854                                 si->kind, stmt_info, si->misalign,
3855                                 vect_epilogue);
3856         }
3857     }
3858   else
3859     {
3860       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3861       stmt_info_for_cost *si;
3862       int j;
3863       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3864
3865       prologue_cost_vec.create (2);
3866       epilogue_cost_vec.create (2);
3867       peel_iters_prologue = npeel;
3868
3869       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3870                                           &peel_iters_epilogue,
3871                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3872                                             (loop_vinfo),
3873                                           &prologue_cost_vec,
3874                                           &epilogue_cost_vec);
3875
3876       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3877         {
3878           struct _stmt_vec_info *stmt_info
3879             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3880           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3881                                 si->misalign, vect_prologue);
3882         }
3883
3884       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3885         {
3886           struct _stmt_vec_info *stmt_info
3887             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3888           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3889                                 si->misalign, vect_epilogue);
3890         }
3891
3892       prologue_cost_vec.release ();
3893       epilogue_cost_vec.release ();
3894     }
3895
3896   /* FORNOW: The scalar outside cost is incremented in one of the
3897      following ways:
3898
3899      1. The vectorizer checks for alignment and aliasing and generates
3900      a condition that allows dynamic vectorization.  A cost model
3901      check is ANDED with the versioning condition.  Hence scalar code
3902      path now has the added cost of the versioning check.
3903
3904        if (cost > th & versioning_check)
3905          jmp to vector code
3906
3907      Hence run-time scalar is incremented by not-taken branch cost.
3908
3909      2. The vectorizer then checks if a prologue is required.  If the
3910      cost model check was not done before during versioning, it has to
3911      be done before the prologue check.
3912
3913        if (cost <= th)
3914          prologue = scalar_iters
3915        if (prologue == 0)
3916          jmp to vector code
3917        else
3918          execute prologue
3919        if (prologue == num_iters)
3920          go to exit
3921
3922      Hence the run-time scalar cost is incremented by a taken branch,
3923      plus a not-taken branch, plus a taken branch cost.
3924
3925      3. The vectorizer then checks if an epilogue is required.  If the
3926      cost model check was not done before during prologue check, it
3927      has to be done with the epilogue check.
3928
3929        if (prologue == 0)
3930          jmp to vector code
3931        else
3932          execute prologue
3933        if (prologue == num_iters)
3934          go to exit
3935        vector code:
3936          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3937            jmp to epilogue
3938
3939      Hence the run-time scalar cost should be incremented by 2 taken
3940      branches.
3941
3942      TODO: The back end may reorder the BBS's differently and reverse
3943      conditions/branch directions.  Change the estimates below to
3944      something more reasonable.  */
3945
3946   /* If the number of iterations is known and we do not do versioning, we can
3947      decide whether to vectorize at compile time.  Hence the scalar version
3948      do not carry cost model guard costs.  */
3949   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3950       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3951     {
3952       /* Cost model check occurs at versioning.  */
3953       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3954         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3955       else
3956         {
3957           /* Cost model check occurs at prologue generation.  */
3958           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3959             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3960               + vect_get_stmt_cost (cond_branch_not_taken);
3961           /* Cost model check occurs at epilogue generation.  */
3962           else
3963             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3964         }
3965     }
3966
3967   /* Complete the target-specific cost calculations.  */
3968   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3969                &vec_inside_cost, &vec_epilogue_cost);
3970
3971   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3972
3973   if (dump_enabled_p ())
3974     {
3975       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3976       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3977                    vec_inside_cost);
3978       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3979                    vec_prologue_cost);
3980       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3981                    vec_epilogue_cost);
3982       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3983                    scalar_single_iter_cost);
3984       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3985                    scalar_outside_cost);
3986       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3987                    vec_outside_cost);
3988       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3989                    peel_iters_prologue);
3990       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3991                    peel_iters_epilogue);
3992     }
3993
3994   /* Calculate number of iterations required to make the vector version
3995      profitable, relative to the loop bodies only.  The following condition
3996      must hold true:
3997      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3998      where
3999      SIC = scalar iteration cost, VIC = vector iteration cost,
4000      VOC = vector outside cost, VF = vectorization factor,
4001      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
4002      SOC = scalar outside cost for run time cost model check.  */
4003
4004   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
4005     {
4006       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4007                               * assumed_vf
4008                               - vec_inside_cost * peel_iters_prologue
4009                               - vec_inside_cost * peel_iters_epilogue);
4010       if (min_profitable_iters <= 0)
4011         min_profitable_iters = 0;
4012       else
4013         {
4014           min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
4015                                    - vec_inside_cost);
4016
4017           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4018               <= (((int) vec_inside_cost * min_profitable_iters)
4019                   + (((int) vec_outside_cost - scalar_outside_cost)
4020                      * assumed_vf)))
4021             min_profitable_iters++;
4022         }
4023     }
4024   /* vector version will never be profitable.  */
4025   else
4026     {
4027       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4028         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
4029                     "did not happen for a simd loop");
4030
4031       if (dump_enabled_p ())
4032         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4033                          "cost model: the vector iteration cost = %d "
4034                          "divided by the scalar iteration cost = %d "
4035                          "is greater or equal to the vectorization factor = %d"
4036                          ".\n",
4037                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4038       *ret_min_profitable_niters = -1;
4039       *ret_min_profitable_estimate = -1;
4040       return;
4041     }
4042
4043   dump_printf (MSG_NOTE,
4044                "  Calculated minimum iters for profitability: %d\n",
4045                min_profitable_iters);
4046
4047   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4048       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4049     /* We want the vectorized loop to execute at least once.  */
4050     min_profitable_iters = assumed_vf + peel_iters_prologue;
4051
4052   if (dump_enabled_p ())
4053     dump_printf_loc (MSG_NOTE, vect_location,
4054                      "  Runtime profitability threshold = %d\n",
4055                      min_profitable_iters);
4056
4057   *ret_min_profitable_niters = min_profitable_iters;
4058
4059   /* Calculate number of iterations required to make the vector version
4060      profitable, relative to the loop bodies only.
4061
4062      Non-vectorized variant is SIC * niters and it must win over vector
4063      variant on the expected loop trip count.  The following condition must hold true:
4064      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
4065
4066   if (vec_outside_cost <= 0)
4067     min_profitable_estimate = 0;
4068   else
4069     {
4070       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4071                                  * assumed_vf
4072                                  - vec_inside_cost * peel_iters_prologue
4073                                  - vec_inside_cost * peel_iters_epilogue)
4074                                  / ((scalar_single_iter_cost * assumed_vf)
4075                                    - vec_inside_cost);
4076     }
4077   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4078   if (dump_enabled_p ())
4079     dump_printf_loc (MSG_NOTE, vect_location,
4080                      "  Static estimate profitability threshold = %d\n",
4081                      min_profitable_estimate);
4082
4083   *ret_min_profitable_estimate = min_profitable_estimate;
4084 }
4085
4086 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4087    vector elements (not bits) for a vector with NELT elements.  */
4088 static void
4089 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4090                               vec_perm_builder *sel)
4091 {
4092   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4093      by vec_perm_indices.  */
4094   sel->new_vector (nelt, 1, 3);
4095   for (unsigned int i = 0; i < 3; i++)
4096     sel->quick_push (i + offset);
4097 }
4098
4099 /* Checks whether the target supports whole-vector shifts for vectors of mode
4100    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4101    it supports vec_perm_const with masks for all necessary shift amounts.  */
4102 static bool
4103 have_whole_vector_shift (machine_mode mode)
4104 {
4105   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4106     return true;
4107
4108   /* Variable-length vectors should be handled via the optab.  */
4109   unsigned int nelt;
4110   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4111     return false;
4112
4113   vec_perm_builder sel;
4114   vec_perm_indices indices;
4115   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4116     {
4117       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4118       indices.new_vector (sel, 2, nelt);
4119       if (!can_vec_perm_const_p (mode, indices, false))
4120         return false;
4121     }
4122   return true;
4123 }
4124
4125 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4126    functions. Design better to avoid maintenance issues.  */
4127
4128 /* Function vect_model_reduction_cost.
4129
4130    Models cost for a reduction operation, including the vector ops
4131    generated within the strip-mine loop, the initial definition before
4132    the loop, and the epilogue code that must be generated.  */
4133
4134 static void
4135 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4136                            int ncopies)
4137 {
4138   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4139   enum tree_code code;
4140   optab optab;
4141   tree vectype;
4142   gimple *orig_stmt;
4143   machine_mode mode;
4144   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4145   struct loop *loop = NULL;
4146   void *target_cost_data;
4147
4148   if (loop_vinfo)
4149     {
4150       loop = LOOP_VINFO_LOOP (loop_vinfo);
4151       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4152     }
4153   else
4154     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4155
4156   /* Condition reductions generate two reductions in the loop.  */
4157   vect_reduction_type reduction_type
4158     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4159   if (reduction_type == COND_REDUCTION)
4160     ncopies *= 2;
4161
4162   vectype = STMT_VINFO_VECTYPE (stmt_info);
4163   mode = TYPE_MODE (vectype);
4164   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4165
4166   if (!orig_stmt)
4167     orig_stmt = STMT_VINFO_STMT (stmt_info);
4168
4169   code = gimple_assign_rhs_code (orig_stmt);
4170
4171   if (reduction_type == EXTRACT_LAST_REDUCTION
4172       || reduction_type == FOLD_LEFT_REDUCTION)
4173     {
4174       /* No extra instructions needed in the prologue.  */
4175       prologue_cost = 0;
4176
4177       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4178         /* Count one reduction-like operation per vector.  */
4179         inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4180                                      stmt_info, 0, vect_body);
4181       else
4182         {
4183           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4184           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4185           inside_cost = add_stmt_cost (target_cost_data,  nelements,
4186                                        vec_to_scalar, stmt_info, 0,
4187                                        vect_body);
4188           inside_cost += add_stmt_cost (target_cost_data,  nelements,
4189                                         scalar_stmt, stmt_info, 0,
4190                                         vect_body);
4191         }
4192     }
4193   else
4194     {
4195       /* Add in cost for initial definition.
4196          For cond reduction we have four vectors: initial index, step,
4197          initial result of the data reduction, initial value of the index
4198          reduction.  */
4199       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4200       prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4201                                       scalar_to_vec, stmt_info, 0,
4202                                       vect_prologue);
4203
4204       /* Cost of reduction op inside loop.  */
4205       inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4206                                    stmt_info, 0, vect_body);
4207     }
4208
4209   /* Determine cost of epilogue code.
4210
4211      We have a reduction operator that will reduce the vector in one statement.
4212      Also requires scalar extract.  */
4213
4214   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4215     {
4216       if (reduc_fn != IFN_LAST)
4217         {
4218           if (reduction_type == COND_REDUCTION)
4219             {
4220               /* An EQ stmt and an COND_EXPR stmt.  */
4221               epilogue_cost += add_stmt_cost (target_cost_data, 2,
4222                                               vector_stmt, stmt_info, 0,
4223                                               vect_epilogue);
4224               /* Reduction of the max index and a reduction of the found
4225                  values.  */
4226               epilogue_cost += add_stmt_cost (target_cost_data, 2,
4227                                               vec_to_scalar, stmt_info, 0,
4228                                               vect_epilogue);
4229               /* A broadcast of the max value.  */
4230               epilogue_cost += add_stmt_cost (target_cost_data, 1,
4231                                               scalar_to_vec, stmt_info, 0,
4232                                               vect_epilogue);
4233             }
4234           else
4235             {
4236               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4237                                               stmt_info, 0, vect_epilogue);
4238               epilogue_cost += add_stmt_cost (target_cost_data, 1,
4239                                               vec_to_scalar, stmt_info, 0,
4240                                               vect_epilogue);
4241             }
4242         }
4243       else if (reduction_type == COND_REDUCTION)
4244         {
4245           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4246           /* Extraction of scalar elements.  */
4247           epilogue_cost += add_stmt_cost (target_cost_data,
4248                                           2 * estimated_nunits,
4249                                           vec_to_scalar, stmt_info, 0,
4250                                           vect_epilogue);
4251           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4252           epilogue_cost += add_stmt_cost (target_cost_data,
4253                                           2 * estimated_nunits - 3,
4254                                           scalar_stmt, stmt_info, 0,
4255                                           vect_epilogue);
4256         }
4257       else if (reduction_type == EXTRACT_LAST_REDUCTION
4258                || reduction_type == FOLD_LEFT_REDUCTION)
4259         /* No extra instructions need in the epilogue.  */
4260         ;
4261       else
4262         {
4263           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4264           tree bitsize =
4265             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4266           int element_bitsize = tree_to_uhwi (bitsize);
4267           int nelements = vec_size_in_bits / element_bitsize;
4268
4269           if (code == COND_EXPR)
4270             code = MAX_EXPR;
4271
4272           optab = optab_for_tree_code (code, vectype, optab_default);
4273
4274           /* We have a whole vector shift available.  */
4275           if (optab != unknown_optab
4276               && VECTOR_MODE_P (mode)
4277               && optab_handler (optab, mode) != CODE_FOR_nothing
4278               && have_whole_vector_shift (mode))
4279             {
4280               /* Final reduction via vector shifts and the reduction operator.
4281                  Also requires scalar extract.  */
4282               epilogue_cost += add_stmt_cost (target_cost_data,
4283                                               exact_log2 (nelements) * 2,
4284                                               vector_stmt, stmt_info, 0,
4285                                               vect_epilogue);
4286               epilogue_cost += add_stmt_cost (target_cost_data, 1,
4287                                               vec_to_scalar, stmt_info, 0,
4288                                               vect_epilogue);
4289             }
4290           else
4291             /* Use extracts and reduction op for final reduction.  For N
4292                elements, we have N extracts and N-1 reduction ops.  */
4293             epilogue_cost += add_stmt_cost (target_cost_data,
4294                                             nelements + nelements - 1,
4295                                             vector_stmt, stmt_info, 0,
4296                                             vect_epilogue);
4297         }
4298     }
4299
4300   if (dump_enabled_p ())
4301     dump_printf (MSG_NOTE,
4302                  "vect_model_reduction_cost: inside_cost = %d, "
4303                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4304                  prologue_cost, epilogue_cost);
4305 }
4306
4307
4308 /* Function vect_model_induction_cost.
4309
4310    Models cost for induction operations.  */
4311
4312 static void
4313 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4314 {
4315   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4316   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4317   unsigned inside_cost, prologue_cost;
4318
4319   if (PURE_SLP_STMT (stmt_info))
4320     return;
4321
4322   /* loop cost for vec_loop.  */
4323   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4324                                stmt_info, 0, vect_body);
4325
4326   /* prologue cost for vec_init and vec_step.  */
4327   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4328                                  stmt_info, 0, vect_prologue);
4329
4330   if (dump_enabled_p ())
4331     dump_printf_loc (MSG_NOTE, vect_location,
4332                      "vect_model_induction_cost: inside_cost = %d, "
4333                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4334 }
4335
4336
4337
4338 /* Function get_initial_def_for_reduction
4339
4340    Input:
4341    STMT - a stmt that performs a reduction operation in the loop.
4342    INIT_VAL - the initial value of the reduction variable
4343
4344    Output:
4345    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4346         of the reduction (used for adjusting the epilog - see below).
4347    Return a vector variable, initialized according to the operation that STMT
4348         performs. This vector will be used as the initial value of the
4349         vector of partial results.
4350
4351    Option1 (adjust in epilog): Initialize the vector as follows:
4352      add/bit or/xor:    [0,0,...,0,0]
4353      mult/bit and:      [1,1,...,1,1]
4354      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4355    and when necessary (e.g. add/mult case) let the caller know
4356    that it needs to adjust the result by init_val.
4357
4358    Option2: Initialize the vector as follows:
4359      add/bit or/xor:    [init_val,0,0,...,0]
4360      mult/bit and:      [init_val,1,1,...,1]
4361      min/max/cond_expr: [init_val,init_val,...,init_val]
4362    and no adjustments are needed.
4363
4364    For example, for the following code:
4365
4366    s = init_val;
4367    for (i=0;i<n;i++)
4368      s = s + a[i];
4369
4370    STMT is 's = s + a[i]', and the reduction variable is 's'.
4371    For a vector of 4 units, we want to return either [0,0,0,init_val],
4372    or [0,0,0,0] and let the caller know that it needs to adjust
4373    the result at the end by 'init_val'.
4374
4375    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4376    initialization vector is simpler (same element in all entries), if
4377    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4378
4379    A cost model should help decide between these two schemes.  */
4380
4381 tree
4382 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4383                                tree *adjustment_def)
4384 {
4385   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4386   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4387   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4388   tree scalar_type = TREE_TYPE (init_val);
4389   tree vectype = get_vectype_for_scalar_type (scalar_type);
4390   enum tree_code code = gimple_assign_rhs_code (stmt);
4391   tree def_for_init;
4392   tree init_def;
4393   bool nested_in_vect_loop = false;
4394   REAL_VALUE_TYPE real_init_val = dconst0;
4395   int int_init_val = 0;
4396   gimple *def_stmt = NULL;
4397   gimple_seq stmts = NULL;
4398
4399   gcc_assert (vectype);
4400
4401   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4402               || SCALAR_FLOAT_TYPE_P (scalar_type));
4403
4404   if (nested_in_vect_loop_p (loop, stmt))
4405     nested_in_vect_loop = true;
4406   else
4407     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4408
4409   /* In case of double reduction we only create a vector variable to be put
4410      in the reduction phi node.  The actual statement creation is done in
4411      vect_create_epilog_for_reduction.  */
4412   if (adjustment_def && nested_in_vect_loop
4413       && TREE_CODE (init_val) == SSA_NAME
4414       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4415       && gimple_code (def_stmt) == GIMPLE_PHI
4416       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4417       && vinfo_for_stmt (def_stmt)
4418       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4419           == vect_double_reduction_def)
4420     {
4421       *adjustment_def = NULL;
4422       return vect_create_destination_var (init_val, vectype);
4423     }
4424
4425   vect_reduction_type reduction_type
4426     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4427
4428   /* In case of a nested reduction do not use an adjustment def as
4429      that case is not supported by the epilogue generation correctly
4430      if ncopies is not one.  */
4431   if (adjustment_def && nested_in_vect_loop)
4432     {
4433       *adjustment_def = NULL;
4434       return vect_get_vec_def_for_operand (init_val, stmt);
4435     }
4436
4437   switch (code)
4438     {
4439     case WIDEN_SUM_EXPR:
4440     case DOT_PROD_EXPR:
4441     case SAD_EXPR:
4442     case PLUS_EXPR:
4443     case MINUS_EXPR:
4444     case BIT_IOR_EXPR:
4445     case BIT_XOR_EXPR:
4446     case MULT_EXPR:
4447     case BIT_AND_EXPR:
4448       {
4449         /* ADJUSTMENT_DEF is NULL when called from
4450            vect_create_epilog_for_reduction to vectorize double reduction.  */
4451         if (adjustment_def)
4452           *adjustment_def = init_val;
4453
4454         if (code == MULT_EXPR)
4455           {
4456             real_init_val = dconst1;
4457             int_init_val = 1;
4458           }
4459
4460         if (code == BIT_AND_EXPR)
4461           int_init_val = -1;
4462
4463         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4464           def_for_init = build_real (scalar_type, real_init_val);
4465         else
4466           def_for_init = build_int_cst (scalar_type, int_init_val);
4467
4468         if (adjustment_def)
4469           /* Option1: the first element is '0' or '1' as well.  */
4470           init_def = gimple_build_vector_from_val (&stmts, vectype,
4471                                                    def_for_init);
4472         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4473           {
4474             /* Option2 (variable length): the first element is INIT_VAL.  */
4475             init_def = build_vector_from_val (vectype, def_for_init);
4476             gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4477                                                       2, init_def, init_val);
4478             init_def = make_ssa_name (vectype);
4479             gimple_call_set_lhs (call, init_def);
4480             gimple_seq_add_stmt (&stmts, call);
4481           }
4482         else
4483           {
4484             /* Option2: the first element is INIT_VAL.  */
4485             tree_vector_builder elts (vectype, 1, 2);
4486             elts.quick_push (init_val);
4487             elts.quick_push (def_for_init);
4488             init_def = gimple_build_vector (&stmts, &elts);
4489           }
4490       }
4491       break;
4492
4493     case MIN_EXPR:
4494     case MAX_EXPR:
4495     case COND_EXPR:
4496       {
4497         if (adjustment_def)
4498           {
4499             *adjustment_def = NULL_TREE;
4500             if (reduction_type != COND_REDUCTION
4501                 && reduction_type != EXTRACT_LAST_REDUCTION)
4502               {
4503                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4504                 break;
4505               }
4506           }
4507         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4508         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4509       }
4510       break;
4511
4512     default:
4513       gcc_unreachable ();
4514     }
4515
4516   if (stmts)
4517     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4518   return init_def;
4519 }
4520
4521 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4522    NUMBER_OF_VECTORS is the number of vector defs to create.
4523    If NEUTRAL_OP is nonnull, introducing extra elements of that
4524    value will not change the result.  */
4525
4526 static void
4527 get_initial_defs_for_reduction (slp_tree slp_node,
4528                                 vec<tree> *vec_oprnds,
4529                                 unsigned int number_of_vectors,
4530                                 bool reduc_chain, tree neutral_op)
4531 {
4532   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4533   gimple *stmt = stmts[0];
4534   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4535   unsigned HOST_WIDE_INT nunits;
4536   unsigned j, number_of_places_left_in_vector;
4537   tree vector_type;
4538   tree vop;
4539   int group_size = stmts.length ();
4540   unsigned int vec_num, i;
4541   unsigned number_of_copies = 1;
4542   vec<tree> voprnds;
4543   voprnds.create (number_of_vectors);
4544   struct loop *loop;
4545   auto_vec<tree, 16> permute_results;
4546
4547   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4548
4549   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4550
4551   loop = (gimple_bb (stmt))->loop_father;
4552   gcc_assert (loop);
4553   edge pe = loop_preheader_edge (loop);
4554
4555   gcc_assert (!reduc_chain || neutral_op);
4556
4557   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4558      created vectors. It is greater than 1 if unrolling is performed.
4559
4560      For example, we have two scalar operands, s1 and s2 (e.g., group of
4561      strided accesses of size two), while NUNITS is four (i.e., four scalars
4562      of this type can be packed in a vector).  The output vector will contain
4563      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4564      will be 2).
4565
4566      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4567      containing the operands.
4568
4569      For example, NUNITS is four as before, and the group size is 8
4570      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4571      {s5, s6, s7, s8}.  */
4572
4573   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4574     nunits = group_size;
4575
4576   number_of_copies = nunits * number_of_vectors / group_size;
4577
4578   number_of_places_left_in_vector = nunits;
4579   bool constant_p = true;
4580   tree_vector_builder elts (vector_type, nunits, 1);
4581   elts.quick_grow (nunits);
4582   for (j = 0; j < number_of_copies; j++)
4583     {
4584       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4585         {
4586           tree op;
4587           /* Get the def before the loop.  In reduction chain we have only
4588              one initial value.  */
4589           if ((j != (number_of_copies - 1)
4590                || (reduc_chain && i != 0))
4591               && neutral_op)
4592             op = neutral_op;
4593           else
4594             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4595
4596           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4597           number_of_places_left_in_vector--;
4598           elts[number_of_places_left_in_vector] = op;
4599           if (!CONSTANT_CLASS_P (op))
4600             constant_p = false;
4601
4602           if (number_of_places_left_in_vector == 0)
4603             {
4604               gimple_seq ctor_seq = NULL;
4605               tree init;
4606               if (constant_p && !neutral_op
4607                   ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4608                   : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4609                 /* Build the vector directly from ELTS.  */
4610                 init = gimple_build_vector (&ctor_seq, &elts);
4611               else if (neutral_op)
4612                 {
4613                   /* Build a vector of the neutral value and shift the
4614                      other elements into place.  */
4615                   init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4616                                                        neutral_op);
4617                   int k = nunits;
4618                   while (k > 0 && elts[k - 1] == neutral_op)
4619                     k -= 1;
4620                   while (k > 0)
4621                     {
4622                       k -= 1;
4623                       gcall *call = gimple_build_call_internal
4624                         (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4625                       init = make_ssa_name (vector_type);
4626                       gimple_call_set_lhs (call, init);
4627                       gimple_seq_add_stmt (&ctor_seq, call);
4628                     }
4629                 }
4630               else
4631                 {
4632                   /* First time round, duplicate ELTS to fill the
4633                      required number of vectors, then cherry pick the
4634                      appropriate result for each iteration.  */
4635                   if (vec_oprnds->is_empty ())
4636                     duplicate_and_interleave (&ctor_seq, vector_type, elts,
4637                                               number_of_vectors,
4638                                               permute_results);
4639                   init = permute_results[number_of_vectors - j - 1];
4640                 }
4641               if (ctor_seq != NULL)
4642                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4643               voprnds.quick_push (init);
4644
4645               number_of_places_left_in_vector = nunits;
4646               elts.new_vector (vector_type, nunits, 1);
4647               elts.quick_grow (nunits);
4648               constant_p = true;
4649             }
4650         }
4651     }
4652
4653   /* Since the vectors are created in the reverse order, we should invert
4654      them.  */
4655   vec_num = voprnds.length ();
4656   for (j = vec_num; j != 0; j--)
4657     {
4658       vop = voprnds[j - 1];
4659       vec_oprnds->quick_push (vop);
4660     }
4661
4662   voprnds.release ();
4663
4664   /* In case that VF is greater than the unrolling factor needed for the SLP
4665      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4666      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4667      to replicate the vectors.  */
4668   tree neutral_vec = NULL;
4669   while (number_of_vectors > vec_oprnds->length ())
4670     {
4671       if (neutral_op)
4672         {
4673           if (!neutral_vec)
4674             {
4675               gimple_seq ctor_seq = NULL;
4676               neutral_vec = gimple_build_vector_from_val
4677                 (&ctor_seq, vector_type, neutral_op);
4678               if (ctor_seq != NULL)
4679                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4680             }
4681           vec_oprnds->quick_push (neutral_vec);
4682         }
4683       else
4684         {
4685           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4686             vec_oprnds->quick_push (vop);
4687         }
4688     }
4689 }
4690
4691
4692 /* Function vect_create_epilog_for_reduction
4693
4694    Create code at the loop-epilog to finalize the result of a reduction
4695    computation.
4696
4697    VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of the
4698      vector reduction statements.
4699    STMT is the scalar reduction stmt that is being vectorized.
4700    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4701      number of elements that we can fit in a vectype (nunits).  In this case
4702      we have to generate more than one vector stmt - i.e - we need to "unroll"
4703      the vector stmt by a factor VF/nunits.  For more details see documentation
4704      in vectorizable_operation.
4705    REDUC_FN is the internal function for the epilog reduction.
4706    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4707      computation.
4708    REDUC_INDEX is the index of the operand in the right hand side of the
4709      statement that is defined by REDUCTION_PHI.
4710    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4711    SLP_NODE is an SLP node containing a group of reduction statements. The
4712      first one in this group is STMT.
4713    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4714      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4715      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4716      any value of the IV in the loop.
4717    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4718    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4719      null if this is not an SLP reduction.
4720
4721    This function:
4722    1. Creates the reduction def-use cycles: sets the arguments for
4723       REDUCTION_PHIS:
4724       The loop-entry argument is the vectorized initial-value of the reduction.
4725       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4726       sums.
4727    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4728       by calling the function specified by REDUC_FN if available, or by
4729       other means (whole-vector shifts or a scalar loop).
4730       The function also creates a new phi node at the loop exit to preserve
4731       loop-closed form, as illustrated below.
4732
4733      The flow at the entry to this function:
4734
4735         loop:
4736           vec_def = phi <null, null>            # REDUCTION_PHI
4737           VECT_DEF = vector_stmt                # vectorized form of STMT
4738           s_loop = scalar_stmt                  # (scalar) STMT
4739         loop_exit:
4740           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4741           use <s_out0>
4742           use <s_out0>
4743
4744      The above is transformed by this function into:
4745
4746         loop:
4747           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4748           VECT_DEF = vector_stmt                # vectorized form of STMT
4749           s_loop = scalar_stmt                  # (scalar) STMT
4750         loop_exit:
4751           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4752           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4753           v_out2 = reduce <v_out1>
4754           s_out3 = extract_field <v_out2, 0>
4755           s_out4 = adjust_result <s_out3>
4756           use <s_out4>
4757           use <s_out4>
4758 */
4759
4760 static void
4761 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4762                                   gimple *reduc_def_stmt,
4763                                   int ncopies, internal_fn reduc_fn,
4764                                   vec<gimple *> reduction_phis,
4765                                   bool double_reduc,
4766                                   slp_tree slp_node,
4767                                   slp_instance slp_node_instance,
4768                                   tree induc_val, enum tree_code induc_code,
4769                                   tree neutral_op)
4770 {
4771   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4772   stmt_vec_info prev_phi_info;
4773   tree vectype;
4774   machine_mode mode;
4775   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4776   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4777   basic_block exit_bb;
4778   tree scalar_dest;
4779   tree scalar_type;
4780   gimple *new_phi = NULL, *phi;
4781   gimple_stmt_iterator exit_gsi;
4782   tree vec_dest;
4783   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4784   gimple *epilog_stmt = NULL;
4785   enum tree_code code = gimple_assign_rhs_code (stmt);
4786   gimple *exit_phi;
4787   tree bitsize;
4788   tree adjustment_def = NULL;
4789   tree vec_initial_def = NULL;
4790   tree expr, def, initial_def = NULL;
4791   tree orig_name, scalar_result;
4792   imm_use_iterator imm_iter, phi_imm_iter;
4793   use_operand_p use_p, phi_use_p;
4794   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4795   bool nested_in_vect_loop = false;
4796   auto_vec<gimple *> new_phis;
4797   auto_vec<gimple *> inner_phis;
4798   enum vect_def_type dt = vect_unknown_def_type;
4799   int j, i;
4800   auto_vec<tree> scalar_results;
4801   unsigned int group_size = 1, k, ratio;
4802   auto_vec<tree> vec_initial_defs;
4803   auto_vec<gimple *> phis;
4804   bool slp_reduc = false;
4805   bool direct_slp_reduc;
4806   tree new_phi_result;
4807   gimple *inner_phi = NULL;
4808   tree induction_index = NULL_TREE;
4809
4810   if (slp_node)
4811     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4812
4813   if (nested_in_vect_loop_p (loop, stmt))
4814     {
4815       outer_loop = loop;
4816       loop = loop->inner;
4817       nested_in_vect_loop = true;
4818       gcc_assert (!slp_node);
4819     }
4820
4821   vectype = STMT_VINFO_VECTYPE (stmt_info);
4822   gcc_assert (vectype);
4823   mode = TYPE_MODE (vectype);
4824
4825   /* 1. Create the reduction def-use cycle:
4826      Set the arguments of REDUCTION_PHIS, i.e., transform
4827
4828         loop:
4829           vec_def = phi <null, null>            # REDUCTION_PHI
4830           VECT_DEF = vector_stmt                # vectorized form of STMT
4831           ...
4832
4833      into:
4834
4835         loop:
4836           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4837           VECT_DEF = vector_stmt                # vectorized form of STMT
4838           ...
4839
4840      (in case of SLP, do it for all the phis). */
4841
4842   /* Get the loop-entry arguments.  */
4843   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4844   if (slp_node)
4845     {
4846       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4847       vec_initial_defs.reserve (vec_num);
4848       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4849                                       &vec_initial_defs, vec_num,
4850                                       GROUP_FIRST_ELEMENT (stmt_info),
4851                                       neutral_op);
4852     }
4853   else
4854     {
4855       /* Get at the scalar def before the loop, that defines the initial value
4856          of the reduction variable.  */
4857       gimple *def_stmt;
4858       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4859                                            loop_preheader_edge (loop));
4860       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4861          and we can't use zero for induc_val, use initial_def.  Similarly
4862          for REDUC_MIN and initial_def larger than the base.  */
4863       if (TREE_CODE (initial_def) == INTEGER_CST
4864           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4865               == INTEGER_INDUC_COND_REDUCTION)
4866           && !integer_zerop (induc_val)
4867           && ((induc_code == MAX_EXPR
4868                && tree_int_cst_lt (initial_def, induc_val))
4869               || (induc_code == MIN_EXPR
4870                   && tree_int_cst_lt (induc_val, initial_def))))
4871         induc_val = initial_def;
4872       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4873       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4874                                                        &adjustment_def);
4875       vec_initial_defs.create (1);
4876       vec_initial_defs.quick_push (vec_initial_def);
4877     }
4878
4879   /* Set phi nodes arguments.  */
4880   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4881     {
4882       tree vec_init_def = vec_initial_defs[i];
4883       tree def = vect_defs[i];
4884       for (j = 0; j < ncopies; j++)
4885         {
4886           if (j != 0)
4887             {
4888               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4889               if (nested_in_vect_loop)
4890                 vec_init_def
4891                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4892                                                     vec_init_def);
4893             }
4894
4895           /* Set the loop-entry arg of the reduction-phi.  */
4896
4897           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4898               == INTEGER_INDUC_COND_REDUCTION)
4899             {
4900               /* Initialise the reduction phi to INDUC_VAL.  This prevents
4901                  non-zero initial values interfering with the reduction op.  */
4902               gcc_assert (ncopies == 1);
4903               gcc_assert (i == 0);
4904
4905               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4906               tree induc_val_vec
4907                 = build_vector_from_val (vec_init_def_type, induc_val);
4908
4909               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4910                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4911             }
4912           else
4913             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4914                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4915
4916           /* Set the loop-latch arg for the reduction-phi.  */
4917           if (j > 0)
4918             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4919
4920           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4921                        UNKNOWN_LOCATION);
4922
4923           if (dump_enabled_p ())
4924             {
4925               dump_printf_loc (MSG_NOTE, vect_location,
4926                                "transform reduction: created def-use cycle: ");
4927               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4928               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4929             }
4930         }
4931     }
4932
4933   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4934      which is updated with the current index of the loop for every match of
4935      the original loop's cond_expr (VEC_STMT).  This results in a vector
4936      containing the last time the condition passed for that vector lane.
4937      The first match will be a 1 to allow 0 to be used for non-matching
4938      indexes.  If there are no matches at all then the vector will be all
4939      zeroes.  */
4940   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4941     {
4942       tree indx_before_incr, indx_after_incr;
4943       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4944
4945       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4946       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4947
4948       int scalar_precision
4949         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4950       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4951       tree cr_index_vector_type = build_vector_type
4952         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4953
4954       /* First we create a simple vector induction variable which starts
4955          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4956          vector size (STEP).  */
4957
4958       /* Create a {1,2,3,...} vector.  */
4959       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4960
4961       /* Create a vector of the step value.  */
4962       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4963       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4964
4965       /* Create an induction variable.  */
4966       gimple_stmt_iterator incr_gsi;
4967       bool insert_after;
4968       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4969       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4970                  insert_after, &indx_before_incr, &indx_after_incr);
4971
4972       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4973          filled with zeros (VEC_ZERO).  */
4974
4975       /* Create a vector of 0s.  */
4976       tree zero = build_zero_cst (cr_index_scalar_type);
4977       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4978
4979       /* Create a vector phi node.  */
4980       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4981       new_phi = create_phi_node (new_phi_tree, loop->header);
4982       set_vinfo_for_stmt (new_phi,
4983                           new_stmt_vec_info (new_phi, loop_vinfo));
4984       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4985                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4986
4987       /* Now take the condition from the loop's original cond_expr
4988          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4989          every match uses values from the induction variable
4990          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4991          (NEW_PHI_TREE).
4992          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4993          the new cond_expr (INDEX_COND_EXPR).  */
4994
4995       /* Duplicate the condition from vec_stmt.  */
4996       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4997
4998       /* Create a conditional, where the condition is taken from vec_stmt
4999          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
5000          else is the phi (NEW_PHI_TREE).  */
5001       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
5002                                      ccompare, indx_before_incr,
5003                                      new_phi_tree);
5004       induction_index = make_ssa_name (cr_index_vector_type);
5005       gimple *index_condition = gimple_build_assign (induction_index,
5006                                                      index_cond_expr);
5007       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
5008       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
5009                                                         loop_vinfo);
5010       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
5011       set_vinfo_for_stmt (index_condition, index_vec_info);
5012
5013       /* Update the phi with the vec cond.  */
5014       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5015                    loop_latch_edge (loop), UNKNOWN_LOCATION);
5016     }
5017
5018   /* 2. Create epilog code.
5019         The reduction epilog code operates across the elements of the vector
5020         of partial results computed by the vectorized loop.
5021         The reduction epilog code consists of:
5022
5023         step 1: compute the scalar result in a vector (v_out2)
5024         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5025         step 3: adjust the scalar result (s_out3) if needed.
5026
5027         Step 1 can be accomplished using one of the following three schemes:
5028           (scheme 1) using reduc_fn, if available.
5029           (scheme 2) using whole-vector shifts, if available.
5030           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5031                      combined.
5032
5033           The overall epilog code looks like this:
5034
5035           s_out0 = phi <s_loop>         # original EXIT_PHI
5036           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5037           v_out2 = reduce <v_out1>              # step 1
5038           s_out3 = extract_field <v_out2, 0>    # step 2
5039           s_out4 = adjust_result <s_out3>       # step 3
5040
5041           (step 3 is optional, and steps 1 and 2 may be combined).
5042           Lastly, the uses of s_out0 are replaced by s_out4.  */
5043
5044
5045   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5046          v_out1 = phi <VECT_DEF>
5047          Store them in NEW_PHIS.  */
5048
5049   exit_bb = single_exit (loop)->dest;
5050   prev_phi_info = NULL;
5051   new_phis.create (vect_defs.length ());
5052   FOR_EACH_VEC_ELT (vect_defs, i, def)
5053     {
5054       for (j = 0; j < ncopies; j++)
5055         {
5056           tree new_def = copy_ssa_name (def);
5057           phi = create_phi_node (new_def, exit_bb);
5058           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
5059           if (j == 0)
5060             new_phis.quick_push (phi);
5061           else
5062             {
5063               def = vect_get_vec_def_for_stmt_copy (dt, def);
5064               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5065             }
5066
5067           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5068           prev_phi_info = vinfo_for_stmt (phi);
5069         }
5070     }
5071
5072   /* The epilogue is created for the outer-loop, i.e., for the loop being
5073      vectorized.  Create exit phis for the outer loop.  */
5074   if (double_reduc)
5075     {
5076       loop = outer_loop;
5077       exit_bb = single_exit (loop)->dest;
5078       inner_phis.create (vect_defs.length ());
5079       FOR_EACH_VEC_ELT (new_phis, i, phi)
5080         {
5081           tree new_result = copy_ssa_name (PHI_RESULT (phi));
5082           gphi *outer_phi = create_phi_node (new_result, exit_bb);
5083           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5084                            PHI_RESULT (phi));
5085           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5086                                                             loop_vinfo));
5087           inner_phis.quick_push (phi);
5088           new_phis[i] = outer_phi;
5089           prev_phi_info = vinfo_for_stmt (outer_phi);
5090           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5091             {
5092               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5093               new_result = copy_ssa_name (PHI_RESULT (phi));
5094               outer_phi = create_phi_node (new_result, exit_bb);
5095               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5096                                PHI_RESULT (phi));
5097               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5098                                                                 loop_vinfo));
5099               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5100               prev_phi_info = vinfo_for_stmt (outer_phi);
5101             }
5102         }
5103     }
5104
5105   exit_gsi = gsi_after_labels (exit_bb);
5106
5107   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5108          (i.e. when reduc_fn is not available) and in the final adjustment
5109          code (if needed).  Also get the original scalar reduction variable as
5110          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5111          represents a reduction pattern), the tree-code and scalar-def are
5112          taken from the original stmt that the pattern-stmt (STMT) replaces.
5113          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5114          are taken from STMT.  */
5115
5116   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5117   if (!orig_stmt)
5118     {
5119       /* Regular reduction  */
5120       orig_stmt = stmt;
5121     }
5122   else
5123     {
5124       /* Reduction pattern  */
5125       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5126       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5127       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5128     }
5129
5130   code = gimple_assign_rhs_code (orig_stmt);
5131   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5132      partial results are added and not subtracted.  */
5133   if (code == MINUS_EXPR)
5134     code = PLUS_EXPR;
5135
5136   scalar_dest = gimple_assign_lhs (orig_stmt);
5137   scalar_type = TREE_TYPE (scalar_dest);
5138   scalar_results.create (group_size);
5139   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5140   bitsize = TYPE_SIZE (scalar_type);
5141
5142   /* In case this is a reduction in an inner-loop while vectorizing an outer
5143      loop - we don't need to extract a single scalar result at the end of the
5144      inner-loop (unless it is double reduction, i.e., the use of reduction is
5145      outside the outer-loop).  The final vector of partial results will be used
5146      in the vectorized outer-loop, or reduced to a scalar result at the end of
5147      the outer-loop.  */
5148   if (nested_in_vect_loop && !double_reduc)
5149     goto vect_finalize_reduction;
5150
5151   /* SLP reduction without reduction chain, e.g.,
5152      # a1 = phi <a2, a0>
5153      # b1 = phi <b2, b0>
5154      a2 = operation (a1)
5155      b2 = operation (b1)  */
5156   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5157
5158   /* True if we should implement SLP_REDUC using native reduction operations
5159      instead of scalar operations.  */
5160   direct_slp_reduc = (reduc_fn != IFN_LAST
5161                       && slp_reduc
5162                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5163
5164   /* In case of reduction chain, e.g.,
5165      # a1 = phi <a3, a0>
5166      a2 = operation (a1)
5167      a3 = operation (a2),
5168
5169      we may end up with more than one vector result.  Here we reduce them to
5170      one vector.  */
5171   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5172     {
5173       tree first_vect = PHI_RESULT (new_phis[0]);
5174       gassign *new_vec_stmt = NULL;
5175       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5176       for (k = 1; k < new_phis.length (); k++)
5177         {
5178           gimple *next_phi = new_phis[k];
5179           tree second_vect = PHI_RESULT (next_phi);
5180           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5181           new_vec_stmt = gimple_build_assign (tem, code,
5182                                               first_vect, second_vect);
5183           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5184           first_vect = tem;
5185         }
5186
5187       new_phi_result = first_vect;
5188       if (new_vec_stmt)
5189         {
5190           new_phis.truncate (0);
5191           new_phis.safe_push (new_vec_stmt);
5192         }
5193     }
5194   /* Likewise if we couldn't use a single def-use cycle.  */
5195   else if (ncopies > 1)
5196     {
5197       gcc_assert (new_phis.length () == 1);
5198       tree first_vect = PHI_RESULT (new_phis[0]);
5199       gassign *new_vec_stmt = NULL;
5200       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5201       gimple *next_phi = new_phis[0];
5202       for (int k = 1; k < ncopies; ++k)
5203         {
5204           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5205           tree second_vect = PHI_RESULT (next_phi);
5206           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5207           new_vec_stmt = gimple_build_assign (tem, code,
5208                                               first_vect, second_vect);
5209           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5210           first_vect = tem;
5211         }
5212       new_phi_result = first_vect;
5213       new_phis.truncate (0);
5214       new_phis.safe_push (new_vec_stmt);
5215     }
5216   else
5217     new_phi_result = PHI_RESULT (new_phis[0]);
5218
5219   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5220       && reduc_fn != IFN_LAST)
5221     {
5222       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5223          various data values where the condition matched and another vector
5224          (INDUCTION_INDEX) containing all the indexes of those matches.  We
5225          need to extract the last matching index (which will be the index with
5226          highest value) and use this to index into the data vector.
5227          For the case where there were no matches, the data vector will contain
5228          all default values and the index vector will be all zeros.  */
5229
5230       /* Get various versions of the type of the vector of indexes.  */
5231       tree index_vec_type = TREE_TYPE (induction_index);
5232       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5233       tree index_scalar_type = TREE_TYPE (index_vec_type);
5234       tree index_vec_cmp_type = build_same_sized_truth_vector_type
5235         (index_vec_type);
5236
5237       /* Get an unsigned integer version of the type of the data vector.  */
5238       int scalar_precision
5239         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5240       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5241       tree vectype_unsigned = build_vector_type
5242         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5243
5244       /* First we need to create a vector (ZERO_VEC) of zeros and another
5245          vector (MAX_INDEX_VEC) filled with the last matching index, which we
5246          can create using a MAX reduction and then expanding.
5247          In the case where the loop never made any matches, the max index will
5248          be zero.  */
5249
5250       /* Vector of {0, 0, 0,...}.  */
5251       tree zero_vec = make_ssa_name (vectype);
5252       tree zero_vec_rhs = build_zero_cst (vectype);
5253       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5254       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5255
5256       /* Find maximum value from the vector of found indexes.  */
5257       tree max_index = make_ssa_name (index_scalar_type);
5258       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5259                                                           1, induction_index);
5260       gimple_call_set_lhs (max_index_stmt, max_index);
5261       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5262
5263       /* Vector of {max_index, max_index, max_index,...}.  */
5264       tree max_index_vec = make_ssa_name (index_vec_type);
5265       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5266                                                       max_index);
5267       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5268                                                         max_index_vec_rhs);
5269       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5270
5271       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5272          with the vector (INDUCTION_INDEX) of found indexes, choosing values
5273          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5274          otherwise.  Only one value should match, resulting in a vector
5275          (VEC_COND) with one data value and the rest zeros.
5276          In the case where the loop never made any matches, every index will
5277          match, resulting in a vector with all data values (which will all be
5278          the default value).  */
5279
5280       /* Compare the max index vector to the vector of found indexes to find
5281          the position of the max value.  */
5282       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5283       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5284                                                       induction_index,
5285                                                       max_index_vec);
5286       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5287
5288       /* Use the compare to choose either values from the data vector or
5289          zero.  */
5290       tree vec_cond = make_ssa_name (vectype);
5291       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5292                                                    vec_compare, new_phi_result,
5293                                                    zero_vec);
5294       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5295
5296       /* Finally we need to extract the data value from the vector (VEC_COND)
5297          into a scalar (DATA_REDUC).  Logically we want to do an OR
5298          reduction, but because this doesn't exist, we can use a MAX reduction
5299          instead.  The data value might be signed or a float so we need to cast
5300          it first.
5301          In the case where the loop never made any matches, the data values are
5302          all identical, and so will reduce down correctly.  */
5303
5304       /* Make the matched data values unsigned.  */
5305       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5306       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5307                                        vec_cond);
5308       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5309                                                         VIEW_CONVERT_EXPR,
5310                                                         vec_cond_cast_rhs);
5311       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5312
5313       /* Reduce down to a scalar value.  */
5314       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5315       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5316                                                            1, vec_cond_cast);
5317       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5318       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5319
5320       /* Convert the reduced value back to the result type and set as the
5321          result.  */
5322       gimple_seq stmts = NULL;
5323       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5324                                data_reduc);
5325       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5326       scalar_results.safe_push (new_temp);
5327     }
5328   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5329            && reduc_fn == IFN_LAST)
5330     {
5331       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5332          idx = 0;
5333          idx_val = induction_index[0];
5334          val = data_reduc[0];
5335          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5336            if (induction_index[i] > idx_val)
5337              val = data_reduc[i], idx_val = induction_index[i];
5338          return val;  */
5339
5340       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5341       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5342       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5343       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5344       /* Enforced by vectorizable_reduction, which ensures we have target
5345          support before allowing a conditional reduction on variable-length
5346          vectors.  */
5347       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5348       tree idx_val = NULL_TREE, val = NULL_TREE;
5349       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5350         {
5351           tree old_idx_val = idx_val;
5352           tree old_val = val;
5353           idx_val = make_ssa_name (idx_eltype);
5354           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5355                                              build3 (BIT_FIELD_REF, idx_eltype,
5356                                                      induction_index,
5357                                                      bitsize_int (el_size),
5358                                                      bitsize_int (off)));
5359           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5360           val = make_ssa_name (data_eltype);
5361           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5362                                              build3 (BIT_FIELD_REF,
5363                                                      data_eltype,
5364                                                      new_phi_result,
5365                                                      bitsize_int (el_size),
5366                                                      bitsize_int (off)));
5367           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5368           if (off != 0)
5369             {
5370               tree new_idx_val = idx_val;
5371               tree new_val = val;
5372               if (off != v_size - el_size)
5373                 {
5374                   new_idx_val = make_ssa_name (idx_eltype);
5375                   epilog_stmt = gimple_build_assign (new_idx_val,
5376                                                      MAX_EXPR, idx_val,
5377                                                      old_idx_val);
5378                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5379                 }
5380               new_val = make_ssa_name (data_eltype);
5381               epilog_stmt = gimple_build_assign (new_val,
5382                                                  COND_EXPR,
5383                                                  build2 (GT_EXPR,
5384                                                          boolean_type_node,
5385                                                          idx_val,
5386                                                          old_idx_val),
5387                                                  val, old_val);
5388               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5389               idx_val = new_idx_val;
5390               val = new_val;
5391             }
5392         }
5393       /* Convert the reduced value back to the result type and set as the
5394          result.  */
5395       gimple_seq stmts = NULL;
5396       val = gimple_convert (&stmts, scalar_type, val);
5397       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5398       scalar_results.safe_push (val);
5399     }
5400
5401   /* 2.3 Create the reduction code, using one of the three schemes described
5402          above. In SLP we simply need to extract all the elements from the
5403          vector (without reducing them), so we use scalar shifts.  */
5404   else if (reduc_fn != IFN_LAST && !slp_reduc)
5405     {
5406       tree tmp;
5407       tree vec_elem_type;
5408
5409       /* Case 1:  Create:
5410          v_out2 = reduc_expr <v_out1>  */
5411
5412       if (dump_enabled_p ())
5413         dump_printf_loc (MSG_NOTE, vect_location,
5414                          "Reduce using direct vector reduction.\n");
5415
5416       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5417       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5418         {
5419           tree tmp_dest
5420             = vect_create_destination_var (scalar_dest, vec_elem_type);
5421           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5422                                                     new_phi_result);
5423           gimple_set_lhs (epilog_stmt, tmp_dest);
5424           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5425           gimple_set_lhs (epilog_stmt, new_temp);
5426           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5427
5428           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5429                                              new_temp);
5430         }
5431       else
5432         {
5433           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5434                                                     new_phi_result);
5435           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5436         }
5437
5438       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5439       gimple_set_lhs (epilog_stmt, new_temp);
5440       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5441
5442       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5443            == INTEGER_INDUC_COND_REDUCTION)
5444           && !operand_equal_p (initial_def, induc_val, 0))
5445         {
5446           /* Earlier we set the initial value to be a vector of induc_val
5447              values.  Check the result and if it is induc_val then replace
5448              with the original initial value, unless induc_val is
5449              the same as initial_def already.  */
5450           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5451                                   induc_val);
5452
5453           tmp = make_ssa_name (new_scalar_dest);
5454           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5455                                              initial_def, new_temp);
5456           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5457           new_temp = tmp;
5458         }
5459
5460       scalar_results.safe_push (new_temp);
5461     }
5462   else if (direct_slp_reduc)
5463     {
5464       /* Here we create one vector for each of the GROUP_SIZE results,
5465          with the elements for other SLP statements replaced with the
5466          neutral value.  We can then do a normal reduction on each vector.  */
5467
5468       /* Enforced by vectorizable_reduction.  */
5469       gcc_assert (new_phis.length () == 1);
5470       gcc_assert (pow2p_hwi (group_size));
5471
5472       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5473       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5474       gimple_seq seq = NULL;
5475
5476       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5477          and the same element size as VECTYPE.  */
5478       tree index = build_index_vector (vectype, 0, 1);
5479       tree index_type = TREE_TYPE (index);
5480       tree index_elt_type = TREE_TYPE (index_type);
5481       tree mask_type = build_same_sized_truth_vector_type (index_type);
5482
5483       /* Create a vector that, for each element, identifies which of
5484          the GROUP_SIZE results should use it.  */
5485       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5486       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5487                             build_vector_from_val (index_type, index_mask));
5488
5489       /* Get a neutral vector value.  This is simply a splat of the neutral
5490          scalar value if we have one, otherwise the initial scalar value
5491          is itself a neutral value.  */
5492       tree vector_identity = NULL_TREE;
5493       if (neutral_op)
5494         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5495                                                         neutral_op);
5496       for (unsigned int i = 0; i < group_size; ++i)
5497         {
5498           /* If there's no universal neutral value, we can use the
5499              initial scalar value from the original PHI.  This is used
5500              for MIN and MAX reduction, for example.  */
5501           if (!neutral_op)
5502             {
5503               tree scalar_value
5504                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5505                                          loop_preheader_edge (loop));
5506               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5507                                                               scalar_value);
5508             }
5509
5510           /* Calculate the equivalent of:
5511
5512              sel[j] = (index[j] == i);
5513
5514              which selects the elements of NEW_PHI_RESULT that should
5515              be included in the result.  */
5516           tree compare_val = build_int_cst (index_elt_type, i);
5517           compare_val = build_vector_from_val (index_type, compare_val);
5518           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5519                                    index, compare_val);
5520
5521           /* Calculate the equivalent of:
5522
5523              vec = sel ? new_phi_result : vector_identity;
5524
5525              VEC is now suitable for a full vector reduction.  */
5526           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5527                                    sel, new_phi_result, vector_identity);
5528
5529           /* Do the reduction and convert it to the appropriate type.  */
5530           gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5531           tree scalar = make_ssa_name (TREE_TYPE (vectype));
5532           gimple_call_set_lhs (call, scalar);
5533           gimple_seq_add_stmt (&seq, call);
5534           scalar = gimple_convert (&seq, scalar_type, scalar);
5535           scalar_results.safe_push (scalar);
5536         }
5537       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5538     }
5539   else
5540     {
5541       bool reduce_with_shift;
5542       tree vec_temp;
5543
5544       /* COND reductions all do the final reduction with MAX_EXPR
5545          or MIN_EXPR.  */
5546       if (code == COND_EXPR)
5547         {
5548           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5549               == INTEGER_INDUC_COND_REDUCTION)
5550             code = induc_code;
5551           else
5552             code = MAX_EXPR;
5553         }
5554
5555       /* See if the target wants to do the final (shift) reduction
5556          in a vector mode of smaller size and first reduce upper/lower
5557          halves against each other.  */
5558       enum machine_mode mode1 = mode;
5559       tree vectype1 = vectype;
5560       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5561       unsigned sz1 = sz;
5562       if (!slp_reduc
5563           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5564         sz1 = GET_MODE_SIZE (mode1).to_constant ();
5565
5566       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5567       reduce_with_shift = have_whole_vector_shift (mode1);
5568       if (!VECTOR_MODE_P (mode1))
5569         reduce_with_shift = false;
5570       else
5571         {
5572           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5573           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5574             reduce_with_shift = false;
5575         }
5576
5577       /* First reduce the vector to the desired vector size we should
5578          do shift reduction on by combining upper and lower halves.  */
5579       new_temp = new_phi_result;
5580       while (sz > sz1)
5581         {
5582           gcc_assert (!slp_reduc);
5583           sz /= 2;
5584           vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5585
5586           /* The target has to make sure we support lowpart/highpart
5587              extraction, either via direct vector extract or through
5588              an integer mode punning.  */
5589           tree dst1, dst2;
5590           if (convert_optab_handler (vec_extract_optab,
5591                                      TYPE_MODE (TREE_TYPE (new_temp)),
5592                                      TYPE_MODE (vectype1))
5593               != CODE_FOR_nothing)
5594             {
5595               /* Extract sub-vectors directly once vec_extract becomes
5596                  a conversion optab.  */
5597               dst1 = make_ssa_name (vectype1);
5598               epilog_stmt
5599                   = gimple_build_assign (dst1, BIT_FIELD_REF,
5600                                          build3 (BIT_FIELD_REF, vectype1,
5601                                                  new_temp, TYPE_SIZE (vectype1),
5602                                                  bitsize_int (0)));
5603               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5604               dst2 =  make_ssa_name (vectype1);
5605               epilog_stmt
5606                   = gimple_build_assign (dst2, BIT_FIELD_REF,
5607                                          build3 (BIT_FIELD_REF, vectype1,
5608                                                  new_temp, TYPE_SIZE (vectype1),
5609                                                  bitsize_int (sz * BITS_PER_UNIT)));
5610               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5611             }
5612           else
5613             {
5614               /* Extract via punning to appropriately sized integer mode
5615                  vector.  */
5616               tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5617                                                             1);
5618               tree etype = build_vector_type (eltype, 2);
5619               gcc_assert (convert_optab_handler (vec_extract_optab,
5620                                                  TYPE_MODE (etype),
5621                                                  TYPE_MODE (eltype))
5622                           != CODE_FOR_nothing);
5623               tree tem = make_ssa_name (etype);
5624               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5625                                                  build1 (VIEW_CONVERT_EXPR,
5626                                                          etype, new_temp));
5627               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5628               new_temp = tem;
5629               tem = make_ssa_name (eltype);
5630               epilog_stmt
5631                   = gimple_build_assign (tem, BIT_FIELD_REF,
5632                                          build3 (BIT_FIELD_REF, eltype,
5633                                                  new_temp, TYPE_SIZE (eltype),
5634                                                  bitsize_int (0)));
5635               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5636               dst1 = make_ssa_name (vectype1);
5637               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5638                                                  build1 (VIEW_CONVERT_EXPR,
5639                                                          vectype1, tem));
5640               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5641               tem = make_ssa_name (eltype);
5642               epilog_stmt
5643                   = gimple_build_assign (tem, BIT_FIELD_REF,
5644                                          build3 (BIT_FIELD_REF, eltype,
5645                                                  new_temp, TYPE_SIZE (eltype),
5646                                                  bitsize_int (sz * BITS_PER_UNIT)));
5647               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5648               dst2 =  make_ssa_name (vectype1);
5649               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5650                                                  build1 (VIEW_CONVERT_EXPR,
5651                                                          vectype1, tem));
5652               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5653             }
5654
5655           new_temp = make_ssa_name (vectype1);
5656           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5657           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5658         }
5659
5660       if (reduce_with_shift && !slp_reduc)
5661         {
5662           int element_bitsize = tree_to_uhwi (bitsize);
5663           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5664              for variable-length vectors and also requires direct target support
5665              for loop reductions.  */
5666           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5667           int nelements = vec_size_in_bits / element_bitsize;
5668           vec_perm_builder sel;
5669           vec_perm_indices indices;
5670
5671           int elt_offset;
5672
5673           tree zero_vec = build_zero_cst (vectype1);
5674           /* Case 2: Create:
5675              for (offset = nelements/2; offset >= 1; offset/=2)
5676                 {
5677                   Create:  va' = vec_shift <va, offset>
5678                   Create:  va = vop <va, va'>
5679                 }  */
5680
5681           tree rhs;
5682
5683           if (dump_enabled_p ())
5684             dump_printf_loc (MSG_NOTE, vect_location,
5685                              "Reduce using vector shifts\n");
5686
5687           mode1 = TYPE_MODE (vectype1);
5688           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5689           for (elt_offset = nelements / 2;
5690                elt_offset >= 1;
5691                elt_offset /= 2)
5692             {
5693               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5694               indices.new_vector (sel, 2, nelements);
5695               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5696               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5697                                                  new_temp, zero_vec, mask);
5698               new_name = make_ssa_name (vec_dest, epilog_stmt);
5699               gimple_assign_set_lhs (epilog_stmt, new_name);
5700               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5701
5702               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5703                                                  new_temp);
5704               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5705               gimple_assign_set_lhs (epilog_stmt, new_temp);
5706               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5707             }
5708
5709           /* 2.4  Extract the final scalar result.  Create:
5710              s_out3 = extract_field <v_out2, bitpos>  */
5711
5712           if (dump_enabled_p ())
5713             dump_printf_loc (MSG_NOTE, vect_location,
5714                              "extract scalar result\n");
5715
5716           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5717                         bitsize, bitsize_zero_node);
5718           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5719           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5720           gimple_assign_set_lhs (epilog_stmt, new_temp);
5721           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5722           scalar_results.safe_push (new_temp);
5723         }
5724       else
5725         {
5726           /* Case 3: Create:
5727              s = extract_field <v_out2, 0>
5728              for (offset = element_size;
5729                   offset < vector_size;
5730                   offset += element_size;)
5731                {
5732                  Create:  s' = extract_field <v_out2, offset>
5733                  Create:  s = op <s, s'>  // For non SLP cases
5734                }  */
5735
5736           if (dump_enabled_p ())
5737             dump_printf_loc (MSG_NOTE, vect_location,
5738                              "Reduce using scalar code.\n");
5739
5740           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5741           int element_bitsize = tree_to_uhwi (bitsize);
5742           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5743             {
5744               int bit_offset;
5745               if (gimple_code (new_phi) == GIMPLE_PHI)
5746                 vec_temp = PHI_RESULT (new_phi);
5747               else
5748                 vec_temp = gimple_assign_lhs (new_phi);
5749               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5750                                  bitsize_zero_node);
5751               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5752               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5753               gimple_assign_set_lhs (epilog_stmt, new_temp);
5754               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5755
5756               /* In SLP we don't need to apply reduction operation, so we just
5757                  collect s' values in SCALAR_RESULTS.  */
5758               if (slp_reduc)
5759                 scalar_results.safe_push (new_temp);
5760
5761               for (bit_offset = element_bitsize;
5762                    bit_offset < vec_size_in_bits;
5763                    bit_offset += element_bitsize)
5764                 {
5765                   tree bitpos = bitsize_int (bit_offset);
5766                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5767                                      bitsize, bitpos);
5768
5769                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5770                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5771                   gimple_assign_set_lhs (epilog_stmt, new_name);
5772                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5773
5774                   if (slp_reduc)
5775                     {
5776                       /* In SLP we don't need to apply reduction operation, so
5777                          we just collect s' values in SCALAR_RESULTS.  */
5778                       new_temp = new_name;
5779                       scalar_results.safe_push (new_name);
5780                     }
5781                   else
5782                     {
5783                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5784                                                          new_name, new_temp);
5785                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5786                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5787                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5788                     }
5789                 }
5790             }
5791
5792           /* The only case where we need to reduce scalar results in SLP, is
5793              unrolling.  If the size of SCALAR_RESULTS is greater than
5794              GROUP_SIZE, we reduce them combining elements modulo
5795              GROUP_SIZE.  */
5796           if (slp_reduc)
5797             {
5798               tree res, first_res, new_res;
5799               gimple *new_stmt;
5800
5801               /* Reduce multiple scalar results in case of SLP unrolling.  */
5802               for (j = group_size; scalar_results.iterate (j, &res);
5803                    j++)
5804                 {
5805                   first_res = scalar_results[j % group_size];
5806                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5807                                                   first_res, res);
5808                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5809                   gimple_assign_set_lhs (new_stmt, new_res);
5810                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5811                   scalar_results[j % group_size] = new_res;
5812                 }
5813             }
5814           else
5815             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5816             scalar_results.safe_push (new_temp);
5817         }
5818
5819       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5820            == INTEGER_INDUC_COND_REDUCTION)
5821           && !operand_equal_p (initial_def, induc_val, 0))
5822         {
5823           /* Earlier we set the initial value to be a vector of induc_val
5824              values.  Check the result and if it is induc_val then replace
5825              with the original initial value, unless induc_val is
5826              the same as initial_def already.  */
5827           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5828                                   induc_val);
5829
5830           tree tmp = make_ssa_name (new_scalar_dest);
5831           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5832                                              initial_def, new_temp);
5833           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5834           scalar_results[0] = tmp;
5835         }
5836     }
5837
5838 vect_finalize_reduction:
5839
5840   if (double_reduc)
5841     loop = loop->inner;
5842
5843   /* 2.5 Adjust the final result by the initial value of the reduction
5844          variable. (When such adjustment is not needed, then
5845          'adjustment_def' is zero).  For example, if code is PLUS we create:
5846          new_temp = loop_exit_def + adjustment_def  */
5847
5848   if (adjustment_def)
5849     {
5850       gcc_assert (!slp_reduc);
5851       if (nested_in_vect_loop)
5852         {
5853           new_phi = new_phis[0];
5854           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5855           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5856           new_dest = vect_create_destination_var (scalar_dest, vectype);
5857         }
5858       else
5859         {
5860           new_temp = scalar_results[0];
5861           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5862           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5863           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5864         }
5865
5866       epilog_stmt = gimple_build_assign (new_dest, expr);
5867       new_temp = make_ssa_name (new_dest, epilog_stmt);
5868       gimple_assign_set_lhs (epilog_stmt, new_temp);
5869       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5870       if (nested_in_vect_loop)
5871         {
5872           set_vinfo_for_stmt (epilog_stmt,
5873                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5874           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5875                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5876
5877           if (!double_reduc)
5878             scalar_results.quick_push (new_temp);
5879           else
5880             scalar_results[0] = new_temp;
5881         }
5882       else
5883         scalar_results[0] = new_temp;
5884
5885       new_phis[0] = epilog_stmt;
5886     }
5887
5888   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5889           phis with new adjusted scalar results, i.e., replace use <s_out0>
5890           with use <s_out4>.
5891
5892      Transform:
5893         loop_exit:
5894           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5895           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5896           v_out2 = reduce <v_out1>
5897           s_out3 = extract_field <v_out2, 0>
5898           s_out4 = adjust_result <s_out3>
5899           use <s_out0>
5900           use <s_out0>
5901
5902      into:
5903
5904         loop_exit:
5905           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5906           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5907           v_out2 = reduce <v_out1>
5908           s_out3 = extract_field <v_out2, 0>
5909           s_out4 = adjust_result <s_out3>
5910           use <s_out4>
5911           use <s_out4> */
5912
5913
5914   /* In SLP reduction chain we reduce vector results into one vector if
5915      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5916      the last stmt in the reduction chain, since we are looking for the loop
5917      exit phi node.  */
5918   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5919     {
5920       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5921       /* Handle reduction patterns.  */
5922       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5923         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5924
5925       scalar_dest = gimple_assign_lhs (dest_stmt);
5926       group_size = 1;
5927     }
5928
5929   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5930      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5931      need to match SCALAR_RESULTS with corresponding statements.  The first
5932      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5933      the first vector stmt, etc.
5934      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5935   if (group_size > new_phis.length ())
5936     {
5937       ratio = group_size / new_phis.length ();
5938       gcc_assert (!(group_size % new_phis.length ()));
5939     }
5940   else
5941     ratio = 1;
5942
5943   for (k = 0; k < group_size; k++)
5944     {
5945       if (k % ratio == 0)
5946         {
5947           epilog_stmt = new_phis[k / ratio];
5948           reduction_phi = reduction_phis[k / ratio];
5949           if (double_reduc)
5950             inner_phi = inner_phis[k / ratio];
5951         }
5952
5953       if (slp_reduc)
5954         {
5955           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5956
5957           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5958           /* SLP statements can't participate in patterns.  */
5959           gcc_assert (!orig_stmt);
5960           scalar_dest = gimple_assign_lhs (current_stmt);
5961         }
5962
5963       phis.create (3);
5964       /* Find the loop-closed-use at the loop exit of the original scalar
5965          result.  (The reduction result is expected to have two immediate uses -
5966          one at the latch block, and one at the loop exit).  */
5967       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5968         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5969             && !is_gimple_debug (USE_STMT (use_p)))
5970           phis.safe_push (USE_STMT (use_p));
5971
5972       /* While we expect to have found an exit_phi because of loop-closed-ssa
5973          form we can end up without one if the scalar cycle is dead.  */
5974
5975       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5976         {
5977           if (outer_loop)
5978             {
5979               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5980               gphi *vect_phi;
5981
5982               /* FORNOW. Currently not supporting the case that an inner-loop
5983                  reduction is not used in the outer-loop (but only outside the
5984                  outer-loop), unless it is double reduction.  */
5985               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5986                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5987                           || double_reduc);
5988
5989               if (double_reduc)
5990                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5991               else
5992                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5993               if (!double_reduc
5994                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5995                       != vect_double_reduction_def)
5996                 continue;
5997
5998               /* Handle double reduction:
5999
6000                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
6001                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
6002                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
6003                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
6004
6005                  At that point the regular reduction (stmt2 and stmt3) is
6006                  already vectorized, as well as the exit phi node, stmt4.
6007                  Here we vectorize the phi node of double reduction, stmt1, and
6008                  update all relevant statements.  */
6009
6010               /* Go through all the uses of s2 to find double reduction phi
6011                  node, i.e., stmt1 above.  */
6012               orig_name = PHI_RESULT (exit_phi);
6013               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6014                 {
6015                   stmt_vec_info use_stmt_vinfo;
6016                   stmt_vec_info new_phi_vinfo;
6017                   tree vect_phi_init, preheader_arg, vect_phi_res;
6018                   basic_block bb = gimple_bb (use_stmt);
6019                   gimple *use;
6020
6021                   /* Check that USE_STMT is really double reduction phi
6022                      node.  */
6023                   if (gimple_code (use_stmt) != GIMPLE_PHI
6024                       || gimple_phi_num_args (use_stmt) != 2
6025                       || bb->loop_father != outer_loop)
6026                     continue;
6027                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
6028                   if (!use_stmt_vinfo
6029                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
6030                           != vect_double_reduction_def)
6031                     continue;
6032
6033                   /* Create vector phi node for double reduction:
6034                      vs1 = phi <vs0, vs2>
6035                      vs1 was created previously in this function by a call to
6036                        vect_get_vec_def_for_operand and is stored in
6037                        vec_initial_def;
6038                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
6039                      vs0 is created here.  */
6040
6041                   /* Create vector phi node.  */
6042                   vect_phi = create_phi_node (vec_initial_def, bb);
6043                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
6044                                     loop_vec_info_for_loop (outer_loop));
6045                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
6046
6047                   /* Create vs0 - initial def of the double reduction phi.  */
6048                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
6049                                              loop_preheader_edge (outer_loop));
6050                   vect_phi_init = get_initial_def_for_reduction
6051                     (stmt, preheader_arg, NULL);
6052
6053                   /* Update phi node arguments with vs0 and vs2.  */
6054                   add_phi_arg (vect_phi, vect_phi_init,
6055                                loop_preheader_edge (outer_loop),
6056                                UNKNOWN_LOCATION);
6057                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
6058                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6059                   if (dump_enabled_p ())
6060                     {
6061                       dump_printf_loc (MSG_NOTE, vect_location,
6062                                        "created double reduction phi node: ");
6063                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6064                     }
6065
6066                   vect_phi_res = PHI_RESULT (vect_phi);
6067
6068                   /* Replace the use, i.e., set the correct vs1 in the regular
6069                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
6070                      loop is redundant.  */
6071                   use = reduction_phi;
6072                   for (j = 0; j < ncopies; j++)
6073                     {
6074                       edge pr_edge = loop_preheader_edge (loop);
6075                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6076                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6077                     }
6078                 }
6079             }
6080         }
6081
6082       phis.release ();
6083       if (nested_in_vect_loop)
6084         {
6085           if (double_reduc)
6086             loop = outer_loop;
6087           else
6088             continue;
6089         }
6090
6091       phis.create (3);
6092       /* Find the loop-closed-use at the loop exit of the original scalar
6093          result.  (The reduction result is expected to have two immediate uses,
6094          one at the latch block, and one at the loop exit).  For double
6095          reductions we are looking for exit phis of the outer loop.  */
6096       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6097         {
6098           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6099             {
6100               if (!is_gimple_debug (USE_STMT (use_p)))
6101                 phis.safe_push (USE_STMT (use_p));
6102             }
6103           else
6104             {
6105               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6106                 {
6107                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6108
6109                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6110                     {
6111                       if (!flow_bb_inside_loop_p (loop,
6112                                              gimple_bb (USE_STMT (phi_use_p)))
6113                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6114                         phis.safe_push (USE_STMT (phi_use_p));
6115                     }
6116                 }
6117             }
6118         }
6119
6120       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6121         {
6122           /* Replace the uses:  */
6123           orig_name = PHI_RESULT (exit_phi);
6124           scalar_result = scalar_results[k];
6125           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6126             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6127               SET_USE (use_p, scalar_result);
6128         }
6129
6130       phis.release ();
6131     }
6132 }
6133
6134 /* Return a vector of type VECTYPE that is equal to the vector select
6135    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6136    before GSI.  */
6137
6138 static tree
6139 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6140                      tree vec, tree identity)
6141 {
6142   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6143   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6144                                           mask, vec, identity);
6145   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6146   return cond;
6147 }
6148
6149 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6150    order, starting with LHS.  Insert the extraction statements before GSI and
6151    associate the new scalar SSA names with variable SCALAR_DEST.
6152    Return the SSA name for the result.  */
6153
6154 static tree
6155 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6156                        tree_code code, tree lhs, tree vector_rhs)
6157 {
6158   tree vectype = TREE_TYPE (vector_rhs);
6159   tree scalar_type = TREE_TYPE (vectype);
6160   tree bitsize = TYPE_SIZE (scalar_type);
6161   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6162   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6163
6164   for (unsigned HOST_WIDE_INT bit_offset = 0;
6165        bit_offset < vec_size_in_bits;
6166        bit_offset += element_bitsize)
6167     {
6168       tree bitpos = bitsize_int (bit_offset);
6169       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6170                          bitsize, bitpos);
6171
6172       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6173       rhs = make_ssa_name (scalar_dest, stmt);
6174       gimple_assign_set_lhs (stmt, rhs);
6175       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6176
6177       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6178       tree new_name = make_ssa_name (scalar_dest, stmt);
6179       gimple_assign_set_lhs (stmt, new_name);
6180       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6181       lhs = new_name;
6182     }
6183   return lhs;
6184 }
6185
6186 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
6187    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6188    statement.  CODE is the operation performed by STMT and OPS are
6189    its scalar operands.  REDUC_INDEX is the index of the operand in
6190    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6191    implements in-order reduction, or IFN_LAST if we should open-code it.
6192    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6193    that should be used to control the operation in a fully-masked loop.  */
6194
6195 static bool
6196 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6197                                gimple **vec_stmt, slp_tree slp_node,
6198                                gimple *reduc_def_stmt,
6199                                tree_code code, internal_fn reduc_fn,
6200                                tree ops[3], tree vectype_in,
6201                                int reduc_index, vec_loop_masks *masks)
6202 {
6203   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6204   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6205   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6206   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6207   gimple *new_stmt = NULL;
6208
6209   int ncopies;
6210   if (slp_node)
6211     ncopies = 1;
6212   else
6213     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6214
6215   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6216   gcc_assert (ncopies == 1);
6217   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6218   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6219   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6220               == FOLD_LEFT_REDUCTION);
6221
6222   if (slp_node)
6223     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6224                           TYPE_VECTOR_SUBPARTS (vectype_in)));
6225
6226   tree op0 = ops[1 - reduc_index];
6227
6228   int group_size = 1;
6229   gimple *scalar_dest_def;
6230   auto_vec<tree> vec_oprnds0;
6231   if (slp_node)
6232     {
6233       vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6234       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6235       scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6236     }
6237   else
6238     {
6239       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6240       vec_oprnds0.create (1);
6241       vec_oprnds0.quick_push (loop_vec_def0);
6242       scalar_dest_def = stmt;
6243     }
6244
6245   tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6246   tree scalar_type = TREE_TYPE (scalar_dest);
6247   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6248
6249   int vec_num = vec_oprnds0.length ();
6250   gcc_assert (vec_num == 1 || slp_node);
6251   tree vec_elem_type = TREE_TYPE (vectype_out);
6252   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6253
6254   tree vector_identity = NULL_TREE;
6255   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6256     vector_identity = build_zero_cst (vectype_out);
6257
6258   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6259   int i;
6260   tree def0;
6261   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6262     {
6263       tree mask = NULL_TREE;
6264       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6265         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6266
6267       /* Handle MINUS by adding the negative.  */
6268       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6269         {
6270           tree negated = make_ssa_name (vectype_out);
6271           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6272           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6273           def0 = negated;
6274         }
6275
6276       if (mask)
6277         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6278                                     vector_identity);
6279
6280       /* On the first iteration the input is simply the scalar phi
6281          result, and for subsequent iterations it is the output of
6282          the preceding operation.  */
6283       if (reduc_fn != IFN_LAST)
6284         {
6285           new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6286           /* For chained SLP reductions the output of the previous reduction
6287              operation serves as the input of the next. For the final statement
6288              the output cannot be a temporary - we reuse the original
6289              scalar destination of the last statement.  */
6290           if (i != vec_num - 1)
6291             {
6292               gimple_set_lhs (new_stmt, scalar_dest_var);
6293               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6294               gimple_set_lhs (new_stmt, reduc_var);
6295             }
6296         }
6297       else
6298         {
6299           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6300                                              reduc_var, def0);
6301           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6302           /* Remove the statement, so that we can use the same code paths
6303              as for statements that we've just created.  */
6304           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6305           gsi_remove (&tmp_gsi, false);
6306         }
6307
6308       if (i == vec_num - 1)
6309         {
6310           gimple_set_lhs (new_stmt, scalar_dest);
6311           vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6312         }
6313       else
6314         vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6315
6316       if (slp_node)
6317         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6318     }
6319
6320   if (!slp_node)
6321     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6322
6323   return true;
6324 }
6325
6326 /* Function is_nonwrapping_integer_induction.
6327
6328    Check if STMT (which is part of loop LOOP) both increments and
6329    does not cause overflow.  */
6330
6331 static bool
6332 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6333 {
6334   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6335   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6336   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6337   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6338   widest_int ni, max_loop_value, lhs_max;
6339   bool overflow = false;
6340
6341   /* Make sure the loop is integer based.  */
6342   if (TREE_CODE (base) != INTEGER_CST
6343       || TREE_CODE (step) != INTEGER_CST)
6344     return false;
6345
6346   /* Check that the max size of the loop will not wrap.  */
6347
6348   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6349     return true;
6350
6351   if (! max_stmt_executions (loop, &ni))
6352     return false;
6353
6354   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6355                             &overflow);
6356   if (overflow)
6357     return false;
6358
6359   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6360                             TYPE_SIGN (lhs_type), &overflow);
6361   if (overflow)
6362     return false;
6363
6364   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6365           <= TYPE_PRECISION (lhs_type));
6366 }
6367
6368 /* Function vectorizable_reduction.
6369
6370    Check if STMT performs a reduction operation that can be vectorized.
6371    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6372    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6373    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6374
6375    This function also handles reduction idioms (patterns) that have been
6376    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6377    of this form:
6378      X = pattern_expr (arg0, arg1, ..., X)
6379    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6380    sequence that had been detected and replaced by the pattern-stmt (STMT).
6381
6382    This function also handles reduction of condition expressions, for example:
6383      for (int i = 0; i < N; i++)
6384        if (a[i] < value)
6385          last = a[i];
6386    This is handled by vectorising the loop and creating an additional vector
6387    containing the loop indexes for which "a[i] < value" was true.  In the
6388    function epilogue this is reduced to a single max value and then used to
6389    index into the vector of results.
6390
6391    In some cases of reduction patterns, the type of the reduction variable X is
6392    different than the type of the other arguments of STMT.
6393    In such cases, the vectype that is used when transforming STMT into a vector
6394    stmt is different than the vectype that is used to determine the
6395    vectorization factor, because it consists of a different number of elements
6396    than the actual number of elements that are being operated upon in parallel.
6397
6398    For example, consider an accumulation of shorts into an int accumulator.
6399    On some targets it's possible to vectorize this pattern operating on 8
6400    shorts at a time (hence, the vectype for purposes of determining the
6401    vectorization factor should be V8HI); on the other hand, the vectype that
6402    is used to create the vector form is actually V4SI (the type of the result).
6403
6404    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6405    indicates what is the actual level of parallelism (V8HI in the example), so
6406    that the right vectorization factor would be derived.  This vectype
6407    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6408    be used to create the vectorized stmt.  The right vectype for the vectorized
6409    stmt is obtained from the type of the result X:
6410         get_vectype_for_scalar_type (TREE_TYPE (X))
6411
6412    This means that, contrary to "regular" reductions (or "regular" stmts in
6413    general), the following equation:
6414       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6415    does *NOT* necessarily hold for reduction patterns.  */
6416
6417 bool
6418 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6419                         gimple **vec_stmt, slp_tree slp_node,
6420                         slp_instance slp_node_instance)
6421 {
6422   tree vec_dest;
6423   tree scalar_dest;
6424   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6425   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6426   tree vectype_in = NULL_TREE;
6427   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6428   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6429   enum tree_code code, orig_code;
6430   internal_fn reduc_fn;
6431   machine_mode vec_mode;
6432   int op_type;
6433   optab optab;
6434   tree new_temp = NULL_TREE;
6435   gimple *def_stmt;
6436   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6437   gimple *cond_reduc_def_stmt = NULL;
6438   enum tree_code cond_reduc_op_code = ERROR_MARK;
6439   tree scalar_type;
6440   bool is_simple_use;
6441   gimple *orig_stmt;
6442   stmt_vec_info orig_stmt_info = NULL;
6443   int i;
6444   int ncopies;
6445   int epilog_copies;
6446   stmt_vec_info prev_stmt_info, prev_phi_info;
6447   bool single_defuse_cycle = false;
6448   gimple *new_stmt = NULL;
6449   int j;
6450   tree ops[3];
6451   enum vect_def_type dts[3];
6452   bool nested_cycle = false, found_nested_cycle_def = false;
6453   bool double_reduc = false;
6454   basic_block def_bb;
6455   struct loop * def_stmt_loop, *outer_loop = NULL;
6456   tree def_arg;
6457   gimple *def_arg_stmt;
6458   auto_vec<tree> vec_oprnds0;
6459   auto_vec<tree> vec_oprnds1;
6460   auto_vec<tree> vec_oprnds2;
6461   auto_vec<tree> vect_defs;
6462   auto_vec<gimple *> phis;
6463   int vec_num;
6464   tree def0, tem;
6465   bool first_p = true;
6466   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6467   tree cond_reduc_val = NULL_TREE;
6468
6469   /* Make sure it was already recognized as a reduction computation.  */
6470   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6471       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6472     return false;
6473
6474   if (nested_in_vect_loop_p (loop, stmt))
6475     {
6476       outer_loop = loop;
6477       loop = loop->inner;
6478       nested_cycle = true;
6479     }
6480
6481   /* In case of reduction chain we switch to the first stmt in the chain, but
6482      we don't update STMT_INFO, since only the last stmt is marked as reduction
6483      and has reduction properties.  */
6484   if (GROUP_FIRST_ELEMENT (stmt_info)
6485       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6486     {
6487       stmt = GROUP_FIRST_ELEMENT (stmt_info);
6488       first_p = false;
6489     }
6490
6491   if (gimple_code (stmt) == GIMPLE_PHI)
6492     {
6493       /* Analysis is fully done on the reduction stmt invocation.  */
6494       if (! vec_stmt)
6495         {
6496           if (slp_node)
6497             slp_node_instance->reduc_phis = slp_node;
6498
6499           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6500           return true;
6501         }
6502
6503       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6504         /* Leave the scalar phi in place.  Note that checking
6505            STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6506            for reductions involving a single statement.  */
6507         return true;
6508
6509       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6510       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6511         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6512
6513       if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6514           == EXTRACT_LAST_REDUCTION)
6515         /* Leave the scalar phi in place.  */
6516         return true;
6517
6518       gcc_assert (is_gimple_assign (reduc_stmt));
6519       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6520         {
6521           tree op = gimple_op (reduc_stmt, k);
6522           if (op == gimple_phi_result (stmt))
6523             continue;
6524           if (k == 1
6525               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6526             continue;
6527           if (!vectype_in
6528               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6529                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6530             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6531           break;
6532         }
6533       gcc_assert (vectype_in);
6534
6535       if (slp_node)
6536         ncopies = 1;
6537       else
6538         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6539
6540       use_operand_p use_p;
6541       gimple *use_stmt;
6542       if (ncopies > 1
6543           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6544               <= vect_used_only_live)
6545           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6546           && (use_stmt == reduc_stmt
6547               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6548                   == reduc_stmt)))
6549         single_defuse_cycle = true;
6550
6551       /* Create the destination vector  */
6552       scalar_dest = gimple_assign_lhs (reduc_stmt);
6553       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6554
6555       if (slp_node)
6556         /* The size vect_schedule_slp_instance computes is off for us.  */
6557         vec_num = vect_get_num_vectors
6558           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6559            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6560            vectype_in);
6561       else
6562         vec_num = 1;
6563
6564       /* Generate the reduction PHIs upfront.  */
6565       prev_phi_info = NULL;
6566       for (j = 0; j < ncopies; j++)
6567         {
6568           if (j == 0 || !single_defuse_cycle)
6569             {
6570               for (i = 0; i < vec_num; i++)
6571                 {
6572                   /* Create the reduction-phi that defines the reduction
6573                      operand.  */
6574                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
6575                   set_vinfo_for_stmt (new_phi,
6576                                       new_stmt_vec_info (new_phi, loop_vinfo));
6577
6578                   if (slp_node)
6579                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6580                   else
6581                     {
6582                       if (j == 0)
6583                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6584                       else
6585                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6586                       prev_phi_info = vinfo_for_stmt (new_phi);
6587                     }
6588                 }
6589             }
6590         }
6591
6592       return true;
6593     }
6594
6595   /* 1. Is vectorizable reduction?  */
6596   /* Not supportable if the reduction variable is used in the loop, unless
6597      it's a reduction chain.  */
6598   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6599       && !GROUP_FIRST_ELEMENT (stmt_info))
6600     return false;
6601
6602   /* Reductions that are not used even in an enclosing outer-loop,
6603      are expected to be "live" (used out of the loop).  */
6604   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6605       && !STMT_VINFO_LIVE_P (stmt_info))
6606     return false;
6607
6608   /* 2. Has this been recognized as a reduction pattern?
6609
6610      Check if STMT represents a pattern that has been recognized
6611      in earlier analysis stages.  For stmts that represent a pattern,
6612      the STMT_VINFO_RELATED_STMT field records the last stmt in
6613      the original sequence that constitutes the pattern.  */
6614
6615   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6616   if (orig_stmt)
6617     {
6618       orig_stmt_info = vinfo_for_stmt (orig_stmt);
6619       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6620       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6621     }
6622
6623   /* 3. Check the operands of the operation.  The first operands are defined
6624         inside the loop body. The last operand is the reduction variable,
6625         which is defined by the loop-header-phi.  */
6626
6627   gcc_assert (is_gimple_assign (stmt));
6628
6629   /* Flatten RHS.  */
6630   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6631     {
6632     case GIMPLE_BINARY_RHS:
6633       code = gimple_assign_rhs_code (stmt);
6634       op_type = TREE_CODE_LENGTH (code);
6635       gcc_assert (op_type == binary_op);
6636       ops[0] = gimple_assign_rhs1 (stmt);
6637       ops[1] = gimple_assign_rhs2 (stmt);
6638       break;
6639
6640     case GIMPLE_TERNARY_RHS:
6641       code = gimple_assign_rhs_code (stmt);
6642       op_type = TREE_CODE_LENGTH (code);
6643       gcc_assert (op_type == ternary_op);
6644       ops[0] = gimple_assign_rhs1 (stmt);
6645       ops[1] = gimple_assign_rhs2 (stmt);
6646       ops[2] = gimple_assign_rhs3 (stmt);
6647       break;
6648
6649     case GIMPLE_UNARY_RHS:
6650       return false;
6651
6652     default:
6653       gcc_unreachable ();
6654     }
6655
6656   if (code == COND_EXPR && slp_node)
6657     return false;
6658
6659   scalar_dest = gimple_assign_lhs (stmt);
6660   scalar_type = TREE_TYPE (scalar_dest);
6661   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6662       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6663     return false;
6664
6665   /* Do not try to vectorize bit-precision reductions.  */
6666   if (!type_has_mode_precision_p (scalar_type))
6667     return false;
6668
6669   /* All uses but the last are expected to be defined in the loop.
6670      The last use is the reduction variable.  In case of nested cycle this
6671      assumption is not true: we use reduc_index to record the index of the
6672      reduction variable.  */
6673   gimple *reduc_def_stmt = NULL;
6674   int reduc_index = -1;
6675   for (i = 0; i < op_type; i++)
6676     {
6677       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6678       if (i == 0 && code == COND_EXPR)
6679         continue;
6680
6681       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6682                                           &def_stmt, &dts[i], &tem);
6683       dt = dts[i];
6684       gcc_assert (is_simple_use);
6685       if (dt == vect_reduction_def)
6686         {
6687           reduc_def_stmt = def_stmt;
6688           reduc_index = i;
6689           continue;
6690         }
6691       else if (tem)
6692         {
6693           /* To properly compute ncopies we are interested in the widest
6694              input type in case we're looking at a widening accumulation.  */
6695           if (!vectype_in
6696               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6697                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6698             vectype_in = tem;
6699         }
6700
6701       if (dt != vect_internal_def
6702           && dt != vect_external_def
6703           && dt != vect_constant_def
6704           && dt != vect_induction_def
6705           && !(dt == vect_nested_cycle && nested_cycle))
6706         return false;
6707
6708       if (dt == vect_nested_cycle)
6709         {
6710           found_nested_cycle_def = true;
6711           reduc_def_stmt = def_stmt;
6712           reduc_index = i;
6713         }
6714
6715       if (i == 1 && code == COND_EXPR)
6716         {
6717           /* Record how value of COND_EXPR is defined.  */
6718           if (dt == vect_constant_def)
6719             {
6720               cond_reduc_dt = dt;
6721               cond_reduc_val = ops[i];
6722             }
6723           if (dt == vect_induction_def
6724               && def_stmt != NULL
6725               && is_nonwrapping_integer_induction (def_stmt, loop))
6726             {
6727               cond_reduc_dt = dt;
6728               cond_reduc_def_stmt = def_stmt;
6729             }
6730         }
6731     }
6732
6733   if (!vectype_in)
6734     vectype_in = vectype_out;
6735
6736   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6737      directly used in stmt.  */
6738   if (reduc_index == -1)
6739     {
6740       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6741         {
6742           if (dump_enabled_p ())
6743             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6744                              "in-order reduction chain without SLP.\n");
6745           return false;
6746         }
6747
6748       if (orig_stmt)
6749         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6750       else
6751         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6752     }
6753
6754   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6755     return false;
6756
6757   if (!(reduc_index == -1
6758         || dts[reduc_index] == vect_reduction_def
6759         || dts[reduc_index] == vect_nested_cycle
6760         || ((dts[reduc_index] == vect_internal_def
6761              || dts[reduc_index] == vect_external_def
6762              || dts[reduc_index] == vect_constant_def
6763              || dts[reduc_index] == vect_induction_def)
6764             && nested_cycle && found_nested_cycle_def)))
6765     {
6766       /* For pattern recognized stmts, orig_stmt might be a reduction,
6767          but some helper statements for the pattern might not, or
6768          might be COND_EXPRs with reduction uses in the condition.  */
6769       gcc_assert (orig_stmt);
6770       return false;
6771     }
6772
6773   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6774   enum vect_reduction_type v_reduc_type
6775     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6776   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6777
6778   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6779   /* If we have a condition reduction, see if we can simplify it further.  */
6780   if (v_reduc_type == COND_REDUCTION)
6781     {
6782       /* Loop peeling modifies initial value of reduction PHI, which
6783          makes the reduction stmt to be transformed different to the
6784          original stmt analyzed.  We need to record reduction code for
6785          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6786          it can be used directly at transform stage.  */
6787       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6788           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6789         {
6790           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6791           gcc_assert (cond_reduc_dt == vect_constant_def);
6792           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6793         }
6794       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6795                                                vectype_in, OPTIMIZE_FOR_SPEED))
6796         {
6797           if (dump_enabled_p ())
6798             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6799                              "optimizing condition reduction with"
6800                              " FOLD_EXTRACT_LAST.\n");
6801           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6802         }
6803       else if (cond_reduc_dt == vect_induction_def)
6804         {
6805           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6806           tree base
6807             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6808           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6809
6810           gcc_assert (TREE_CODE (base) == INTEGER_CST
6811                       && TREE_CODE (step) == INTEGER_CST);
6812           cond_reduc_val = NULL_TREE;
6813           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6814              above base; punt if base is the minimum value of the type for
6815              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6816           if (tree_int_cst_sgn (step) == -1)
6817             {
6818               cond_reduc_op_code = MIN_EXPR;
6819               if (tree_int_cst_sgn (base) == -1)
6820                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6821               else if (tree_int_cst_lt (base,
6822                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6823                 cond_reduc_val
6824                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6825             }
6826           else
6827             {
6828               cond_reduc_op_code = MAX_EXPR;
6829               if (tree_int_cst_sgn (base) == 1)
6830                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6831               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6832                                         base))
6833                 cond_reduc_val
6834                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6835             }
6836           if (cond_reduc_val)
6837             {
6838               if (dump_enabled_p ())
6839                 dump_printf_loc (MSG_NOTE, vect_location,
6840                                  "condition expression based on "
6841                                  "integer induction.\n");
6842               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6843                 = INTEGER_INDUC_COND_REDUCTION;
6844             }
6845         }
6846       else if (cond_reduc_dt == vect_constant_def)
6847         {
6848           enum vect_def_type cond_initial_dt;
6849           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6850           tree cond_initial_val
6851             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6852
6853           gcc_assert (cond_reduc_val != NULL_TREE);
6854           vect_is_simple_use (cond_initial_val, loop_vinfo,
6855                               &def_stmt, &cond_initial_dt);
6856           if (cond_initial_dt == vect_constant_def
6857               && types_compatible_p (TREE_TYPE (cond_initial_val),
6858                                      TREE_TYPE (cond_reduc_val)))
6859             {
6860               tree e = fold_binary (LE_EXPR, boolean_type_node,
6861                                     cond_initial_val, cond_reduc_val);
6862               if (e && (integer_onep (e) || integer_zerop (e)))
6863                 {
6864                   if (dump_enabled_p ())
6865                     dump_printf_loc (MSG_NOTE, vect_location,
6866                                      "condition expression based on "
6867                                      "compile time constant.\n");
6868                   /* Record reduction code at analysis stage.  */
6869                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6870                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6871                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6872                     = CONST_COND_REDUCTION;
6873                 }
6874             }
6875         }
6876     }
6877
6878   if (orig_stmt)
6879     gcc_assert (tmp == orig_stmt
6880                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6881   else
6882     /* We changed STMT to be the first stmt in reduction chain, hence we
6883        check that in this case the first element in the chain is STMT.  */
6884     gcc_assert (stmt == tmp
6885                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6886
6887   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6888     return false;
6889
6890   if (slp_node)
6891     ncopies = 1;
6892   else
6893     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6894
6895   gcc_assert (ncopies >= 1);
6896
6897   vec_mode = TYPE_MODE (vectype_in);
6898   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6899
6900   if (code == COND_EXPR)
6901     {
6902       /* Only call during the analysis stage, otherwise we'll lose
6903          STMT_VINFO_TYPE.  */
6904       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6905                                                 ops[reduc_index], 0, NULL))
6906         {
6907           if (dump_enabled_p ())
6908             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6909                              "unsupported condition in reduction\n");
6910           return false;
6911         }
6912     }
6913   else
6914     {
6915       /* 4. Supportable by target?  */
6916
6917       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6918           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6919         {
6920           /* Shifts and rotates are only supported by vectorizable_shifts,
6921              not vectorizable_reduction.  */
6922           if (dump_enabled_p ())
6923             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6924                              "unsupported shift or rotation.\n");
6925           return false;
6926         }
6927
6928       /* 4.1. check support for the operation in the loop  */
6929       optab = optab_for_tree_code (code, vectype_in, optab_default);
6930       if (!optab)
6931         {
6932           if (dump_enabled_p ())
6933             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6934                              "no optab.\n");
6935
6936           return false;
6937         }
6938
6939       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6940         {
6941           if (dump_enabled_p ())
6942             dump_printf (MSG_NOTE, "op not supported by target.\n");
6943
6944           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6945               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6946             return false;
6947
6948           if (dump_enabled_p ())
6949             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6950         }
6951
6952       /* Worthwhile without SIMD support?  */
6953       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6954           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6955         {
6956           if (dump_enabled_p ())
6957             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6958                              "not worthwhile without SIMD support.\n");
6959
6960           return false;
6961         }
6962     }
6963
6964   /* 4.2. Check support for the epilog operation.
6965
6966           If STMT represents a reduction pattern, then the type of the
6967           reduction variable may be different than the type of the rest
6968           of the arguments.  For example, consider the case of accumulation
6969           of shorts into an int accumulator; The original code:
6970                         S1: int_a = (int) short_a;
6971           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6972
6973           was replaced with:
6974                         STMT: int_acc = widen_sum <short_a, int_acc>
6975
6976           This means that:
6977           1. The tree-code that is used to create the vector operation in the
6978              epilog code (that reduces the partial results) is not the
6979              tree-code of STMT, but is rather the tree-code of the original
6980              stmt from the pattern that STMT is replacing.  I.e, in the example
6981              above we want to use 'widen_sum' in the loop, but 'plus' in the
6982              epilog.
6983           2. The type (mode) we use to check available target support
6984              for the vector operation to be created in the *epilog*, is
6985              determined by the type of the reduction variable (in the example
6986              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6987              However the type (mode) we use to check available target support
6988              for the vector operation to be created *inside the loop*, is
6989              determined by the type of the other arguments to STMT (in the
6990              example we'd check this: optab_handler (widen_sum_optab,
6991              vect_short_mode)).
6992
6993           This is contrary to "regular" reductions, in which the types of all
6994           the arguments are the same as the type of the reduction variable.
6995           For "regular" reductions we can therefore use the same vector type
6996           (and also the same tree-code) when generating the epilog code and
6997           when generating the code inside the loop.  */
6998
6999   vect_reduction_type reduction_type
7000     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
7001   if (orig_stmt
7002       && (reduction_type == TREE_CODE_REDUCTION
7003           || reduction_type == FOLD_LEFT_REDUCTION))
7004     {
7005       /* This is a reduction pattern: get the vectype from the type of the
7006          reduction variable, and get the tree-code from orig_stmt.  */
7007       orig_code = gimple_assign_rhs_code (orig_stmt);
7008       gcc_assert (vectype_out);
7009       vec_mode = TYPE_MODE (vectype_out);
7010     }
7011   else
7012     {
7013       /* Regular reduction: the same vectype and tree-code as used for
7014          the vector code inside the loop can be used for the epilog code. */
7015       orig_code = code;
7016
7017       if (code == MINUS_EXPR)
7018         orig_code = PLUS_EXPR;
7019
7020       /* For simple condition reductions, replace with the actual expression
7021          we want to base our reduction around.  */
7022       if (reduction_type == CONST_COND_REDUCTION)
7023         {
7024           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
7025           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
7026         }
7027       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
7028         orig_code = cond_reduc_op_code;
7029     }
7030
7031   if (nested_cycle)
7032     {
7033       def_bb = gimple_bb (reduc_def_stmt);
7034       def_stmt_loop = def_bb->loop_father;
7035       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
7036                                        loop_preheader_edge (def_stmt_loop));
7037       if (TREE_CODE (def_arg) == SSA_NAME
7038           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
7039           && gimple_code (def_arg_stmt) == GIMPLE_PHI
7040           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
7041           && vinfo_for_stmt (def_arg_stmt)
7042           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
7043               == vect_double_reduction_def)
7044         double_reduc = true;
7045     }
7046
7047   reduc_fn = IFN_LAST;
7048
7049   if (reduction_type == TREE_CODE_REDUCTION
7050       || reduction_type == FOLD_LEFT_REDUCTION
7051       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7052       || reduction_type == CONST_COND_REDUCTION)
7053     {
7054       if (reduction_type == FOLD_LEFT_REDUCTION
7055           ? fold_left_reduction_fn (orig_code, &reduc_fn)
7056           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7057         {
7058           if (reduc_fn != IFN_LAST
7059               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7060                                                   OPTIMIZE_FOR_SPEED))
7061             {
7062               if (dump_enabled_p ())
7063                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7064                                  "reduc op not supported by target.\n");
7065
7066               reduc_fn = IFN_LAST;
7067             }
7068         }
7069       else
7070         {
7071           if (!nested_cycle || double_reduc)
7072             {
7073               if (dump_enabled_p ())
7074                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7075                                  "no reduc code for scalar code.\n");
7076
7077               return false;
7078             }
7079         }
7080     }
7081   else if (reduction_type == COND_REDUCTION)
7082     {
7083       int scalar_precision
7084         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7085       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7086       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7087                                                 nunits_out);
7088
7089       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7090                                           OPTIMIZE_FOR_SPEED))
7091         reduc_fn = IFN_REDUC_MAX;
7092     }
7093
7094   if (reduction_type != EXTRACT_LAST_REDUCTION
7095       && reduc_fn == IFN_LAST
7096       && !nunits_out.is_constant ())
7097     {
7098       if (dump_enabled_p ())
7099         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7100                          "missing target support for reduction on"
7101                          " variable-length vectors.\n");
7102       return false;
7103     }
7104
7105   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7106       && ncopies > 1)
7107     {
7108       if (dump_enabled_p ())
7109         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7110                          "multiple types in double reduction or condition "
7111                          "reduction.\n");
7112       return false;
7113     }
7114
7115   /* For SLP reductions, see if there is a neutral value we can use.  */
7116   tree neutral_op = NULL_TREE;
7117   if (slp_node)
7118     neutral_op
7119       = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7120                                       GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7121
7122   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7123     {
7124       /* We can't support in-order reductions of code such as this:
7125
7126            for (int i = 0; i < n1; ++i)
7127              for (int j = 0; j < n2; ++j)
7128                l += a[j];
7129
7130          since GCC effectively transforms the loop when vectorizing:
7131
7132            for (int i = 0; i < n1 / VF; ++i)
7133              for (int j = 0; j < n2; ++j)
7134                for (int k = 0; k < VF; ++k)
7135                  l += a[j];
7136
7137          which is a reassociation of the original operation.  */
7138       if (dump_enabled_p ())
7139         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7140                          "in-order double reduction not supported.\n");
7141
7142       return false;
7143     }
7144
7145   if (reduction_type == FOLD_LEFT_REDUCTION
7146       && slp_node
7147       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7148     {
7149       /* We cannot use in-order reductions in this case because there is
7150          an implicit reassociation of the operations involved.  */
7151       if (dump_enabled_p ())
7152         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7153                          "in-order unchained SLP reductions not supported.\n");
7154       return false;
7155     }
7156
7157   /* For double reductions, and for SLP reductions with a neutral value,
7158      we construct a variable-length initial vector by loading a vector
7159      full of the neutral value and then shift-and-inserting the start
7160      values into the low-numbered elements.  */
7161   if ((double_reduc || neutral_op)
7162       && !nunits_out.is_constant ()
7163       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7164                                           vectype_out, OPTIMIZE_FOR_SPEED))
7165     {
7166       if (dump_enabled_p ())
7167         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7168                          "reduction on variable-length vectors requires"
7169                          " target support for a vector-shift-and-insert"
7170                          " operation.\n");
7171       return false;
7172     }
7173
7174   /* Check extra constraints for variable-length unchained SLP reductions.  */
7175   if (STMT_SLP_TYPE (stmt_info)
7176       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7177       && !nunits_out.is_constant ())
7178     {
7179       /* We checked above that we could build the initial vector when
7180          there's a neutral element value.  Check here for the case in
7181          which each SLP statement has its own initial value and in which
7182          that value needs to be repeated for every instance of the
7183          statement within the initial vector.  */
7184       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7185       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7186       if (!neutral_op
7187           && !can_duplicate_and_interleave_p (group_size, elt_mode))
7188         {
7189           if (dump_enabled_p ())
7190             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7191                              "unsupported form of SLP reduction for"
7192                              " variable-length vectors: cannot build"
7193                              " initial vector.\n");
7194           return false;
7195         }
7196       /* The epilogue code relies on the number of elements being a multiple
7197          of the group size.  The duplicate-and-interleave approach to setting
7198          up the initial vector does too.  */
7199       if (!multiple_p (nunits_out, group_size))
7200         {
7201           if (dump_enabled_p ())
7202             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7203                              "unsupported form of SLP reduction for"
7204                              " variable-length vectors: the vector size"
7205                              " is not a multiple of the number of results.\n");
7206           return false;
7207         }
7208     }
7209
7210   /* In case of widening multiplication by a constant, we update the type
7211      of the constant to be the type of the other operand.  We check that the
7212      constant fits the type in the pattern recognition pass.  */
7213   if (code == DOT_PROD_EXPR
7214       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7215     {
7216       if (TREE_CODE (ops[0]) == INTEGER_CST)
7217         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7218       else if (TREE_CODE (ops[1]) == INTEGER_CST)
7219         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7220       else
7221         {
7222           if (dump_enabled_p ())
7223             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7224                              "invalid types in dot-prod\n");
7225
7226           return false;
7227         }
7228     }
7229
7230   if (reduction_type == COND_REDUCTION)
7231     {
7232       widest_int ni;
7233
7234       if (! max_loop_iterations (loop, &ni))
7235         {
7236           if (dump_enabled_p ())
7237             dump_printf_loc (MSG_NOTE, vect_location,
7238                              "loop count not known, cannot create cond "
7239                              "reduction.\n");
7240           return false;
7241         }
7242       /* Convert backedges to iterations.  */
7243       ni += 1;
7244
7245       /* The additional index will be the same type as the condition.  Check
7246          that the loop can fit into this less one (because we'll use up the
7247          zero slot for when there are no matches).  */
7248       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7249       if (wi::geu_p (ni, wi::to_widest (max_index)))
7250         {
7251           if (dump_enabled_p ())
7252             dump_printf_loc (MSG_NOTE, vect_location,
7253                              "loop size is greater than data size.\n");
7254           return false;
7255         }
7256     }
7257
7258   /* In case the vectorization factor (VF) is bigger than the number
7259      of elements that we can fit in a vectype (nunits), we have to generate
7260      more than one vector stmt - i.e - we need to "unroll" the
7261      vector stmt by a factor VF/nunits.  For more details see documentation
7262      in vectorizable_operation.  */
7263
7264   /* If the reduction is used in an outer loop we need to generate
7265      VF intermediate results, like so (e.g. for ncopies=2):
7266         r0 = phi (init, r0)
7267         r1 = phi (init, r1)
7268         r0 = x0 + r0;
7269         r1 = x1 + r1;
7270     (i.e. we generate VF results in 2 registers).
7271     In this case we have a separate def-use cycle for each copy, and therefore
7272     for each copy we get the vector def for the reduction variable from the
7273     respective phi node created for this copy.
7274
7275     Otherwise (the reduction is unused in the loop nest), we can combine
7276     together intermediate results, like so (e.g. for ncopies=2):
7277         r = phi (init, r)
7278         r = x0 + r;
7279         r = x1 + r;
7280    (i.e. we generate VF/2 results in a single register).
7281    In this case for each copy we get the vector def for the reduction variable
7282    from the vectorized reduction operation generated in the previous iteration.
7283
7284    This only works when we see both the reduction PHI and its only consumer
7285    in vectorizable_reduction and there are no intermediate stmts
7286    participating.  */
7287   use_operand_p use_p;
7288   gimple *use_stmt;
7289   if (ncopies > 1
7290       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7291       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7292       && (use_stmt == stmt
7293           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7294     {
7295       single_defuse_cycle = true;
7296       epilog_copies = 1;
7297     }
7298   else
7299     epilog_copies = ncopies;
7300
7301   /* If the reduction stmt is one of the patterns that have lane
7302      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7303   if ((ncopies > 1
7304        && ! single_defuse_cycle)
7305       && (code == DOT_PROD_EXPR
7306           || code == WIDEN_SUM_EXPR
7307           || code == SAD_EXPR))
7308     {
7309       if (dump_enabled_p ())
7310         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7311                          "multi def-use cycle not possible for lane-reducing "
7312                          "reduction operation\n");
7313       return false;
7314     }
7315
7316   if (slp_node)
7317     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7318   else
7319     vec_num = 1;
7320
7321   internal_fn cond_fn = get_conditional_internal_fn (code);
7322   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7323
7324   if (!vec_stmt) /* transformation not required.  */
7325     {
7326       if (first_p)
7327         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7328       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7329         {
7330           if (reduction_type != FOLD_LEFT_REDUCTION
7331               && (cond_fn == IFN_LAST
7332                   || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7333                                                       OPTIMIZE_FOR_SPEED)))
7334             {
7335               if (dump_enabled_p ())
7336                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7337                                  "can't use a fully-masked loop because no"
7338                                  " conditional operation is available.\n");
7339               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7340             }
7341           else if (reduc_index == -1)
7342             {
7343               if (dump_enabled_p ())
7344                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7345                                  "can't use a fully-masked loop for chained"
7346                                  " reductions.\n");
7347               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7348             }
7349           else
7350             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7351                                    vectype_in);
7352         }
7353       if (dump_enabled_p ()
7354           && reduction_type == FOLD_LEFT_REDUCTION)
7355         dump_printf_loc (MSG_NOTE, vect_location,
7356                          "using an in-order (fold-left) reduction.\n");
7357       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7358       return true;
7359     }
7360
7361   /* Transform.  */
7362
7363   if (dump_enabled_p ())
7364     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7365
7366   /* FORNOW: Multiple types are not supported for condition.  */
7367   if (code == COND_EXPR)
7368     gcc_assert (ncopies == 1);
7369
7370   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7371
7372   if (reduction_type == FOLD_LEFT_REDUCTION)
7373     return vectorize_fold_left_reduction
7374       (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7375        reduc_fn, ops, vectype_in, reduc_index, masks);
7376
7377   if (reduction_type == EXTRACT_LAST_REDUCTION)
7378     {
7379       gcc_assert (!slp_node);
7380       return vectorizable_condition (stmt, gsi, vec_stmt,
7381                                      NULL, reduc_index, NULL);
7382     }
7383
7384   /* Create the destination vector  */
7385   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7386
7387   prev_stmt_info = NULL;
7388   prev_phi_info = NULL;
7389   if (!slp_node)
7390     {
7391       vec_oprnds0.create (1);
7392       vec_oprnds1.create (1);
7393       if (op_type == ternary_op)
7394         vec_oprnds2.create (1);
7395     }
7396
7397   phis.create (vec_num);
7398   vect_defs.create (vec_num);
7399   if (!slp_node)
7400     vect_defs.quick_push (NULL_TREE);
7401
7402   if (slp_node)
7403     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7404   else
7405     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7406
7407   for (j = 0; j < ncopies; j++)
7408     {
7409       if (code == COND_EXPR)
7410         {
7411           gcc_assert (!slp_node);
7412           vectorizable_condition (stmt, gsi, vec_stmt,
7413                                   PHI_RESULT (phis[0]),
7414                                   reduc_index, NULL);
7415           /* Multiple types are not supported for condition.  */
7416           break;
7417         }
7418
7419       /* Handle uses.  */
7420       if (j == 0)
7421         {
7422           if (slp_node)
7423             {
7424               /* Get vec defs for all the operands except the reduction index,
7425                  ensuring the ordering of the ops in the vector is kept.  */
7426               auto_vec<tree, 3> slp_ops;
7427               auto_vec<vec<tree>, 3> vec_defs;
7428
7429               slp_ops.quick_push (ops[0]);
7430               slp_ops.quick_push (ops[1]);
7431               if (op_type == ternary_op)
7432                 slp_ops.quick_push (ops[2]);
7433
7434               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7435
7436               vec_oprnds0.safe_splice (vec_defs[0]);
7437               vec_defs[0].release ();
7438               vec_oprnds1.safe_splice (vec_defs[1]);
7439               vec_defs[1].release ();
7440               if (op_type == ternary_op)
7441                 {
7442                   vec_oprnds2.safe_splice (vec_defs[2]);
7443                   vec_defs[2].release ();
7444                 }
7445             }
7446           else
7447             {
7448               vec_oprnds0.quick_push
7449                 (vect_get_vec_def_for_operand (ops[0], stmt));
7450               vec_oprnds1.quick_push
7451                 (vect_get_vec_def_for_operand (ops[1], stmt));
7452               if (op_type == ternary_op)
7453                 vec_oprnds2.quick_push
7454                   (vect_get_vec_def_for_operand (ops[2], stmt));
7455             }
7456         }
7457       else
7458         {
7459           if (!slp_node)
7460             {
7461               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7462
7463               if (single_defuse_cycle && reduc_index == 0)
7464                 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7465               else
7466                 vec_oprnds0[0]
7467                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7468               if (single_defuse_cycle && reduc_index == 1)
7469                 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7470               else
7471                 vec_oprnds1[0]
7472                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7473               if (op_type == ternary_op)
7474                 {
7475                   if (single_defuse_cycle && reduc_index == 2)
7476                     vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7477                   else
7478                     vec_oprnds2[0]
7479                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7480                 }
7481             }
7482         }
7483
7484       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7485         {
7486           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7487           if (masked_loop_p)
7488             {
7489               /* Make sure that the reduction accumulator is vop[0].  */
7490               if (reduc_index == 1)
7491                 {
7492                   gcc_assert (commutative_tree_code (code));
7493                   std::swap (vop[0], vop[1]);
7494                 }
7495               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7496                                               vectype_in, i * ncopies + j);
7497               gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7498                                                         vop[0], vop[1]);
7499               new_temp = make_ssa_name (vec_dest, call);
7500               gimple_call_set_lhs (call, new_temp);
7501               gimple_call_set_nothrow (call, true);
7502               new_stmt = call;
7503             }
7504           else
7505             {
7506               if (op_type == ternary_op)
7507                 vop[2] = vec_oprnds2[i];
7508
7509               new_temp = make_ssa_name (vec_dest, new_stmt);
7510               new_stmt = gimple_build_assign (new_temp, code,
7511                                               vop[0], vop[1], vop[2]);
7512             }
7513           vect_finish_stmt_generation (stmt, new_stmt, gsi);
7514
7515           if (slp_node)
7516             {
7517               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7518               vect_defs.quick_push (new_temp);
7519             }
7520           else
7521             vect_defs[0] = new_temp;
7522         }
7523
7524       if (slp_node)
7525         continue;
7526
7527       if (j == 0)
7528         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7529       else
7530         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7531
7532       prev_stmt_info = vinfo_for_stmt (new_stmt);
7533     }
7534
7535   /* Finalize the reduction-phi (set its arguments) and create the
7536      epilog reduction code.  */
7537   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7538     vect_defs[0] = gimple_get_lhs (*vec_stmt);
7539
7540   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7541                                     epilog_copies, reduc_fn, phis,
7542                                     double_reduc, slp_node, slp_node_instance,
7543                                     cond_reduc_val, cond_reduc_op_code,
7544                                     neutral_op);
7545
7546   return true;
7547 }
7548
7549 /* Function vect_min_worthwhile_factor.
7550
7551    Given CODE, the operation we could vectorize in a loop, return the
7552    smallest vectorization factor from which using generic vectors for
7553    it is worthwhile: 4 for the additive codes, 2 for the bitwise codes
7554    and INT_MAX for everything else.  */
7555 static unsigned int
7556 vect_min_worthwhile_factor (enum tree_code code)
7557 {
7558   /* Addition, subtraction and negation.  */
7559   if (code == PLUS_EXPR
7560       || code == MINUS_EXPR
7561       || code == NEGATE_EXPR)
7562     return 4;
7563
7564   /* Bitwise AND, inclusive OR, exclusive OR and complement.  */
7565   if (code == BIT_AND_EXPR
7566       || code == BIT_IOR_EXPR
7567       || code == BIT_XOR_EXPR
7568       || code == BIT_NOT_EXPR)
7569     return 2;
7570
7571   /* Every other code: report the largest possible factor.  */
7572   return INT_MAX;
7573 }
7574
7575 /* Return true if VINFO describes a loop being vectorized whose
7576    vectorization factor is a constant that is at least the minimum
7577    worthwhile factor for CODE operations, and false otherwise.  */
7578 bool
7579 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7580 {
7581   unsigned HOST_WIDE_INT const_vf;
7582   loop_vec_info loop_info = dyn_cast <loop_vec_info> (vinfo);
7583   if (!loop_info
7584       || !LOOP_VINFO_VECT_FACTOR (loop_info).is_constant (&const_vf))
7585     return false;
7586   return const_vf >= vect_min_worthwhile_factor (code);
7587 }
7588
7589 /* Function vectorizable_induction
7590
7591    Check if PHI performs an induction computation that can be vectorized.
7592    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7593    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7594    Return FALSE if not a vectorizable STMT, TRUE otherwise.

        GSI is unused.

        PHI is rejected (FALSE) unless it is a GIMPLE_PHI that is relevant
        and was classified as vect_induction_def.  It is also rejected when
        it belongs to the inner loop of an outer-loop vectorization and
        either more than one copy would be needed, the statement has an SLP
        type, or its latch value has a use outside the inner loop whose
        statement is live or not relevant; and when SLP_NODE is given but
        the number of vector elements is not constant.

        When VEC_STMT is null only the analysis is done: the statement type
        and the induction cost are recorded.  Otherwise the vector IVs are
        generated.  With SLP_NODE they are pushed onto the node's vector
        statements and *VEC_STMT is left untouched; without it *VEC_STMT
        and STMT_VINFO_VEC_STMT are set to the new vector phi.  */
7595
7596 bool
7597 vectorizable_induction (gimple *phi,
7598                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7599                         gimple **vec_stmt, slp_tree slp_node)
7600 {
7601   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7602   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7603   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7604   unsigned ncopies;
7605   bool nested_in_vect_loop = false;
7606   struct loop *iv_loop;
7607   tree vec_def;
7608   edge pe = loop_preheader_edge (loop);
7609   basic_block new_bb;
7610   tree new_vec, vec_init, vec_step, t;
7611   tree new_name;
7612   gimple *new_stmt;
7613   gphi *induction_phi;
7614   tree induc_def, vec_dest;
7615   tree init_expr, step_expr;
7616   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7617   unsigned i;
7618   tree expr;
7619   gimple_seq stmts;
7620   imm_use_iterator imm_iter;
7621   use_operand_p use_p;
7622   gimple *exit_phi;
7623   edge latch_e;
7624   tree loop_arg;
7625   gimple_stmt_iterator si;
7626   basic_block bb = gimple_bb (phi);
7627
7628   if (gimple_code (phi) != GIMPLE_PHI)
7629     return false;
7630
7631   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7632     return false;
7633
7634   /* Make sure it was recognized as induction computation.  */
7635   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7636     return false;
7637
7638   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7639   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7640
       /* SLP nodes are handled as a single copy here; otherwise ask how many
          vector copies VECTYPE needs for this loop.  */
7641   if (slp_node)
7642     ncopies = 1;
7643   else
7644     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7645   gcc_assert (ncopies >= 1);
7646
7647   /* FORNOW. These restrictions should be relaxed.  */
7648   if (nested_in_vect_loop_p (loop, phi))
7649     {
           /* NOTE: these locals shadow the function-scope variables of the
              same names; the outer ones are assigned separately in the
              transform code below.  */
7650       imm_use_iterator imm_iter;
7651       use_operand_p use_p;
7652       gimple *exit_phi;
7653       edge latch_e;
7654       tree loop_arg;
7655
7656       if (ncopies > 1)
7657         {
7658           if (dump_enabled_p ())
7659             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7660                              "multiple types in nested loop.\n");
7661           return false;
7662         }
7663
7664       /* FORNOW: outer loop induction with SLP not supported.  */
7665       if (STMT_SLP_TYPE (stmt_info))
7666         return false;
7667
           /* Find the first non-debug use of the inner-loop latch value that
              lies outside the inner loop, if any.  */
7668       exit_phi = NULL;
7669       latch_e = loop_latch_edge (loop->inner);
7670       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7671       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7672         {
7673           gimple *use_stmt = USE_STMT (use_p);
7674           if (is_gimple_debug (use_stmt))
7675             continue;
7676
7677           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7678             {
7679               exit_phi = use_stmt;
7680               break;
7681             }
7682         }
7683       if (exit_phi)
7684         {
7685           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
7686           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7687                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7688             {
7689               if (dump_enabled_p ())
7690                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7691                                  "inner-loop induction only used outside "
7692                                  "of the outer vectorized loop.\n");
7693               return false;
7694             }
7695         }
7696
7697       nested_in_vect_loop = true;
7698       iv_loop = loop->inner;
7699     }
7700   else
7701     iv_loop = loop;
7702   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7703
7704   if (slp_node && !nunits.is_constant ())
7705     {
7706       /* The current SLP code creates the initial value element-by-element.  */
7707       if (dump_enabled_p ())
7708         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7709                          "SLP induction not supported for variable-length"
7710                          " vectors.\n");
7711       return false;
7712     }
7713
7714   if (!vec_stmt) /* transformation not required.  */
7715     {
7716       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7717       if (dump_enabled_p ())
7718         dump_printf_loc (MSG_NOTE, vect_location,
7719                          "=== vectorizable_induction ===\n");
7720       vect_model_induction_cost (stmt_info, ncopies);
7721       return true;
7722     }
7723
7724   /* Transform.  */
7725
7726   /* Compute a vector variable, initialized with the first VF values of
7727      the induction variable.  E.g., for an iv with IV_PHI='X' and
7728      evolution S, for a vector of 4 units, we want to compute:
7729      [X, X + S, X + 2*S, X + 3*S].  */
7730
7731   if (dump_enabled_p ())
7732     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7733
       /* LOOP_ARG, the value PHI receives over the latch edge, is only needed
          at the end to locate the exit phi in the nested case.  */
7734   latch_e = loop_latch_edge (iv_loop);
7735   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7736
7737   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7738   gcc_assert (step_expr != NULL_TREE);
7739
       /* PE is the preheader edge of the loop containing PHI; the statement
          sequences built below are inserted on it.  */
7740   pe = loop_preheader_edge (iv_loop);
7741   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7742                                      loop_preheader_edge (iv_loop));
7743
7744   stmts = NULL;
7745   if (!nested_in_vect_loop)
7746     {
7747       /* Convert the initial value to the desired type.  */
7748       tree new_type = TREE_TYPE (vectype);
7749       init_expr = gimple_convert (&stmts, new_type, init_expr);
7750
7751       /* If we are using the loop mask to "peel" for alignment then we need
7752          to adjust the start value here.  */
7753       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7754       if (skip_niters != NULL_TREE)
7755         {
7756           if (FLOAT_TYPE_P (vectype))
7757             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7758                                         skip_niters);
7759           else
7760             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7761           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7762                                          skip_niters, step_expr);
7763           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7764                                     init_expr, skip_step);
7765         }
7766     }
7767
7768   /* Convert the step to the desired type.  */
7769   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7770
       /* Inserting on the preheader edge is required not to create a new
          basic block.  */
7771   if (stmts)
7772     {
7773       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7774       gcc_assert (!new_bb);
7775     }
7776
7777   /* Find the first insertion point in the BB.  */
7778   si = gsi_after_labels (bb);
7779
7780   /* For SLP induction we have to generate several IVs as for example
7781      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7782      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7783      [VF*S, VF*S, VF*S, VF*S] for all.  */
7784   if (slp_node)
7785     {
7786       /* Enforced above.  */
7787       unsigned int const_nunits = nunits.to_constant ();
7788
7789       /* Generate [VF*S, VF*S, ... ].  */
7790       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7791         {
7792           expr = build_int_cst (integer_type_node, vf);
7793           expr = fold_convert (TREE_TYPE (step_expr), expr);
7794         }
7795       else
7796         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7797       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7798                               expr, step_expr);
7799       if (! CONSTANT_CLASS_P (new_name))
7800         new_name = vect_init_vector (phi, new_name,
7801                                      TREE_TYPE (step_expr), NULL);
7802       new_vec = build_vector_from_val (vectype, new_name);
7803       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7804
7805       /* Now generate the IVs.  */
7806       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7807       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7808       unsigned elts = const_nunits * nvects;
           /* NIVS is the number of IV vectors that get their own phi; vector
              IVN >= NIVS is later derived from vector IVN - NIVS.  */
7809       unsigned nivs = least_common_multiple (group_size,
7810                                              const_nunits) / const_nunits;
7811       gcc_assert (elts % group_size == 0);
7812       tree elt = init_expr;
7813       unsigned ivn;
7814       for (ivn = 0; ivn < nivs; ++ivn)
7815         {
7816           tree_vector_builder elts (vectype, const_nunits, 1);
7817           stmts = NULL;
7818           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7819             {
                   /* Move on to the next scalar IV value whenever the running
                      element index is a non-zero multiple of GROUP_SIZE.  */
7820               if (ivn*const_nunits + eltn >= group_size
7821                   && (ivn * const_nunits + eltn) % group_size == 0)
7822                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7823                                     elt, step_expr);
7824               elts.quick_push (elt);
7825             }
7826           vec_init = gimple_build_vector (&stmts, &elts);
7827           if (stmts)
7828             {
7829               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7830               gcc_assert (!new_bb);
7831             }
7832
7833           /* Create the induction-phi that defines the induction-operand.  */
7834           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7835           induction_phi = create_phi_node (vec_dest, iv_loop->header);
7836           set_vinfo_for_stmt (induction_phi,
7837                               new_stmt_vec_info (induction_phi, loop_vinfo));
7838           induc_def = PHI_RESULT (induction_phi);
7839
7840           /* Create the iv update inside the loop  */
7841           vec_def = make_ssa_name (vec_dest);
7842           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7843           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7844           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7845
7846           /* Set the arguments of the phi node:  */
7847           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7848           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7849                        UNKNOWN_LOCATION);
7850
7851           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7852         }
7853
7854       /* Re-use IVs when we can.  */
7855       if (ivn < nvects)
7856         {
7857           unsigned vfp
7858             = least_common_multiple (group_size, const_nunits) / group_size;
7859           /* Generate [VF'*S, VF'*S, ... ].  */
7860           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7861             {
7862               expr = build_int_cst (integer_type_node, vfp);
7863               expr = fold_convert (TREE_TYPE (step_expr), expr);
7864             }
7865           else
7866             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7867           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7868                                   expr, step_expr);
7869           if (! CONSTANT_CLASS_P (new_name))
7870             new_name = vect_init_vector (phi, new_name,
7871                                          TREE_TYPE (step_expr), NULL);
7872           new_vec = build_vector_from_val (vectype, new_name);
7873           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
               /* Each remaining vector is the one created NIVS entries earlier
                  plus the uniform VF' * S step.  */
7874           for (; ivn < nvects; ++ivn)
7875             {
7876               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7877               tree def;
7878               if (gimple_code (iv) == GIMPLE_PHI)
7879                 def = gimple_phi_result (iv);
7880               else
7881                 def = gimple_assign_lhs (iv);
7882               new_stmt = gimple_build_assign (make_ssa_name (vectype),
7883                                               PLUS_EXPR,
7884                                               def, vec_step);
                   /* An increment of a phi is inserted before SI; otherwise
                      the new statement goes right after the statement it is
                      derived from.  */
7885               if (gimple_code (iv) == GIMPLE_PHI)
7886                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7887               else
7888                 {
7889                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7890                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7891                 }
7892               set_vinfo_for_stmt (new_stmt,
7893                                   new_stmt_vec_info (new_stmt, loop_vinfo));
7894               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7895             }
7896         }
7897
           /* The SLP path is complete; *VEC_STMT is not written here.  */
7898       return true;
7899     }
7900
7901   /* Create the vector that holds the initial_value of the induction.  */
7902   if (nested_in_vect_loop)
7903     {
7904       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7905          been created during vectorization of previous stmts.  We obtain it
7906          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7907       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7908       /* If the initial value is not of proper type, convert it.  */
7909       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7910         {
7911           new_stmt
7912             = gimple_build_assign (vect_get_new_ssa_name (vectype,
7913                                                           vect_simple_var,
7914                                                           "vec_iv_"),
7915                                    VIEW_CONVERT_EXPR,
7916                                    build1 (VIEW_CONVERT_EXPR, vectype,
7917                                            vec_init));
7918           vec_init = gimple_assign_lhs (new_stmt);
7919           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7920                                                  new_stmt);
7921           gcc_assert (!new_bb);
7922           set_vinfo_for_stmt (new_stmt,
7923                               new_stmt_vec_info (new_stmt, loop_vinfo));
7924         }
7925     }
7926   else
7927     {
7928       /* iv_loop is the loop to be vectorized. Create:
7929          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7930       stmts = NULL;
7931       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7932
           /* Three ways to build the initial vector: element by element when
              the element count is constant, a VEC_SERIES_EXPR for integral
              steps, and base + index * step for floating-point steps.  */
7933       unsigned HOST_WIDE_INT const_nunits;
7934       if (nunits.is_constant (&const_nunits))
7935         {
7936           tree_vector_builder elts (vectype, const_nunits, 1);
7937           elts.quick_push (new_name);
7938           for (i = 1; i < const_nunits; i++)
7939             {
7940               /* Create: new_name_i = new_name + step_expr  */
7941               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7942                                        new_name, step_expr);
7943               elts.quick_push (new_name);
7944             }
7945           /* Create a vector from [new_name_0, new_name_1, ...,
7946              new_name_nunits-1]  */
7947           vec_init = gimple_build_vector (&stmts, &elts);
7948         }
7949       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7950         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7951         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7952                                  new_name, step_expr);
7953       else
7954         {
7955           /* Build:
7956                 [base, base, base, ...]
7957                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7958           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7959           gcc_assert (flag_associative_math);
7960           tree index = build_index_vector (vectype, 0, 1);
7961           tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7962                                                         new_name);
7963           tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7964                                                         step_expr);
7965           vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7966           vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7967                                    vec_init, step_vec);
7968           vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7969                                    vec_init, base_vec);
7970         }
7971
7972       if (stmts)
7973         {
7974           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7975           gcc_assert (!new_bb);
7976         }
7977     }
7978
7979
7980   /* Create the vector that holds the step of the induction.  */
7981   if (nested_in_vect_loop)
7982     /* iv_loop is nested in the loop to be vectorized. Generate:
7983        vec_step = [S, S, S, S]  */
7984     new_name = step_expr;
7985   else
7986     {
7987       /* iv_loop is the loop to be vectorized. Generate:
7988           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7989       gimple_seq seq = NULL;
7990       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7991         {
7992           expr = build_int_cst (integer_type_node, vf);
7993           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7994         }
7995       else
7996         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7997       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7998                                expr, step_expr);
7999       if (seq)
8000         {
8001           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8002           gcc_assert (!new_bb);
8003         }
8004     }
8005
8006   t = unshare_expr (new_name);
8007   gcc_assert (CONSTANT_CLASS_P (new_name)
8008               || TREE_CODE (new_name) == SSA_NAME);
8009   new_vec = build_vector_from_val (vectype, t);
8010   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8011
8012
8013   /* Create the following def-use cycle:
8014      loop prolog:
8015          vec_init = ...
8016          vec_step = ...
8017      loop:
8018          vec_iv = PHI <vec_init, vec_loop>
8019          ...
8020          STMT
8021          ...
8022          vec_loop = vec_iv + vec_step;  */
8023
8024   /* Create the induction-phi that defines the induction-operand.  */
8025   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8026   induction_phi = create_phi_node (vec_dest, iv_loop->header);
8027   set_vinfo_for_stmt (induction_phi,
8028                       new_stmt_vec_info (induction_phi, loop_vinfo));
8029   induc_def = PHI_RESULT (induction_phi);
8030
8031   /* Create the iv update inside the loop  */
8032   vec_def = make_ssa_name (vec_dest);
8033   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
8034   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8035   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
8036
8037   /* Set the arguments of the phi node:  */
8038   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8039   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8040                UNKNOWN_LOCATION);
8041
8042   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
8043
8044   /* In case that vectorization factor (VF) is bigger than the number
8045      of elements that we can fit in a vectype (nunits), we have to generate
8046      more than one vector stmt - i.e - we need to "unroll" the
8047      vector stmt by a factor VF/nunits.  For more details see documentation
8048      in vectorizable_operation.  */
8049
8050   if (ncopies > 1)
8051     {
8052       gimple_seq seq = NULL;
8053       stmt_vec_info prev_stmt_vinfo;
8054       /* FORNOW. This restriction should be relaxed.  */
8055       gcc_assert (!nested_in_vect_loop);
8056
8057       /* Create the vector that holds the step of the induction.  */
8058       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8059         {
8060           expr = build_int_cst (integer_type_node, nunits);
8061           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8062         }
8063       else
8064         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8065       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8066                                expr, step_expr);
8067       if (seq)
8068         {
8069           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8070           gcc_assert (!new_bb);
8071         }
8072
8073       t = unshare_expr (new_name);
8074       gcc_assert (CONSTANT_CLASS_P (new_name)
8075                   || TREE_CODE (new_name) == SSA_NAME);
8076       new_vec = build_vector_from_val (vectype, t);
8077       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8078
           /* Chain the copies: copy I is copy I-1 plus nunits * S, and each
              copy is recorded as the related statement of its predecessor,
              starting from the induction phi.  */
8079       vec_def = induc_def;
8080       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8081       for (i = 1; i < ncopies; i++)
8082         {
8083           /* vec_i = vec_prev + vec_step  */
8084           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8085                                           vec_def, vec_step);
8086           vec_def = make_ssa_name (vec_dest, new_stmt);
8087           gimple_assign_set_lhs (new_stmt, vec_def);
8088
8089           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8090           set_vinfo_for_stmt (new_stmt,
8091                               new_stmt_vec_info (new_stmt, loop_vinfo));
8092           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8093           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8094         }
8095     }
8096
8097   if (nested_in_vect_loop)
8098     {
8099       /* Find the loop-closed exit-phi of the induction, and record
8100          the final vector of induction results:  */
8101       exit_phi = NULL;
8102       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8103         {
8104           gimple *use_stmt = USE_STMT (use_p);
8105           if (is_gimple_debug (use_stmt))
8106             continue;
8107
8108           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8109             {
8110               exit_phi = use_stmt;
8111               break;
8112             }
8113         }
8114       if (exit_phi)
8115         {
8116           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8117           /* FORNOW. Currently not supporting the case that an inner-loop induction
8118              is not used in the outer-loop (i.e. only outside the outer-loop).  */
8119           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8120                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
8121
               /* NEW_STMT is still the IV update statement created above, since
                  the multi-copy block is never entered for nested inductions.  */
8122           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8123           if (dump_enabled_p ())
8124             {
8125               dump_printf_loc (MSG_NOTE, vect_location,
8126                                "vector of inductions after inner-loop:");
8127               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8128             }
8129         }
8130     }
8131
8132
       /* VEC_DEF is the latch update when there is a single copy, or the
          result of the last extra copy otherwise.  */
8133   if (dump_enabled_p ())
8134     {
8135       dump_printf_loc (MSG_NOTE, vect_location,
8136                        "transform induction: created def-use cycle: ");
8137       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8138       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8139                         SSA_NAME_DEF_STMT (vec_def), 0);
8140     }
8141
8142   return true;
8143 }
8144
8145 /* Function vectorizable_live_operation.
8146
8147    STMT computes a value that is used outside the loop.  Check if
8148    it can be supported.  */
8149
8150 bool
8151 vectorizable_live_operation (gimple *stmt,
8152                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8153                              slp_tree slp_node, int slp_index,
8154                              gimple **vec_stmt)
8155 {
8156   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8157   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8158   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8159   imm_use_iterator imm_iter;
8160   tree lhs, lhs_type, bitsize, vec_bitsize;
8161   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8162   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8163   int ncopies;
8164   gimple *use_stmt;
8165   auto_vec<tree> vec_oprnds;
8166   int vec_entry = 0;
8167   poly_uint64 vec_index = 0;
8168
8169   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8170
8171   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8172     return false;
8173
8174   /* FORNOW.  CHECKME.  */
8175   if (nested_in_vect_loop_p (loop, stmt))
8176     return false;
8177
8178   /* If STMT is not relevant and it is a simple assignment and its inputs are
8179      invariant then it can remain in place, unvectorized.  The original last
8180      scalar value that it computes will be used.  */
8181   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8182     {
8183       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8184       if (dump_enabled_p ())
8185         dump_printf_loc (MSG_NOTE, vect_location,
8186                          "statement is simple and uses invariant.  Leaving in "
8187                          "place.\n");
8188       return true;
8189     }
8190
8191   if (slp_node)
8192     ncopies = 1;
8193   else
8194     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8195
8196   if (slp_node)
8197     {
8198       gcc_assert (slp_index >= 0);
8199
8200       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8201       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8202
8203       /* Get the last occurrence of the scalar index from the concatenation of
8204          all the slp vectors. Calculate which slp vector it is and the index
8205          within.  */
8206       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8207
8208       /* Calculate which vector contains the result, and which lane of
8209          that vector we need.  */
8210       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8211         {
8212           if (dump_enabled_p ())
8213             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8214                              "Cannot determine which vector holds the"
8215                              " final result.\n");
8216           return false;
8217         }
8218     }
8219
8220   if (!vec_stmt)
8221     {
8222       /* No transformation required.  */
8223       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8224         {
8225           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8226                                                OPTIMIZE_FOR_SPEED))
8227             {
8228               if (dump_enabled_p ())
8229                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8230                                  "can't use a fully-masked loop because "
8231                                  "the target doesn't support extract last "
8232                                  "reduction.\n");
8233               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8234             }
8235           else if (slp_node)
8236             {
8237               if (dump_enabled_p ())
8238                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8239                                  "can't use a fully-masked loop because an "
8240                                  "SLP statement is live after the loop.\n");
8241               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8242             }
8243           else if (ncopies > 1)
8244             {
8245               if (dump_enabled_p ())
8246                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8247                                  "can't use a fully-masked loop because"
8248                                  " ncopies is greater than 1.\n");
8249               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8250             }
8251           else
8252             {
8253               gcc_assert (ncopies == 1 && !slp_node);
8254               vect_record_loop_mask (loop_vinfo,
8255                                      &LOOP_VINFO_MASKS (loop_vinfo),
8256                                      1, vectype);
8257             }
8258         }
8259       return true;
8260     }
8261
8262   /* If stmt has a related stmt, then use that for getting the lhs.  */
8263   if (is_pattern_stmt_p (stmt_info))
8264     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8265
8266   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8267         : gimple_get_lhs (stmt);
8268   lhs_type = TREE_TYPE (lhs);
8269
8270   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8271              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8272              : TYPE_SIZE (TREE_TYPE (vectype)));
8273   vec_bitsize = TYPE_SIZE (vectype);
8274
8275   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8276   tree vec_lhs, bitstart;
8277   if (slp_node)
8278     {
8279       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8280
8281       /* Get the correct slp vectorized stmt.  */
8282       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8283       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8284         vec_lhs = gimple_phi_result (phi);
8285       else
8286         vec_lhs = gimple_get_lhs (vec_stmt);
8287
8288       /* Get entry to use.  */
8289       bitstart = bitsize_int (vec_index);
8290       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8291     }
8292   else
8293     {
8294       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8295       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8296       gcc_checking_assert (ncopies == 1
8297                            || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8298
8299       /* For multiple copies, get the last copy.  */
8300       for (int i = 1; i < ncopies; ++i)
8301         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8302                                                   vec_lhs);
8303
8304       /* Get the last lane in the vector.  */
8305       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8306     }
8307
8308   gimple_seq stmts = NULL;
8309   tree new_tree;
8310   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8311     {
8312       /* Emit:
8313
8314            SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8315
8316          where VEC_LHS is the vectorized live-out result and MASK is
8317          the loop mask for the final iteration.  */
8318       gcc_assert (ncopies == 1 && !slp_node);
8319       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8320       tree scalar_res = make_ssa_name (scalar_type);
8321       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8322                                       1, vectype, 0);
8323       gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8324                                                     2, mask, vec_lhs);
8325       gimple_call_set_lhs (new_stmt, scalar_res);
8326       gimple_seq_add_stmt (&stmts, new_stmt);
8327
8328       /* Convert the extracted vector element to the required scalar type.  */
8329       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8330     }
8331   else
8332     {
8333       tree bftype = TREE_TYPE (vectype);
8334       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8335         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8336       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8337       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8338                                        &stmts, true, NULL_TREE);
8339     }
8340
8341   if (stmts)
8342     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8343
8344   /* Replace use of lhs with newly computed result.  If the use stmt is a
8345      single arg PHI, just replace all uses of PHI result.  It's necessary
8346      because lcssa PHI defining lhs may be before newly inserted stmt.  */
8347   use_operand_p use_p;
8348   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8349     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8350         && !is_gimple_debug (use_stmt))
8351     {
8352       if (gimple_code (use_stmt) == GIMPLE_PHI
8353           && gimple_phi_num_args (use_stmt) == 1)
8354         {
8355           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8356         }
8357       else
8358         {
8359           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8360             SET_USE (use_p, new_tree);
8361         }
8362       update_stmt (use_stmt);
8363     }
8364
8365   return true;
8366 }
8367
8368 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
8369
8370 static void
8371 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8372 {
8373   ssa_op_iter op_iter;
8374   imm_use_iterator imm_iter;
8375   def_operand_p def_p;
8376   gimple *ustmt;
8377
8378   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8379     {
8380       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8381         {
8382           basic_block bb;
8383
8384           if (!is_gimple_debug (ustmt))
8385             continue;
8386
8387           bb = gimple_bb (ustmt);
8388
8389           if (!flow_bb_inside_loop_p (loop, bb))
8390             {
8391               if (gimple_debug_bind_p (ustmt))
8392                 {
8393                   if (dump_enabled_p ())
8394                     dump_printf_loc (MSG_NOTE, vect_location,
8395                                      "killing debug use\n");
8396
8397                   gimple_debug_bind_reset_value (ustmt);
8398                   update_stmt (ustmt);
8399                 }
8400               else
8401                 gcc_unreachable ();
8402             }
8403         }
8404     }
8405 }
8406
8407 /* Given loop represented by LOOP_VINFO, return true if computation of
8408    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8409    otherwise.  */
8410
8411 static bool
8412 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8413 {
8414   /* Constant case.  */
8415   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8416     {
8417       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8418       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8419
8420       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8421       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8422       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8423         return true;
8424     }
8425
8426   widest_int max;
8427   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8428   /* Check the upper bound of loop niters.  */
8429   if (get_max_loop_iterations (loop, &max))
8430     {
8431       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8432       signop sgn = TYPE_SIGN (type);
8433       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8434       if (max < type_max)
8435         return true;
8436     }
8437   return false;
8438 }
8439
8440 /* Return a mask type with half the number of elements as TYPE.  */
8441
8442 tree
8443 vect_halve_mask_nunits (tree type)
8444 {
8445   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8446   return build_truth_vector_type (nunits, current_vector_size);
8447 }
8448
8449 /* Return a mask type with twice as many elements as TYPE.  */
8450
8451 tree
8452 vect_double_mask_nunits (tree type)
8453 {
8454   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8455   return build_truth_vector_type (nunits, current_vector_size);
8456 }
8457
8458 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8459    contain a sequence of NVECTORS masks that each control a vector of type
8460    VECTYPE.  */
8461
8462 void
8463 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8464                        unsigned int nvectors, tree vectype)
8465 {
8466   gcc_assert (nvectors != 0);
8467   if (masks->length () < nvectors)
8468     masks->safe_grow_cleared (nvectors);
8469   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8470   /* The number of scalars per iteration and the number of vectors are
8471      both compile-time constants.  */
8472   unsigned int nscalars_per_iter
8473     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8474                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8475   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8476     {
8477       rgm->max_nscalars_per_iter = nscalars_per_iter;
8478       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8479     }
8480 }
8481
8482 /* Given a complete set of masks MASKS, extract mask number INDEX
8483    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8484    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8485
8486    See the comment above vec_loop_masks for more details about the mask
8487    arrangement.  */
8488
8489 tree
8490 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8491                     unsigned int nvectors, tree vectype, unsigned int index)
8492 {
8493   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8494   tree mask_type = rgm->mask_type;
8495
8496   /* Populate the rgroup's mask array, if this is the first time we've
8497      used it.  */
8498   if (rgm->masks.is_empty ())
8499     {
8500       rgm->masks.safe_grow_cleared (nvectors);
8501       for (unsigned int i = 0; i < nvectors; ++i)
8502         {
8503           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8504           /* Provide a dummy definition until the real one is available.  */
8505           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8506           rgm->masks[i] = mask;
8507         }
8508     }
8509
8510   tree mask = rgm->masks[index];
8511   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8512                 TYPE_VECTOR_SUBPARTS (vectype)))
8513     {
8514       /* A loop mask for data type X can be reused for data type Y
8515          if X has N times more elements than Y and if Y's elements
8516          are N times bigger than X's.  In this case each sequence
8517          of N elements in the loop mask will be all-zero or all-one.
8518          We can then view-convert the mask so that each sequence of
8519          N elements is replaced by a single element.  */
8520       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8521                               TYPE_VECTOR_SUBPARTS (vectype)));
8522       gimple_seq seq = NULL;
8523       mask_type = build_same_sized_truth_vector_type (vectype);
8524       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8525       if (seq)
8526         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8527     }
8528   return mask;
8529 }
8530
8531 /* Scale profiling counters by estimation for LOOP which is vectorized
8532    by factor VF.  */
8533
8534 static void
8535 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8536 {
8537   edge preheader = loop_preheader_edge (loop);
8538   /* Reduce loop iterations by the vectorization factor.  */
8539   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8540   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8541
8542   if (freq_h.nonzero_p ())
8543     {
8544       profile_probability p;
8545
8546       /* Avoid dropping loop body profile counter to 0 because of zero count
8547          in loop's preheader.  */
8548       if (!(freq_e == profile_count::zero ()))
8549         freq_e = freq_e.force_nonzero ();
8550       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8551       scale_loop_frequencies (loop, p);
8552     }
8553
8554   edge exit_e = single_exit (loop);
8555   exit_e->probability = profile_probability::always ()
8556                                  .apply_scale (1, new_est_niter + 1);
8557
8558   edge exit_l = single_pred_edge (loop->latch);
8559   profile_probability prob = exit_l->probability;
8560   exit_l->probability = exit_e->probability.invert ();
8561   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8562     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8563 }
8564
8565 /* Function vect_transform_loop.
8566
8567    The analysis phase has determined that the loop is vectorizable.
8568    Vectorize the loop - created vectorized stmts to replace the scalar
8569    stmts in the loop, and update the loop exit condition.
8570    Returns scalar epilogue loop if any.  */
8571
8572 struct loop *
8573 vect_transform_loop (loop_vec_info loop_vinfo)
8574 {
8575   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8576   struct loop *epilogue = NULL;
8577   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8578   int nbbs = loop->num_nodes;
8579   int i;
8580   tree niters_vector = NULL_TREE;
8581   tree step_vector = NULL_TREE;
8582   tree niters_vector_mult_vf = NULL_TREE;
8583   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8584   unsigned int lowest_vf = constant_lower_bound (vf);
8585   bool grouped_store;
8586   bool slp_scheduled = false;
8587   gimple *stmt, *pattern_stmt;
8588   gimple_seq pattern_def_seq = NULL;
8589   gimple_stmt_iterator pattern_def_si = gsi_none ();
8590   bool transform_pattern_stmt = false;
8591   bool check_profitability = false;
8592   unsigned int th;
8593
8594   if (dump_enabled_p ())
8595     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8596
8597   /* Use the more conservative vectorization threshold.  If the number
8598      of iterations is constant assume the cost check has been performed
8599      by our caller.  If the threshold makes all loops profitable that
8600      run at least the (estimated) vectorization factor number of times
8601      checking is pointless, too.  */
8602   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8603   if (th >= vect_vf_for_cost (loop_vinfo)
8604       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8605     {
8606       if (dump_enabled_p ())
8607         dump_printf_loc (MSG_NOTE, vect_location,
8608                          "Profitability threshold is %d loop iterations.\n",
8609                          th);
8610       check_profitability = true;
8611     }
8612
8613   /* Make sure there exists a single-predecessor exit bb.  Do this before
8614      versioning.   */
8615   edge e = single_exit (loop);
8616   if (! single_pred_p (e->dest))
8617     {
8618       split_loop_exit_edge (e);
8619       if (dump_enabled_p ())
8620         dump_printf (MSG_NOTE, "split exit edge\n");
8621     }
8622
8623   /* Version the loop first, if required, so the profitability check
8624      comes first.  */
8625
8626   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8627     {
8628       poly_uint64 versioning_threshold
8629         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8630       if (check_profitability
8631           && ordered_p (poly_uint64 (th), versioning_threshold))
8632         {
8633           versioning_threshold = ordered_max (poly_uint64 (th),
8634                                               versioning_threshold);
8635           check_profitability = false;
8636         }
8637       vect_loop_versioning (loop_vinfo, th, check_profitability,
8638                             versioning_threshold);
8639       check_profitability = false;
8640     }
8641
8642   /* Make sure there exists a single-predecessor exit bb also on the
8643      scalar loop copy.  Do this after versioning but before peeling
8644      so CFG structure is fine for both scalar and if-converted loop
8645      to make slpeel_duplicate_current_defs_from_edges face matched
8646      loop closed PHI nodes on the exit.  */
8647   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8648     {
8649       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8650       if (! single_pred_p (e->dest))
8651         {
8652           split_loop_exit_edge (e);
8653           if (dump_enabled_p ())
8654             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8655         }
8656     }
8657
8658   tree niters = vect_build_loop_niters (loop_vinfo);
8659   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8660   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8661   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8662   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8663                               &step_vector, &niters_vector_mult_vf, th,
8664                               check_profitability, niters_no_overflow);
8665
8666   if (niters_vector == NULL_TREE)
8667     {
8668       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8669           && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8670           && known_eq (lowest_vf, vf))
8671         {
8672           niters_vector
8673             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8674                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8675           step_vector = build_one_cst (TREE_TYPE (niters));
8676         }
8677       else
8678         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8679                                      &step_vector, niters_no_overflow);
8680     }
8681
8682   /* 1) Make sure the loop header has exactly two entries
8683      2) Make sure we have a preheader basic block.  */
8684
8685   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8686
8687   split_edge (loop_preheader_edge (loop));
8688
8689   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8690       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8691     /* This will deal with any possible peeling.  */
8692     vect_prepare_for_masked_peels (loop_vinfo);
8693
8694   /* FORNOW: the vectorizer supports only loops which body consist
8695      of one basic block (header + empty latch). When the vectorizer will
8696      support more involved loop forms, the order by which the BBs are
8697      traversed need to be reconsidered.  */
8698
8699   for (i = 0; i < nbbs; i++)
8700     {
8701       basic_block bb = bbs[i];
8702       stmt_vec_info stmt_info;
8703
8704       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8705            gsi_next (&si))
8706         {
8707           gphi *phi = si.phi ();
8708           if (dump_enabled_p ())
8709             {
8710               dump_printf_loc (MSG_NOTE, vect_location,
8711                                "------>vectorizing phi: ");
8712               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8713             }
8714           stmt_info = vinfo_for_stmt (phi);
8715           if (!stmt_info)
8716             continue;
8717
8718           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8719             vect_loop_kill_debug_uses (loop, phi);
8720
8721           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8722               && !STMT_VINFO_LIVE_P (stmt_info))
8723             continue;
8724
8725           if (STMT_VINFO_VECTYPE (stmt_info)
8726               && (maybe_ne
8727                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8728               && dump_enabled_p ())
8729             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8730
8731           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8732                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8733                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8734               && ! PURE_SLP_STMT (stmt_info))
8735             {
8736               if (dump_enabled_p ())
8737                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8738               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8739             }
8740         }
8741
8742       pattern_stmt = NULL;
8743       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8744            !gsi_end_p (si) || transform_pattern_stmt;)
8745         {
8746           bool is_store;
8747
8748           if (transform_pattern_stmt)
8749             stmt = pattern_stmt;
8750           else
8751             {
8752               stmt = gsi_stmt (si);
8753               /* During vectorization remove existing clobber stmts.  */
8754               if (gimple_clobber_p (stmt))
8755                 {
8756                   unlink_stmt_vdef (stmt);
8757                   gsi_remove (&si, true);
8758                   release_defs (stmt);
8759                   continue;
8760                 }
8761             }
8762
8763           if (dump_enabled_p ())
8764             {
8765               dump_printf_loc (MSG_NOTE, vect_location,
8766                                "------>vectorizing statement: ");
8767               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8768             }
8769
8770           stmt_info = vinfo_for_stmt (stmt);
8771
8772           /* vector stmts created in the outer-loop during vectorization of
8773              stmts in an inner-loop may not have a stmt_info, and do not
8774              need to be vectorized.  */
8775           if (!stmt_info)
8776             {
8777               gsi_next (&si);
8778               continue;
8779             }
8780
8781           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8782             vect_loop_kill_debug_uses (loop, stmt);
8783
8784           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8785               && !STMT_VINFO_LIVE_P (stmt_info))
8786             {
8787               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8788                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8789                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8790                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8791                 {
8792                   stmt = pattern_stmt;
8793                   stmt_info = vinfo_for_stmt (stmt);
8794                 }
8795               else
8796                 {
8797                   gsi_next (&si);
8798                   continue;
8799                 }
8800             }
8801           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8802                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8803                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8804                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8805             transform_pattern_stmt = true;
8806
8807           /* If pattern statement has def stmts, vectorize them too.  */
8808           if (is_pattern_stmt_p (stmt_info))
8809             {
8810               if (pattern_def_seq == NULL)
8811                 {
8812                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8813                   pattern_def_si = gsi_start (pattern_def_seq);
8814                 }
8815               else if (!gsi_end_p (pattern_def_si))
8816                 gsi_next (&pattern_def_si);
8817               if (pattern_def_seq != NULL)
8818                 {
8819                   gimple *pattern_def_stmt = NULL;
8820                   stmt_vec_info pattern_def_stmt_info = NULL;
8821
8822                   while (!gsi_end_p (pattern_def_si))
8823                     {
8824                       pattern_def_stmt = gsi_stmt (pattern_def_si);
8825                       pattern_def_stmt_info
8826                         = vinfo_for_stmt (pattern_def_stmt);
8827                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8828                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8829                         break;
8830                       gsi_next (&pattern_def_si);
8831                     }
8832
8833                   if (!gsi_end_p (pattern_def_si))
8834                     {
8835                       if (dump_enabled_p ())
8836                         {
8837                           dump_printf_loc (MSG_NOTE, vect_location,
8838                                            "==> vectorizing pattern def "
8839                                            "stmt: ");
8840                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8841                                             pattern_def_stmt, 0);
8842                         }
8843
8844                       stmt = pattern_def_stmt;
8845                       stmt_info = pattern_def_stmt_info;
8846                     }
8847                   else
8848                     {
8849                       pattern_def_si = gsi_none ();
8850                       transform_pattern_stmt = false;
8851                     }
8852                 }
8853               else
8854                 transform_pattern_stmt = false;
8855             }
8856
8857           if (STMT_VINFO_VECTYPE (stmt_info))
8858             {
8859               poly_uint64 nunits
8860                 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8861               if (!STMT_SLP_TYPE (stmt_info)
8862                   && maybe_ne (nunits, vf)
8863                   && dump_enabled_p ())
8864                   /* For SLP VF is set according to unrolling factor, and not
8865                      to vector size, hence for SLP this print is not valid.  */
8866                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8867             }
8868
8869           /* SLP. Schedule all the SLP instances when the first SLP stmt is
8870              reached.  */
8871           if (STMT_SLP_TYPE (stmt_info))
8872             {
8873               if (!slp_scheduled)
8874                 {
8875                   slp_scheduled = true;
8876
8877                   if (dump_enabled_p ())
8878                     dump_printf_loc (MSG_NOTE, vect_location,
8879                                      "=== scheduling SLP instances ===\n");
8880
8881                   vect_schedule_slp (loop_vinfo);
8882                 }
8883
8884               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8885               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8886                 {
8887                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8888                     {
8889                       pattern_def_seq = NULL;
8890                       gsi_next (&si);
8891                     }
8892                   continue;
8893                 }
8894             }
8895
8896           /* -------- vectorize statement ------------ */
8897           if (dump_enabled_p ())
8898             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8899
8900           grouped_store = false;
8901           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8902           if (is_store)
8903             {
8904               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8905                 {
8906                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8907                      interleaving chain was completed - free all the stores in
8908                      the chain.  */
8909                   gsi_next (&si);
8910                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8911                 }
8912               else
8913                 {
8914                   /* Free the attached stmt_vec_info and remove the stmt.  */
8915                   gimple *store = gsi_stmt (si);
8916                   free_stmt_vec_info (store);
8917                   unlink_stmt_vdef (store);
8918                   gsi_remove (&si, true);
8919                   release_defs (store);
8920                 }
8921
8922               /* Stores can only appear at the end of pattern statements.  */
8923               gcc_assert (!transform_pattern_stmt);
8924               pattern_def_seq = NULL;
8925             }
8926           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8927             {
8928               pattern_def_seq = NULL;
8929               gsi_next (&si);
8930             }
8931         }                       /* stmts in BB */
8932
8933       /* Stub out scalar statements that must not survive vectorization.
8934          Doing this here helps with grouped statements, or statements that
8935          are involved in patterns.  */
8936       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8937            !gsi_end_p (gsi); gsi_next (&gsi))
8938         {
8939           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8940           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8941             {
8942               tree lhs = gimple_get_lhs (call);
8943               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8944                 {
8945                   tree zero = build_zero_cst (TREE_TYPE (lhs));
8946                   gimple *new_stmt = gimple_build_assign (lhs, zero);
8947                   gsi_replace (&gsi, new_stmt, true);
8948                 }
8949             }
8950         }
8951     }                           /* BBs in loop */
8952
8953   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8954      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8955   if (integer_onep (step_vector))
8956     niters_no_overflow = true;
8957   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8958                            niters_vector_mult_vf, !niters_no_overflow);
8959
8960   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8961   scale_profile_for_vect_loop (loop, assumed_vf);
8962
8963   /* True if the final iteration might not handle a full vector's
8964      worth of scalar iterations.  */
8965   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8966   /* The minimum number of iterations performed by the epilogue.  This
8967      is 1 when peeling for gaps because we always need a final scalar
8968      iteration.  */
8969   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8970   /* +1 to convert latch counts to loop iteration counts,
8971      -min_epilogue_iters to remove iterations that cannot be performed
8972        by the vector code.  */
8973   int bias_for_lowest = 1 - min_epilogue_iters;
8974   int bias_for_assumed = bias_for_lowest;
8975   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8976   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8977     {
8978       /* When the amount of peeling is known at compile time, the first
8979          iteration will have exactly alignment_npeels active elements.
8980          In the worst case it will have at least one.  */
8981       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8982       bias_for_lowest += lowest_vf - min_first_active;
8983       bias_for_assumed += assumed_vf - min_first_active;
8984     }
8985   /* In these calculations the "- 1" converts loop iteration counts
8986      back to latch counts.  */
8987   if (loop->any_upper_bound)
8988     loop->nb_iterations_upper_bound
8989       = (final_iter_may_be_partial
8990          ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8991                           lowest_vf) - 1
8992          : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8993                            lowest_vf) - 1);
8994   if (loop->any_likely_upper_bound)
8995     loop->nb_iterations_likely_upper_bound
8996       = (final_iter_may_be_partial
8997          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8998                           + bias_for_lowest, lowest_vf) - 1
8999          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9000                            + bias_for_lowest, lowest_vf) - 1);
9001   if (loop->any_estimate)
9002     loop->nb_iterations_estimate
9003       = (final_iter_may_be_partial
9004          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9005                           assumed_vf) - 1
9006          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9007                            assumed_vf) - 1);
9008
9009   if (dump_enabled_p ())
9010     {
9011       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9012         {
9013           dump_printf_loc (MSG_NOTE, vect_location,
9014                            "LOOP VECTORIZED\n");
9015           if (loop->inner)
9016             dump_printf_loc (MSG_NOTE, vect_location,
9017                              "OUTER LOOP VECTORIZED\n");
9018           dump_printf (MSG_NOTE, "\n");
9019         }
9020       else
9021         {
9022           dump_printf_loc (MSG_NOTE, vect_location,
9023                            "LOOP EPILOGUE VECTORIZED (VS=");
9024           dump_dec (MSG_NOTE, current_vector_size);
9025           dump_printf (MSG_NOTE, ")\n");
9026         }
9027     }
9028
9029   /* Free SLP instances here because otherwise stmt reference counting
9030      won't work.  */
9031   slp_instance instance;
9032   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9033     vect_free_slp_instance (instance);
9034   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9035   /* Clear-up safelen field since its value is invalid after vectorization
9036      since vectorized loop can have loop-carried dependencies.  */
9037   loop->safelen = 0;
9038
9039   /* Don't vectorize epilogue for epilogue.  */
9040   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9041     epilogue = NULL;
9042
9043   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
9044     epilogue = NULL;
9045
9046   if (epilogue)
9047     {
9048       auto_vector_sizes vector_sizes;
9049       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
9050       unsigned int next_size = 0;
9051
9052       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9053           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
9054           && known_eq (vf, lowest_vf))
9055         {
9056           unsigned int eiters
9057             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
9058                - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
9059           eiters = eiters % lowest_vf;
9060           epilogue->nb_iterations_upper_bound = eiters - 1;
9061
9062           unsigned int ratio;
9063           while (next_size < vector_sizes.length ()
9064                  && !(constant_multiple_p (current_vector_size,
9065                                            vector_sizes[next_size], &ratio)
9066                       && eiters >= lowest_vf / ratio))
9067             next_size += 1;
9068         }
9069       else
9070         while (next_size < vector_sizes.length ()
9071                && maybe_lt (current_vector_size, vector_sizes[next_size]))
9072           next_size += 1;
9073
9074       if (next_size == vector_sizes.length ())
9075         epilogue = NULL;
9076     }
9077
9078   if (epilogue)
9079     {
9080       epilogue->force_vectorize = loop->force_vectorize;
9081       epilogue->safelen = loop->safelen;
9082       epilogue->dont_vectorize = false;
9083
9084       /* We may need to if-convert epilogue to vectorize it.  */
9085       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9086         tree_if_conversion (epilogue);
9087     }
9088
9089   return epilogue;
9090 }
9091
9092 /* The code below is trying to perform simple optimization - revert
9093    if-conversion for masked stores, i.e. if the mask of a store is zero
9094    do not perform it and all stored value producers also if possible.
9095    For example,
9096      for (i=0; i<n; i++)
9097        if (c[i])
9098         {
9099           p1[i] += 1;
9100           p2[i] = p3[i] +2;
9101         }
9102    this transformation will produce the following semi-hammock:
9103
9104    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9105      {
9106        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9107        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9108        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9109        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9110        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9111        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9112      }
9113 */
9114
9115 void
9116 optimize_mask_stores (struct loop *loop)
9117 {
9118   basic_block *bbs = get_loop_body (loop);
9119   unsigned nbbs = loop->num_nodes;
9120   unsigned i;
9121   basic_block bb;
9122   struct loop *bb_loop;
9123   gimple_stmt_iterator gsi;
9124   gimple *stmt;
9125   auto_vec<gimple *> worklist;
9126
9127   vect_location = find_loop_location (loop);
9128   /* Pick up all masked stores in loop if any.  */
9129   for (i = 0; i < nbbs; i++)
9130     {
9131       bb = bbs[i];
9132       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9133            gsi_next (&gsi))
9134         {
9135           stmt = gsi_stmt (gsi);
9136           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9137             worklist.safe_push (stmt);
9138         }
9139     }
9140
9141   free (bbs);
9142   if (worklist.is_empty ())
9143     return;
9144
9145   /* Loop has masked stores.  */
9146   while (!worklist.is_empty ())
9147     {
9148       gimple *last, *last_store;
9149       edge e, efalse;
9150       tree mask;
9151       basic_block store_bb, join_bb;
9152       gimple_stmt_iterator gsi_to;
9153       tree vdef, new_vdef;
9154       gphi *phi;
9155       tree vectype;
9156       tree zero;
9157
9158       last = worklist.pop ();
9159       mask = gimple_call_arg (last, 2);
9160       bb = gimple_bb (last);
9161       /* Create then_bb and if-then structure in CFG, then_bb belongs to
9162          the same loop as if_bb.  It could be different to LOOP when two
9163          level loop-nest is vectorized and mask_store belongs to the inner
9164          one.  */
9165       e = split_block (bb, last);
9166       bb_loop = bb->loop_father;
9167       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9168       join_bb = e->dest;
9169       store_bb = create_empty_bb (bb);
9170       add_bb_to_loop (store_bb, bb_loop);
9171       e->flags = EDGE_TRUE_VALUE;
9172       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9173       /* Put STORE_BB to likely part.  */
9174       efalse->probability = profile_probability::unlikely ();
9175       store_bb->count = efalse->count ();
9176       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9177       if (dom_info_available_p (CDI_DOMINATORS))
9178         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9179       if (dump_enabled_p ())
9180         dump_printf_loc (MSG_NOTE, vect_location,
9181                          "Create new block %d to sink mask stores.",
9182                          store_bb->index);
9183       /* Create vector comparison with boolean result.  */
9184       vectype = TREE_TYPE (mask);
9185       zero = build_zero_cst (vectype);
9186       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9187       gsi = gsi_last_bb (bb);
9188       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9189       /* Create new PHI node for vdef of the last masked store:
9190          .MEM_2 = VDEF <.MEM_1>
9191          will be converted to
9192          .MEM.3 = VDEF <.MEM_1>
9193          and new PHI node will be created in join bb
9194          .MEM_2 = PHI <.MEM_1, .MEM_3>
9195       */
9196       vdef = gimple_vdef (last);
9197       new_vdef = make_ssa_name (gimple_vop (cfun), last);
9198       gimple_set_vdef (last, new_vdef);
9199       phi = create_phi_node (vdef, join_bb);
9200       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9201
9202       /* Put all masked stores with the same mask to STORE_BB if possible.  */
9203       while (true)
9204         {
9205           gimple_stmt_iterator gsi_from;
9206           gimple *stmt1 = NULL;
9207
9208           /* Move masked store to STORE_BB.  */
9209           last_store = last;
9210           gsi = gsi_for_stmt (last);
9211           gsi_from = gsi;
9212           /* Shift GSI to the previous stmt for further traversal.  */
9213           gsi_prev (&gsi);
9214           gsi_to = gsi_start_bb (store_bb);
9215           gsi_move_before (&gsi_from, &gsi_to);
9216           /* Setup GSI_TO to the non-empty block start.  */
9217           gsi_to = gsi_start_bb (store_bb);
9218           if (dump_enabled_p ())
9219             {
9220               dump_printf_loc (MSG_NOTE, vect_location,
9221                                "Move stmt to created bb\n");
9222               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9223             }
9224           /* Move all stored value producers if possible.  */
9225           while (!gsi_end_p (gsi))
9226             {
9227               tree lhs;
9228               imm_use_iterator imm_iter;
9229               use_operand_p use_p;
9230               bool res;
9231
9232               /* Skip debug statements.  */
9233               if (is_gimple_debug (gsi_stmt (gsi)))
9234                 {
9235                   gsi_prev (&gsi);
9236                   continue;
9237                 }
9238               stmt1 = gsi_stmt (gsi);
9239               /* Do not consider statements writing to memory or having
9240                  volatile operand.  */
9241               if (gimple_vdef (stmt1)
9242                   || gimple_has_volatile_ops (stmt1))
9243                 break;
9244               gsi_from = gsi;
9245               gsi_prev (&gsi);
9246               lhs = gimple_get_lhs (stmt1);
9247               if (!lhs)
9248                 break;
9249
9250               /* LHS of vectorized stmt must be SSA_NAME.  */
9251               if (TREE_CODE (lhs) != SSA_NAME)
9252                 break;
9253
9254               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9255                 {
9256                   /* Remove dead scalar statement.  */
9257                   if (has_zero_uses (lhs))
9258                     {
9259                       gsi_remove (&gsi_from, true);
9260                       continue;
9261                     }
9262                 }
9263
9264               /* Check that LHS does not have uses outside of STORE_BB.  */
9265               res = true;
9266               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9267                 {
9268                   gimple *use_stmt;
9269                   use_stmt = USE_STMT (use_p);
9270                   if (is_gimple_debug (use_stmt))
9271                     continue;
9272                   if (gimple_bb (use_stmt) != store_bb)
9273                     {
9274                       res = false;
9275                       break;
9276                     }
9277                 }
9278               if (!res)
9279                 break;
9280
9281               if (gimple_vuse (stmt1)
9282                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
9283                 break;
9284
9285               /* Can move STMT1 to STORE_BB.  */
9286               if (dump_enabled_p ())
9287                 {
9288                   dump_printf_loc (MSG_NOTE, vect_location,
9289                                    "Move stmt to created bb\n");
9290                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9291                 }
9292               gsi_move_before (&gsi_from, &gsi_to);
9293               /* Shift GSI_TO for further insertion.  */
9294               gsi_prev (&gsi_to);
9295             }
9296           /* Put other masked stores with the same mask to STORE_BB.  */
9297           if (worklist.is_empty ()
9298               || gimple_call_arg (worklist.last (), 2) != mask
9299               || worklist.last () != stmt1)
9300             break;
9301           last = worklist.pop ();
9302         }
9303       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9304     }
9305 }