gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57
  58 /* Loop Vectorization Pass.
  59
  60    This pass tries to vectorize loops.
  61
  62    For example, the vectorizer transforms the following simple loop:
  63
  64         short a[N]; short b[N]; short c[N]; int i;
  65
  66         for (i=0; i<N; i++){
  67           a[i] = b[i] + c[i];
  68         }
  69
  70    as if it was manually vectorized by rewriting the source code into:
  71
  72         typedef int __attribute__((mode(V8HI))) v8hi;
  73         short a[N];  short b[N]; short c[N];   int i;
  74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  75         v8hi va, vb, vc;
  76
  77         for (i=0; i<N/8; i++){
  78           vb = pb[i];
  79           vc = pc[i];
  80           va = vb + vc;
  81           pa[i] = va;
  82         }
  83
  84         The main entry to this pass is vectorize_loops(), in which
  85    the vectorizer applies a set of analyses on a given set of loops,
  86    followed by the actual vectorization transformation for the loops that
  87    had successfully passed the analysis phase.
  88         Throughout this pass we make a distinction between two types of
  89    data: scalars (which are represented by SSA_NAMES), and memory references
  90    ("data-refs").  These two types of data require different handling both
  91    during analysis and transformation. The types of data-refs that the
  92    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  93    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  94    accesses are required to have a simple (consecutive) access pattern.
  95
  96    Analysis phase:
  97    ===============
  98         The driver for the analysis phase is vect_analyze_loop().
  99    It applies a set of analyses, some of which rely on the scalar evolution
 100    analyzer (scev) developed by Sebastian Pop.
 101
 102         During the analysis phase the vectorizer records some information
 103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 104    loop, as well as general information about the loop as a whole, which is
 105    recorded in a "loop_vec_info" struct attached to each loop.
 106
 107    Transformation phase:
 108    =====================
 109         The loop transformation phase scans all the stmts in the loop, and
 110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 111    the loop that needs to be vectorized.  It inserts the vector code sequence
 112    just before the scalar stmt S, and records a pointer to the vector code
 113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 114    attached to S).  This pointer will be used for the vectorization of following
 115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 116    otherwise, we rely on dead code elimination for removing it.
 117
 118         For example, say stmt S1 was vectorized into stmt VS1:
 119
 120    VS1: vb = px[i];
 121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 122    S2:  a = b;
 123
 124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 127    resulting sequence would be:
 128
 129    VS1: vb = px[i];
 130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 131    VS2: va = vb;
 132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 133
 134         Operands that are not SSA_NAMEs, are data-refs that appear in
 135    load/store operations (like 'x[i]' in S1), and are handled differently.
 136
 137    Target modeling:
 138    =================
 139         Currently the only target specific information that is used is the
 140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141    Targets that can support different sizes of vectors, for now will need
 142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 143    flexibility will be added in the future.
 144
 145         Since we only vectorize operations which vector form can be
 146    expressed using existing tree codes, to verify that an operation is
 147    supported, the vectorizer checks the relevant optab at the relevant
 148    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 149    the value found is CODE_FOR_nothing, then there's no target support, and
 150    we can't vectorize the stmt.
 151
 152    For additional information on this project see:
 153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 154 */
 155
 156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 157
 158 /* Function vect_determine_vectorization_factor
 159
 160    Determine the vectorization factor (VF).  VF is the number of data elements
 161    that are operated upon in parallel in a single iteration of the vectorized
 162    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 163    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 164    elements can fit in a single vector register.
 165
 166    We currently support vectorization of loops in which all types operated upon
 167    are of the same size.  Therefore this function currently sets VF according to
 168    the size of the types operated upon, and fails if there are multiple sizes
 169    in the loop.
 170
 171    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 172    original loop:
 173         for (i=0; i<N; i++){
 174           a[i] = b[i] + c[i];
 175         }
 176
 177    vectorized loop:
 178         for (i=0; i<N; i+=VF){
 179           a[i:VF] = b[i:VF] + c[i:VF];
 180         }
 181 */
 182
 183 static bool
 184 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 185 {
 186   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 187   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 188   unsigned nbbs = loop->num_nodes;
 189   poly_uint64 vectorization_factor = 1;
 190   tree scalar_type = NULL_TREE;
 191   gphi *phi;
 192   tree vectype;
 193   stmt_vec_info stmt_info;
 194   unsigned i;
 195   HOST_WIDE_INT dummy;
 196   gimple *stmt, *pattern_stmt = NULL;
 197   gimple_seq pattern_def_seq = NULL;
 198   gimple_stmt_iterator pattern_def_si = gsi_none ();
 199   bool analyze_pattern_stmt = false;
 200   bool bool_result;
 201   auto_vec<stmt_vec_info> mask_producers;
 202
 203   if (dump_enabled_p ())
 204     dump_printf_loc (MSG_NOTE, vect_location,
 205                      "=== vect_determine_vectorization_factor ===\n");
 206
 207   for (i = 0; i < nbbs; i++)
 208     {
 209       basic_block bb = bbs[i];
 210
 211       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 212            gsi_next (&si))
 213         {
 214           phi = si.phi ();
 215           stmt_info = vinfo_for_stmt (phi);
 216           if (dump_enabled_p ())
 217             {
 218               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 219               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 220             }
 221
 222           gcc_assert (stmt_info);
 223
 224           if (STMT_VINFO_RELEVANT_P (stmt_info)
 225               || STMT_VINFO_LIVE_P (stmt_info))
 226             {
 227               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 228               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 229
 230               if (dump_enabled_p ())
 231                 {
 232                   dump_printf_loc (MSG_NOTE, vect_location,
 233                                    "get vectype for scalar type:  ");
 234                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 235                   dump_printf (MSG_NOTE, "\n");
 236                 }
 237
 238               vectype = get_vectype_for_scalar_type (scalar_type);
 239               if (!vectype)
 240                 {
 241                   if (dump_enabled_p ())
 242                     {
 243                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 244                                        "not vectorized: unsupported "
 245                                        "data-type ");
 246                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 247                                          scalar_type);
 248                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 249                     }
 250                   return false;
 251                 }
 252               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 253
 254               if (dump_enabled_p ())
 255                 {
 256                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 257                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 258                   dump_printf (MSG_NOTE, "\n");
 259                 }
 260
 261               if (dump_enabled_p ())
 262                 {
 263                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 264                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 265                   dump_printf (MSG_NOTE, "\n");
 266                 }
 267
 268               vect_update_max_nunits (&vectorization_factor, vectype);
 269             }
 270         }
 271
 272       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 273            !gsi_end_p (si) || analyze_pattern_stmt;)
 274         {
 275           tree vf_vectype;
 276
 277           if (analyze_pattern_stmt)
 278             stmt = pattern_stmt;
 279           else
 280             stmt = gsi_stmt (si);
 281
 282           stmt_info = vinfo_for_stmt (stmt);
 283
 284           if (dump_enabled_p ())
 285             {
 286               dump_printf_loc (MSG_NOTE, vect_location,
 287                                "==> examining statement: ");
 288               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 289             }
 290
 291           gcc_assert (stmt_info);
 292
 293           /* Skip stmts which do not need to be vectorized.  */
 294           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 295                && !STMT_VINFO_LIVE_P (stmt_info))
 296               || gimple_clobber_p (stmt))
 297             {
 298               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 299                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 300                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 301                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 302                 {
 303                   stmt = pattern_stmt;
 304                   stmt_info = vinfo_for_stmt (pattern_stmt);
 305                   if (dump_enabled_p ())
 306                     {
 307                       dump_printf_loc (MSG_NOTE, vect_location,
 308                                        "==> examining pattern statement: ");
 309                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 310                     }
 311                 }
 312               else
 313                 {
 314                   if (dump_enabled_p ())
 315                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 316                   gsi_next (&si);
 317                   continue;
 318                 }
 319             }
 320           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 321                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 322                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 323                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 324             analyze_pattern_stmt = true;
 325
 326           /* If a pattern statement has def stmts, analyze them too.  */
 327           if (is_pattern_stmt_p (stmt_info))
 328             {
 329               if (pattern_def_seq == NULL)
 330                 {
 331                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 332                   pattern_def_si = gsi_start (pattern_def_seq);
 333                 }
 334               else if (!gsi_end_p (pattern_def_si))
 335                 gsi_next (&pattern_def_si);
 336               if (pattern_def_seq != NULL)
 337                 {
 338                   gimple *pattern_def_stmt = NULL;
 339                   stmt_vec_info pattern_def_stmt_info = NULL;
 340
 341                   while (!gsi_end_p (pattern_def_si))
 342                     {
 343                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 344                       pattern_def_stmt_info
 345                         = vinfo_for_stmt (pattern_def_stmt);
 346                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 347                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 348                         break;
 349                       gsi_next (&pattern_def_si);
 350                     }
 351
 352                   if (!gsi_end_p (pattern_def_si))
 353                     {
 354                       if (dump_enabled_p ())
 355                         {
 356                           dump_printf_loc (MSG_NOTE, vect_location,
 357                                            "==> examining pattern def stmt: ");
 358                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 359                                             pattern_def_stmt, 0);
 360                         }
 361
 362                       stmt = pattern_def_stmt;
 363                       stmt_info = pattern_def_stmt_info;
 364                     }
 365                   else
 366                     {
 367                       pattern_def_si = gsi_none ();
 368                       analyze_pattern_stmt = false;
 369                     }
 370                 }
 371               else
 372                 analyze_pattern_stmt = false;
 373             }
 374
 375           if (gimple_get_lhs (stmt) == NULL_TREE
 376               /* MASK_STORE has no lhs, but is ok.  */
 377               && (!is_gimple_call (stmt)
 378                   || !gimple_call_internal_p (stmt)
 379                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 380             {
 381               if (is_gimple_call (stmt))
 382                 {
 383                   /* Ignore calls with no lhs.  These must be calls to
 384                      #pragma omp simd functions, and what vectorization factor
 385                      it really needs can't be determined until
 386                      vectorizable_simd_clone_call.  */
 387                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 388                     {
 389                       pattern_def_seq = NULL;
 390                       gsi_next (&si);
 391                     }
 392                   continue;
 393                 }
 394               if (dump_enabled_p ())
 395                 {
 396                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 397                                    "not vectorized: irregular stmt.");
 398                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 399                                     0);
 400                 }
 401               return false;
 402             }
 403
 404           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 405             {
 406               if (dump_enabled_p ())
 407                 {
 408                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 409                                    "not vectorized: vector stmt in loop:");
 410                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 411                 }
 412               return false;
 413             }
 414
 415           bool_result = false;
 416
 417           if (STMT_VINFO_VECTYPE (stmt_info))
 418             {
 419               /* The only case when a vectype had been already set is for stmts
 420                  that contain a dataref, or for "pattern-stmts" (stmts
 421                  generated by the vectorizer to represent/replace a certain
 422                  idiom).  */
 423               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 424                           || is_pattern_stmt_p (stmt_info)
 425                           || !gsi_end_p (pattern_def_si));
 426               vectype = STMT_VINFO_VECTYPE (stmt_info);
 427             }
 428           else
 429             {
 430               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 431               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 432                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 433               else
 434                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 435
 436               /* Bool ops don't participate in vectorization factor
 437                  computation.  For comparison use compared types to
 438                  compute a factor.  */
 439               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
 440                   && is_gimple_assign (stmt)
 441                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 442                 {
 443                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 444                       || STMT_VINFO_LIVE_P (stmt_info))
 445                     mask_producers.safe_push (stmt_info);
 446                   bool_result = true;
 447
 448                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 449                       == tcc_comparison
 450                       && !VECT_SCALAR_BOOLEAN_TYPE_P
 451                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 452                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 453                   else
 454                     {
 455                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 456                         {
 457                           pattern_def_seq = NULL;
 458                           gsi_next (&si);
 459                         }
 460                       continue;
 461                     }
 462                 }
 463
 464               if (dump_enabled_p ())
 465                 {
 466                   dump_printf_loc (MSG_NOTE, vect_location,
 467                                    "get vectype for scalar type:  ");
 468                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 469                   dump_printf (MSG_NOTE, "\n");
 470                 }
 471               vectype = get_vectype_for_scalar_type (scalar_type);
 472               if (!vectype)
 473                 {
 474                   if (dump_enabled_p ())
 475                     {
 476                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 477                                        "not vectorized: unsupported "
 478                                        "data-type ");
 479                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 480                                          scalar_type);
 481                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 482                     }
 483                   return false;
 484                 }
 485
 486               if (!bool_result)
 487                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 488
 489               if (dump_enabled_p ())
 490                 {
 491                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 492                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 493                   dump_printf (MSG_NOTE, "\n");
 494                 }
 495             }
 496
 497           /* Don't try to compute VF out scalar types if we stmt
 498              produces boolean vector.  Use result vectype instead.  */
 499           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 500             vf_vectype = vectype;
 501           else
 502             {
 503               /* The vectorization factor is according to the smallest
 504                  scalar type (or the largest vector size, but we only
 505                  support one vector size per loop).  */
 506               if (!bool_result)
 507                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 508                                                              &dummy);
 509               if (dump_enabled_p ())
 510                 {
 511                   dump_printf_loc (MSG_NOTE, vect_location,
 512                                    "get vectype for scalar type:  ");
 513                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 514                   dump_printf (MSG_NOTE, "\n");
 515                 }
 516               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 517             }
 518           if (!vf_vectype)
 519             {
 520               if (dump_enabled_p ())
 521                 {
 522                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 523                                    "not vectorized: unsupported data-type ");
 524                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 525                                      scalar_type);
 526                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 527                 }
 528               return false;
 529             }
 530
 531           if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
 532                         GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 533             {
 534               if (dump_enabled_p ())
 535                 {
 536                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 537                                    "not vectorized: different sized vector "
 538                                    "types in statement, ");
 539                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 540                                      vectype);
 541                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 542                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 543                                      vf_vectype);
 544                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 545                 }
 546               return false;
 547             }
 548
 549           if (dump_enabled_p ())
 550             {
 551               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 552               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 553               dump_printf (MSG_NOTE, "\n");
 554             }
 555
 556           if (dump_enabled_p ())
 557             {
 558               dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 559               dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
 560               dump_printf (MSG_NOTE, "\n");
 561             }
 562
 563           vect_update_max_nunits (&vectorization_factor, vf_vectype);
 564
 565           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 566             {
 567               pattern_def_seq = NULL;
 568               gsi_next (&si);
 569             }
 570         }
 571     }
 572
 573   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 574   if (dump_enabled_p ())
 575     {
 576       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 577       dump_dec (MSG_NOTE, vectorization_factor);
 578       dump_printf (MSG_NOTE, "\n");
 579     }
 580
 581   if (known_le (vectorization_factor, 1U))
 582     {
 583       if (dump_enabled_p ())
 584         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 585                          "not vectorized: unsupported data-type\n");
 586       return false;
 587     }
 588   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 589
 590   for (i = 0; i < mask_producers.length (); i++)
 591     {
 592       tree mask_type = NULL;
 593
 594       stmt = STMT_VINFO_STMT (mask_producers[i]);
 595
 596       if (is_gimple_assign (stmt)
 597           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 598           && !VECT_SCALAR_BOOLEAN_TYPE_P
 599                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 600         {
 601           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 602           mask_type = get_mask_type_for_scalar_type (scalar_type);
 603
 604           if (!mask_type)
 605             {
 606               if (dump_enabled_p ())
 607                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 608                                  "not vectorized: unsupported mask\n");
 609               return false;
 610             }
 611         }
 612       else
 613         {
 614           tree rhs;
 615           ssa_op_iter iter;
 616           gimple *def_stmt;
 617           enum vect_def_type dt;
 618
 619           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 620             {
 621               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 622                                        &def_stmt, &dt, &vectype))
 623                 {
 624                   if (dump_enabled_p ())
 625                     {
 626                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 627                                        "not vectorized: can't compute mask type "
 628                                        "for statement, ");
 629                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 630                                         0);
 631                     }
 632                   return false;
 633                 }
 634
 635               /* No vectype probably means external definition.
 636                  Allow it in case there is another operand which
 637                  allows to determine mask type.  */
 638               if (!vectype)
 639                 continue;
 640
 641               if (!mask_type)
 642                 mask_type = vectype;
 643               else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
 644                                  TYPE_VECTOR_SUBPARTS (vectype)))
 645                 {
 646                   if (dump_enabled_p ())
 647                     {
 648                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 649                                        "not vectorized: different sized masks "
 650                                        "types in statement, ");
 651                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 652                                          mask_type);
 653                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 654                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 655                                          vectype);
 656                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 657                     }
 658                   return false;
 659                 }
 660               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 661                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 662                 {
 663                   if (dump_enabled_p ())
 664                     {
 665                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 666                                        "not vectorized: mixed mask and "
 667                                        "nonmask vector types in statement, ");
 668                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 669                                          mask_type);
 670                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 671                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 672                                          vectype);
 673                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 674                     }
 675                   return false;
 676                 }
 677             }
 678
 679           /* We may compare boolean value loaded as vector of integers.
 680              Fix mask_type in such case.  */
 681           if (mask_type
 682               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 683               && gimple_code (stmt) == GIMPLE_ASSIGN
 684               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 685             mask_type = build_same_sized_truth_vector_type (mask_type);
 686         }
 687
 688       /* No mask_type should mean loop invariant predicate.
 689          This is probably a subject for optimization in
 690          if-conversion.  */
 691       if (!mask_type)
 692         {
 693           if (dump_enabled_p ())
 694             {
 695               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 696                                "not vectorized: can't compute mask type "
 697                                "for statement, ");
 698               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 699                                 0);
 700             }
 701           return false;
 702         }
 703
 704       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 705     }
 706
 707   return true;
 708 }
 709
 710
 711 /* Function vect_is_simple_iv_evolution.
 712
 713    FORNOW: A simple evolution of an induction variables in the loop is
 714    considered a polynomial evolution.  */
 715
 716 static bool
 717 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 718                              tree * step)
 719 {
 720   tree init_expr;
 721   tree step_expr;
 722   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 723   basic_block bb;
 724
 725   /* When there is no evolution in this loop, the evolution function
 726      is not "simple".  */
 727   if (evolution_part == NULL_TREE)
 728     return false;
 729
 730   /* When the evolution is a polynomial of degree >= 2
 731      the evolution function is not "simple".  */
 732   if (tree_is_chrec (evolution_part))
 733     return false;
 734
 735   step_expr = evolution_part;
 736   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 737
 738   if (dump_enabled_p ())
 739     {
 740       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 741       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 742       dump_printf (MSG_NOTE, ",  init: ");
 743       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 744       dump_printf (MSG_NOTE, "\n");
 745     }
 746
 747   *init = init_expr;
 748   *step = step_expr;
 749
 750   if (TREE_CODE (step_expr) != INTEGER_CST
 751       && (TREE_CODE (step_expr) != SSA_NAME
 752           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 753               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 754           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 755               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 756                   || !flag_associative_math)))
 757       && (TREE_CODE (step_expr) != REAL_CST
 758           || !flag_associative_math))
 759     {
 760       if (dump_enabled_p ())
 761         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 762                          "step unknown.\n");
 763       return false;
 764     }
 765
 766   return true;
 767 }
 768
 769 /* Function vect_analyze_scalar_cycles_1.
 770
 771    Examine the cross iteration def-use cycles of scalar variables
 772    in LOOP.  LOOP_VINFO represents the loop that is now being
 773    considered for vectorization (can be LOOP, or an outer-loop
 774    enclosing LOOP).  */
 775
 776 static void
 777 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 778 {
 779   basic_block bb = loop->header;
 780   tree init, step;
 781   auto_vec<gimple *, 64> worklist;
 782   gphi_iterator gsi;
 783   bool double_reduc;
 784
 785   if (dump_enabled_p ())
 786     dump_printf_loc (MSG_NOTE, vect_location,
 787                      "=== vect_analyze_scalar_cycles ===\n");
 788
 789   /* First - identify all inductions.  Reduction detection assumes that all the
 790      inductions have been identified, therefore, this order must not be
 791      changed.  */
 792   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 793     {
 794       gphi *phi = gsi.phi ();
 795       tree access_fn = NULL;
 796       tree def = PHI_RESULT (phi);
 797       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 798
 799       if (dump_enabled_p ())
 800         {
 801           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 802           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 803         }
 804
 805       /* Skip virtual phi's.  The data dependences that are associated with
 806          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 807       if (virtual_operand_p (def))
 808         continue;
 809
 810       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 811
 812       /* Analyze the evolution function.  */
 813       access_fn = analyze_scalar_evolution (loop, def);
 814       if (access_fn)
 815         {
 816           STRIP_NOPS (access_fn);
 817           if (dump_enabled_p ())
 818             {
 819               dump_printf_loc (MSG_NOTE, vect_location,
 820                                "Access function of PHI: ");
 821               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 822               dump_printf (MSG_NOTE, "\n");
 823             }
 824           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 825             = initial_condition_in_loop_num (access_fn, loop->num);
 826           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 827             = evolution_part_in_loop_num (access_fn, loop->num);
 828         }
 829
 830       if (!access_fn
 831           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 832           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 833               && TREE_CODE (step) != INTEGER_CST))
 834         {
 835           worklist.safe_push (phi);
 836           continue;
 837         }
 838
 839       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 840                   != NULL_TREE);
 841       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 842
 843       if (dump_enabled_p ())
 844         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 845       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 846     }
 847
 848
 849   /* Second - identify all reductions and nested cycles.  */
 850   while (worklist.length () > 0)
 851     {
 852       gimple *phi = worklist.pop ();
 853       tree def = PHI_RESULT (phi);
 854       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 855       gimple *reduc_stmt;
 856
 857       if (dump_enabled_p ())
 858         {
 859           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 860           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 861         }
 862
 863       gcc_assert (!virtual_operand_p (def)
 864                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 865
 866       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 867                                                 &double_reduc, false);
 868       if (reduc_stmt)
 869         {
 870           if (double_reduc)
 871             {
 872               if (dump_enabled_p ())
 873                 dump_printf_loc (MSG_NOTE, vect_location,
 874                                  "Detected double reduction.\n");
 875
 876               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 877               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 878                                                     vect_double_reduction_def;
 879             }
 880           else
 881             {
 882               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 883                 {
 884                   if (dump_enabled_p ())
 885                     dump_printf_loc (MSG_NOTE, vect_location,
 886                                      "Detected vectorizable nested cycle.\n");
 887
 888                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 889                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 890                                                              vect_nested_cycle;
 891                 }
 892               else
 893                 {
 894                   if (dump_enabled_p ())
 895                     dump_printf_loc (MSG_NOTE, vect_location,
 896                                      "Detected reduction.\n");
 897
 898                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 899                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 900                                                            vect_reduction_def;
 901                   /* Store the reduction cycles for possible vectorization in
 902                      loop-aware SLP if it was not detected as reduction
 903                      chain.  */
 904                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 905                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 906                 }
 907             }
 908         }
 909       else
 910         if (dump_enabled_p ())
 911           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 912                            "Unknown def-use cycle pattern.\n");
 913     }
 914 }
 915
 916
 917 /* Function vect_analyze_scalar_cycles.
 918
 919    Examine the cross iteration def-use cycles of scalar variables, by
 920    analyzing the loop-header PHIs of scalar variables.  Classify each
 921    cycle as one of the following: invariant, induction, reduction, unknown.
 922    We do that for the loop represented by LOOP_VINFO, and also to its
 923    inner-loop, if exists.
 924    Examples for scalar cycles:
 925
 926    Example1: reduction:
 927
 928               loop1:
 929               for (i=0; i<N; i++)
 930                  sum += a[i];
 931
 932    Example2: induction:
 933
 934               loop2:
 935               for (i=0; i<N; i++)
 936                  a[i] = i;  */
 937
 938 static void
 939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 940 {
 941   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 942
 943   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 944
 945   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 946      Reductions in such inner-loop therefore have different properties than
 947      the reductions in the nest that gets vectorized:
 948      1. When vectorized, they are executed in the same order as in the original
 949         scalar loop, so we can't change the order of computation when
 950         vectorizing them.
 951      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 952         current checks are too strict.  */
 953
 954   if (loop->inner)
 955     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 956 }
 957
 958 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 959
 960 static void
 961 vect_fixup_reduc_chain (gimple *stmt)
 962 {
 963   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 964   gimple *stmtp;
 965   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 966               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 967   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 968   do
 969     {
 970       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 971       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 972       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 973       if (stmt)
 974         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 975           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 976     }
 977   while (stmt);
 978   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 979 }
 980
 981 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 982
 983 static void
 984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 985 {
 986   gimple *first;
 987   unsigned i;
 988
 989   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 990     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 991       {
 992         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 993         while (next)
 994           {
 995             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 996               break;
 997             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 998           }
 999         /* If not all stmt in the chain are patterns try to handle
1000            the chain without patterns.  */
1001         if (! next)
1002           {
1003             vect_fixup_reduc_chain (first);
1004             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1005               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1006           }
1007       }
1008 }
1009
1010 /* Function vect_get_loop_niters.
1011
1012    Determine how many iterations the loop is executed and place it
1013    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1014    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1015    niter information holds in ASSUMPTIONS.
1016
1017    Return the loop exit condition.  */
1018
1019
1020 static gcond *
1021 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1022                       tree *number_of_iterations, tree *number_of_iterationsm1)
1023 {
1024   edge exit = single_exit (loop);
1025   struct tree_niter_desc niter_desc;
1026   tree niter_assumptions, niter, may_be_zero;
1027   gcond *cond = get_loop_exit_condition (loop);
1028
1029   *assumptions = boolean_true_node;
1030   *number_of_iterationsm1 = chrec_dont_know;
1031   *number_of_iterations = chrec_dont_know;
1032   if (dump_enabled_p ())
1033     dump_printf_loc (MSG_NOTE, vect_location,
1034                      "=== get_loop_niters ===\n");
1035
1036   if (!exit)
1037     return cond;
1038
1039   niter = chrec_dont_know;
1040   may_be_zero = NULL_TREE;
1041   niter_assumptions = boolean_true_node;
1042   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1043       || chrec_contains_undetermined (niter_desc.niter))
1044     return cond;
1045
1046   niter_assumptions = niter_desc.assumptions;
1047   may_be_zero = niter_desc.may_be_zero;
1048   niter = niter_desc.niter;
1049
1050   if (may_be_zero && integer_zerop (may_be_zero))
1051     may_be_zero = NULL_TREE;
1052
1053   if (may_be_zero)
1054     {
1055       if (COMPARISON_CLASS_P (may_be_zero))
1056         {
1057           /* Try to combine may_be_zero with assumptions, this can simplify
1058              computation of niter expression.  */
1059           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1060             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1061                                              niter_assumptions,
1062                                              fold_build1 (TRUTH_NOT_EXPR,
1063                                                           boolean_type_node,
1064                                                           may_be_zero));
1065           else
1066             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1067                                  build_int_cst (TREE_TYPE (niter), 0),
1068                                  rewrite_to_non_trapping_overflow (niter));
1069
1070           may_be_zero = NULL_TREE;
1071         }
1072       else if (integer_nonzerop (may_be_zero))
1073         {
1074           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1075           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1076           return cond;
1077         }
1078       else
1079         return cond;
1080     }
1081
1082   *assumptions = niter_assumptions;
1083   *number_of_iterationsm1 = niter;
1084
1085   /* We want the number of loop header executions which is the number
1086      of latch executions plus one.
1087      ???  For UINT_MAX latch executions this number overflows to zero
1088      for loops like do { n++; } while (n != 0);  */
1089   if (niter && !chrec_contains_undetermined (niter))
1090     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1091                           build_int_cst (TREE_TYPE (niter), 1));
1092   *number_of_iterations = niter;
1093
1094   return cond;
1095 }
1096
1097 /* Function bb_in_loop_p
1098
1099    Used as predicate for dfs order traversal of the loop bbs.  */
1100
1101 static bool
1102 bb_in_loop_p (const_basic_block bb, const void *data)
1103 {
1104   const struct loop *const loop = (const struct loop *)data;
1105   if (flow_bb_inside_loop_p (loop, bb))
1106     return true;
1107   return false;
1108 }
1109
1110
1111 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1112    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1113
1114 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1115   : vec_info (vec_info::loop, init_cost (loop_in)),
1116     loop (loop_in),
1117     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1118     num_itersm1 (NULL_TREE),
1119     num_iters (NULL_TREE),
1120     num_iters_unchanged (NULL_TREE),
1121     num_iters_assumptions (NULL_TREE),
1122     th (0),
1123     versioning_threshold (0),
1124     vectorization_factor (0),
1125     max_vectorization_factor (0),
1126     mask_skip_niters (NULL_TREE),
1127     mask_compare_type (NULL_TREE),
1128     unaligned_dr (NULL),
1129     peeling_for_alignment (0),
1130     ptr_mask (0),
1131     ivexpr_map (NULL),
1132     slp_unrolling_factor (1),
1133     single_scalar_iteration_cost (0),
1134     vectorizable (false),
1135     can_fully_mask_p (true),
1136     fully_masked_p (false),
1137     peeling_for_gaps (false),
1138     peeling_for_niter (false),
1139     operands_swapped (false),
1140     no_data_dependencies (false),
1141     has_mask_store (false),
1142     scalar_loop (NULL),
1143     orig_loop_info (NULL)
1144 {
1145   /* Create/Update stmt_info for all stmts in the loop.  */
1146   basic_block *body = get_loop_body (loop);
1147   for (unsigned int i = 0; i < loop->num_nodes; i++)
1148     {
1149       basic_block bb = body[i];
1150       gimple_stmt_iterator si;
1151
1152       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1153         {
1154           gimple *phi = gsi_stmt (si);
1155           gimple_set_uid (phi, 0);
1156           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1157         }
1158
1159       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1160         {
1161           gimple *stmt = gsi_stmt (si);
1162           gimple_set_uid (stmt, 0);
1163           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1164         }
1165     }
1166   free (body);
1167
1168   /* CHECKME: We want to visit all BBs before their successors (except for
1169      latch blocks, for which this assertion wouldn't hold).  In the simple
1170      case of the loop forms we allow, a dfs order of the BBs would the same
1171      as reversed postorder traversal, so we are safe.  */
1172
1173   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1174                                           bbs, loop->num_nodes, loop);
1175   gcc_assert (nbbs == loop->num_nodes);
1176 }
1177
1178 /* Free all levels of MASKS.  */
1179
1180 void
1181 release_vec_loop_masks (vec_loop_masks *masks)
1182 {
1183   rgroup_masks *rgm;
1184   unsigned int i;
1185   FOR_EACH_VEC_ELT (*masks, i, rgm)
1186     rgm->masks.release ();
1187   masks->release ();
1188 }
1189
1190 /* Free all memory used by the _loop_vec_info, as well as all the
1191    stmt_vec_info structs of all the stmts in the loop.  */
1192
1193 _loop_vec_info::~_loop_vec_info ()
1194 {
1195   int nbbs;
1196   gimple_stmt_iterator si;
1197   int j;
1198
1199   nbbs = loop->num_nodes;
1200   for (j = 0; j < nbbs; j++)
1201     {
1202       basic_block bb = bbs[j];
1203       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1204         free_stmt_vec_info (gsi_stmt (si));
1205
1206       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1207         {
1208           gimple *stmt = gsi_stmt (si);
1209
1210           /* We may have broken canonical form by moving a constant
1211              into RHS1 of a commutative op.  Fix such occurrences.  */
1212           if (operands_swapped && is_gimple_assign (stmt))
1213             {
1214               enum tree_code code = gimple_assign_rhs_code (stmt);
1215
1216               if ((code == PLUS_EXPR
1217                    || code == POINTER_PLUS_EXPR
1218                    || code == MULT_EXPR)
1219                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1220                 swap_ssa_operands (stmt,
1221                                    gimple_assign_rhs1_ptr (stmt),
1222                                    gimple_assign_rhs2_ptr (stmt));
1223               else if (code == COND_EXPR
1224                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1225                 {
1226                   tree cond_expr = gimple_assign_rhs1 (stmt);
1227                   enum tree_code cond_code = TREE_CODE (cond_expr);
1228
1229                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1230                     {
1231                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1232                                                                   0));
1233                       cond_code = invert_tree_comparison (cond_code,
1234                                                           honor_nans);
1235                       if (cond_code != ERROR_MARK)
1236                         {
1237                           TREE_SET_CODE (cond_expr, cond_code);
1238                           swap_ssa_operands (stmt,
1239                                              gimple_assign_rhs2_ptr (stmt),
1240                                              gimple_assign_rhs3_ptr (stmt));
1241                         }
1242                     }
1243                 }
1244             }
1245
1246           /* Free stmt_vec_info.  */
1247           free_stmt_vec_info (stmt);
1248           gsi_next (&si);
1249         }
1250     }
1251
1252   free (bbs);
1253
1254   release_vec_loop_masks (&masks);
1255   delete ivexpr_map;
1256
1257   loop->aux = NULL;
1258 }
1259
1260 /* Return an invariant or register for EXPR and emit necessary
1261    computations in the LOOP_VINFO loop preheader.  */
1262
1263 tree
1264 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1265 {
1266   if (is_gimple_reg (expr)
1267       || is_gimple_min_invariant (expr))
1268     return expr;
1269
1270   if (! loop_vinfo->ivexpr_map)
1271     loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1272   tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1273   if (! cached)
1274     {
1275       gimple_seq stmts = NULL;
1276       cached = force_gimple_operand (unshare_expr (expr),
1277                                      &stmts, true, NULL_TREE);
1278       if (stmts)
1279         {
1280           edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1281           gsi_insert_seq_on_edge_immediate (e, stmts);
1282         }
1283     }
1284   return cached;
1285 }
1286
1287 /* Return true if we can use CMP_TYPE as the comparison type to produce
1288    all masks required to mask LOOP_VINFO.  */
1289
1290 static bool
1291 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1292 {
1293   rgroup_masks *rgm;
1294   unsigned int i;
1295   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1296     if (rgm->mask_type != NULL_TREE
1297         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1298                                             cmp_type, rgm->mask_type,
1299                                             OPTIMIZE_FOR_SPEED))
1300       return false;
1301   return true;
1302 }
1303
1304 /* Calculate the maximum number of scalars per iteration for every
1305    rgroup in LOOP_VINFO.  */
1306
1307 static unsigned int
1308 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1309 {
1310   unsigned int res = 1;
1311   unsigned int i;
1312   rgroup_masks *rgm;
1313   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1314     res = MAX (res, rgm->max_nscalars_per_iter);
1315   return res;
1316 }
1317
1318 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1319    whether we can actually generate the masks required.  Return true if so,
1320    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
1321
1322 static bool
1323 vect_verify_full_masking (loop_vec_info loop_vinfo)
1324 {
1325   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1326   unsigned int min_ni_width;
1327
1328   /* Use a normal loop if there are no statements that need masking.
1329      This only happens in rare degenerate cases: it means that the loop
1330      has no loads, no stores, and no live-out values.  */
1331   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1332     return false;
1333
1334   /* Get the maximum number of iterations that is representable
1335      in the counter type.  */
1336   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1337   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1338
1339   /* Get a more refined estimate for the number of iterations.  */
1340   widest_int max_back_edges;
1341   if (max_loop_iterations (loop, &max_back_edges))
1342     max_ni = wi::smin (max_ni, max_back_edges + 1);
1343
1344   /* Account for rgroup masks, in which each bit is replicated N times.  */
1345   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1346
1347   /* Work out how many bits we need to represent the limit.  */
1348   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1349
1350   /* Find a scalar mode for which WHILE_ULT is supported.  */
1351   opt_scalar_int_mode cmp_mode_iter;
1352   tree cmp_type = NULL_TREE;
1353   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1354     {
1355       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1356       if (cmp_bits >= min_ni_width
1357           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1358         {
1359           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1360           if (this_type
1361               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1362             {
1363               /* Although we could stop as soon as we find a valid mode,
1364                  it's often better to continue until we hit Pmode, since the
1365                  operands to the WHILE are more likely to be reusable in
1366                  address calculations.  */
1367               cmp_type = this_type;
1368               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1369                 break;
1370             }
1371         }
1372     }
1373
1374   if (!cmp_type)
1375     return false;
1376
1377   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1378   return true;
1379 }
1380
1381 /* Calculate the cost of one scalar iteration of the loop.  */
1382 static void
1383 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1384 {
1385   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1386   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1387   int nbbs = loop->num_nodes, factor;
1388   int innerloop_iters, i;
1389
1390   /* Gather costs for statements in the scalar loop.  */
1391
1392   /* FORNOW.  */
1393   innerloop_iters = 1;
1394   if (loop->inner)
1395     innerloop_iters = 50; /* FIXME */
1396
1397   for (i = 0; i < nbbs; i++)
1398     {
1399       gimple_stmt_iterator si;
1400       basic_block bb = bbs[i];
1401
1402       if (bb->loop_father == loop->inner)
1403         factor = innerloop_iters;
1404       else
1405         factor = 1;
1406
1407       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1408         {
1409           gimple *stmt = gsi_stmt (si);
1410           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1411
1412           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1413             continue;
1414
1415           /* Skip stmts that are not vectorized inside the loop.  */
1416           if (stmt_info
1417               && !STMT_VINFO_RELEVANT_P (stmt_info)
1418               && (!STMT_VINFO_LIVE_P (stmt_info)
1419                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1420               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1421             continue;
1422
1423           vect_cost_for_stmt kind;
1424           if (STMT_VINFO_DATA_REF (stmt_info))
1425             {
1426               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1427                kind = scalar_load;
1428              else
1429                kind = scalar_store;
1430             }
1431           else
1432             kind = scalar_stmt;
1433
1434           record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1435                             factor, kind, stmt_info, 0, vect_prologue);
1436         }
1437     }
1438
1439   /* Now accumulate cost.  */
1440   void *target_cost_data = init_cost (loop);
1441   stmt_info_for_cost *si;
1442   int j;
1443   FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1444                     j, si)
1445     {
1446       struct _stmt_vec_info *stmt_info
1447         = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1448       (void) add_stmt_cost (target_cost_data, si->count,
1449                             si->kind, stmt_info, si->misalign,
1450                             vect_body);
1451     }
1452   unsigned dummy, body_cost = 0;
1453   finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1454   destroy_cost_data (target_cost_data);
1455   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1456 }
1457
1458
1459 /* Function vect_analyze_loop_form_1.
1460
1461    Verify that certain CFG restrictions hold, including:
1462    - the loop has a pre-header
1463    - the loop has a single entry and exit
1464    - the loop exit condition is simple enough
1465    - the number of iterations can be analyzed, i.e, a countable loop.  The
1466      niter could be analyzed under some assumptions.  */
1467
1468 bool
1469 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1470                           tree *assumptions, tree *number_of_iterationsm1,
1471                           tree *number_of_iterations, gcond **inner_loop_cond)
1472 {
1473   if (dump_enabled_p ())
1474     dump_printf_loc (MSG_NOTE, vect_location,
1475                      "=== vect_analyze_loop_form ===\n");
1476
1477   /* Different restrictions apply when we are considering an inner-most loop,
1478      vs. an outer (nested) loop.
1479      (FORNOW. May want to relax some of these restrictions in the future).  */
1480
1481   if (!loop->inner)
1482     {
1483       /* Inner-most loop.  We currently require that the number of BBs is
1484          exactly 2 (the header and latch).  Vectorizable inner-most loops
1485          look like this:
1486
1487                         (pre-header)
1488                            |
1489                           header <--------+
1490                            | |            |
1491                            | +--> latch --+
1492                            |
1493                         (exit-bb)  */
1494
1495       if (loop->num_nodes != 2)
1496         {
1497           if (dump_enabled_p ())
1498             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499                              "not vectorized: control flow in loop.\n");
1500           return false;
1501         }
1502
1503       if (empty_block_p (loop->header))
1504         {
1505           if (dump_enabled_p ())
1506             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507                              "not vectorized: empty loop.\n");
1508           return false;
1509         }
1510     }
1511   else
1512     {
1513       struct loop *innerloop = loop->inner;
1514       edge entryedge;
1515
1516       /* Nested loop. We currently require that the loop is doubly-nested,
1517          contains a single inner loop, and the number of BBs is exactly 5.
1518          Vectorizable outer-loops look like this:
1519
1520                         (pre-header)
1521                            |
1522                           header <---+
1523                            |         |
1524                           inner-loop |
1525                            |         |
1526                           tail ------+
1527                            |
1528                         (exit-bb)
1529
1530          The inner-loop has the properties expected of inner-most loops
1531          as described above.  */
1532
1533       if ((loop->inner)->inner || (loop->inner)->next)
1534         {
1535           if (dump_enabled_p ())
1536             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537                              "not vectorized: multiple nested loops.\n");
1538           return false;
1539         }
1540
1541       if (loop->num_nodes != 5)
1542         {
1543           if (dump_enabled_p ())
1544             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545                              "not vectorized: control flow in loop.\n");
1546           return false;
1547         }
1548
1549       entryedge = loop_preheader_edge (innerloop);
1550       if (entryedge->src != loop->header
1551           || !single_exit (innerloop)
1552           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1553         {
1554           if (dump_enabled_p ())
1555             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556                              "not vectorized: unsupported outerloop form.\n");
1557           return false;
1558         }
1559
1560       /* Analyze the inner-loop.  */
1561       tree inner_niterm1, inner_niter, inner_assumptions;
1562       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1563                                       &inner_assumptions, &inner_niterm1,
1564                                       &inner_niter, NULL)
1565           /* Don't support analyzing niter under assumptions for inner
1566              loop.  */
1567           || !integer_onep (inner_assumptions))
1568         {
1569           if (dump_enabled_p ())
1570             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1571                              "not vectorized: Bad inner loop.\n");
1572           return false;
1573         }
1574
1575       if (!expr_invariant_in_loop_p (loop, inner_niter))
1576         {
1577           if (dump_enabled_p ())
1578             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1579                              "not vectorized: inner-loop count not"
1580                              " invariant.\n");
1581           return false;
1582         }
1583
1584       if (dump_enabled_p ())
1585         dump_printf_loc (MSG_NOTE, vect_location,
1586                          "Considering outer-loop vectorization.\n");
1587     }
1588
1589   if (!single_exit (loop)
1590       || EDGE_COUNT (loop->header->preds) != 2)
1591     {
1592       if (dump_enabled_p ())
1593         {
1594           if (!single_exit (loop))
1595             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1596                              "not vectorized: multiple exits.\n");
1597           else if (EDGE_COUNT (loop->header->preds) != 2)
1598             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599                              "not vectorized: too many incoming edges.\n");
1600         }
1601       return false;
1602     }
1603
1604   /* We assume that the loop exit condition is at the end of the loop. i.e,
1605      that the loop is represented as a do-while (with a proper if-guard
1606      before the loop if needed), where the loop header contains all the
1607      executable statements, and the latch is empty.  */
1608   if (!empty_block_p (loop->latch)
1609       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1610     {
1611       if (dump_enabled_p ())
1612         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1613                          "not vectorized: latch block not empty.\n");
1614       return false;
1615     }
1616
1617   /* Make sure the exit is not abnormal.  */
1618   edge e = single_exit (loop);
1619   if (e->flags & EDGE_ABNORMAL)
1620     {
1621       if (dump_enabled_p ())
1622         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1623                          "not vectorized: abnormal loop exit edge.\n");
1624       return false;
1625     }
1626
1627   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1628                                      number_of_iterationsm1);
1629   if (!*loop_cond)
1630     {
1631       if (dump_enabled_p ())
1632         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1633                          "not vectorized: complicated exit condition.\n");
1634       return false;
1635     }
1636
1637   if (integer_zerop (*assumptions)
1638       || !*number_of_iterations
1639       || chrec_contains_undetermined (*number_of_iterations))
1640     {
1641       if (dump_enabled_p ())
1642         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643                          "not vectorized: number of iterations cannot be "
1644                          "computed.\n");
1645       return false;
1646     }
1647
1648   if (integer_zerop (*number_of_iterations))
1649     {
1650       if (dump_enabled_p ())
1651         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1652                          "not vectorized: number of iterations = 0.\n");
1653       return false;
1654     }
1655
1656   return true;
1657 }
1658
1659 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1660
1661 loop_vec_info
1662 vect_analyze_loop_form (struct loop *loop)
1663 {
1664   tree assumptions, number_of_iterations, number_of_iterationsm1;
1665   gcond *loop_cond, *inner_loop_cond = NULL;
1666
1667   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1668                                   &assumptions, &number_of_iterationsm1,
1669                                   &number_of_iterations, &inner_loop_cond))
1670     return NULL;
1671
1672   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1673   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1674   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1675   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1676   if (!integer_onep (assumptions))
1677     {
1678       /* We consider to vectorize this loop by versioning it under
1679          some assumptions.  In order to do this, we need to clear
1680          existing information computed by scev and niter analyzer.  */
1681       scev_reset_htab ();
1682       free_numbers_of_iterations_estimates (loop);
1683       /* Also set flag for this loop so that following scev and niter
1684          analysis are done under the assumptions.  */
1685       loop_constraint_set (loop, LOOP_C_FINITE);
1686       /* Also record the assumptions for versioning.  */
1687       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1688     }
1689
1690   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1691     {
1692       if (dump_enabled_p ())
1693         {
1694           dump_printf_loc (MSG_NOTE, vect_location,
1695                            "Symbolic number of iterations is ");
1696           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1697           dump_printf (MSG_NOTE, "\n");
1698         }
1699     }
1700
1701   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1702   if (inner_loop_cond)
1703     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1704       = loop_exit_ctrl_vec_info_type;
1705
1706   gcc_assert (!loop->aux);
1707   loop->aux = loop_vinfo;
1708   return loop_vinfo;
1709 }
1710
1711
1712
1713 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1714    statements update the vectorization factor.  */
1715
1716 static void
1717 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1718 {
1719   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1720   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1721   int nbbs = loop->num_nodes;
1722   poly_uint64 vectorization_factor;
1723   int i;
1724
1725   if (dump_enabled_p ())
1726     dump_printf_loc (MSG_NOTE, vect_location,
1727                      "=== vect_update_vf_for_slp ===\n");
1728
1729   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1730   gcc_assert (known_ne (vectorization_factor, 0U));
1731
1732   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1733      vectorization factor of the loop is the unrolling factor required by
1734      the SLP instances.  If that unrolling factor is 1, we say, that we
1735      perform pure SLP on loop - cross iteration parallelism is not
1736      exploited.  */
1737   bool only_slp_in_loop = true;
1738   for (i = 0; i < nbbs; i++)
1739     {
1740       basic_block bb = bbs[i];
1741       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1742            gsi_next (&si))
1743         {
1744           gimple *stmt = gsi_stmt (si);
1745           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1746           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1747               && STMT_VINFO_RELATED_STMT (stmt_info))
1748             {
1749               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1750               stmt_info = vinfo_for_stmt (stmt);
1751             }
1752           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1753                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1754               && !PURE_SLP_STMT (stmt_info))
1755             /* STMT needs both SLP and loop-based vectorization.  */
1756             only_slp_in_loop = false;
1757         }
1758     }
1759
1760   if (only_slp_in_loop)
1761     {
1762       dump_printf_loc (MSG_NOTE, vect_location,
1763                        "Loop contains only SLP stmts\n");
1764       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1765     }
1766   else
1767     {
1768       dump_printf_loc (MSG_NOTE, vect_location,
1769                        "Loop contains SLP and non-SLP stmts\n");
1770       /* Both the vectorization factor and unroll factor have the form
1771          current_vector_size * X for some rational X, so they must have
1772          a common multiple.  */
1773       vectorization_factor
1774         = force_common_multiple (vectorization_factor,
1775                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1776     }
1777
1778   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1779   if (dump_enabled_p ())
1780     {
1781       dump_printf_loc (MSG_NOTE, vect_location,
1782                        "Updating vectorization factor to ");
1783       dump_dec (MSG_NOTE, vectorization_factor);
1784       dump_printf (MSG_NOTE, ".\n");
1785     }
1786 }
1787
1788 /* Return true if STMT_INFO describes a double reduction phi and if
1789    the other phi in the reduction is also relevant for vectorization.
1790    This rejects cases such as:
1791
1792       outer1:
1793         x_1 = PHI <x_3(outer2), ...>;
1794         ...
1795
1796       inner:
1797         x_2 = ...;
1798         ...
1799
1800       outer2:
1801         x_3 = PHI <x_2(inner)>;
1802
1803    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1804
1805 static bool
1806 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1807 {
1808   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1809     return false;
1810
1811   gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1812   return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1813 }
1814
1815 /* Function vect_analyze_loop_operations.
1816
1817    Scan the loop stmts and make sure they are all vectorizable.  */
1818
1819 static bool
1820 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1821 {
1822   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1823   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1824   int nbbs = loop->num_nodes;
1825   int i;
1826   stmt_vec_info stmt_info;
1827   bool need_to_vectorize = false;
1828   bool ok;
1829
1830   if (dump_enabled_p ())
1831     dump_printf_loc (MSG_NOTE, vect_location,
1832                      "=== vect_analyze_loop_operations ===\n");
1833
1834   for (i = 0; i < nbbs; i++)
1835     {
1836       basic_block bb = bbs[i];
1837
1838       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1839            gsi_next (&si))
1840         {
1841           gphi *phi = si.phi ();
1842           ok = true;
1843
1844           stmt_info = vinfo_for_stmt (phi);
1845           if (dump_enabled_p ())
1846             {
1847               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1848               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1849             }
1850           if (virtual_operand_p (gimple_phi_result (phi)))
1851             continue;
1852
1853           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1854              (i.e., a phi in the tail of the outer-loop).  */
1855           if (! is_loop_header_bb_p (bb))
1856             {
1857               /* FORNOW: we currently don't support the case that these phis
1858                  are not used in the outerloop (unless it is double reduction,
1859                  i.e., this phi is vect_reduction_def), cause this case
1860                  requires to actually do something here.  */
1861               if (STMT_VINFO_LIVE_P (stmt_info)
1862                   && !vect_active_double_reduction_p (stmt_info))
1863                 {
1864                   if (dump_enabled_p ())
1865                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1866                                      "Unsupported loop-closed phi in "
1867                                      "outer-loop.\n");
1868                   return false;
1869                 }
1870
1871               /* If PHI is used in the outer loop, we check that its operand
1872                  is defined in the inner loop.  */
1873               if (STMT_VINFO_RELEVANT_P (stmt_info))
1874                 {
1875                   tree phi_op;
1876                   gimple *op_def_stmt;
1877
1878                   if (gimple_phi_num_args (phi) != 1)
1879                     return false;
1880
1881                   phi_op = PHI_ARG_DEF (phi, 0);
1882                   if (TREE_CODE (phi_op) != SSA_NAME)
1883                     return false;
1884
1885                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1886                   if (gimple_nop_p (op_def_stmt)
1887                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1888                       || !vinfo_for_stmt (op_def_stmt))
1889                     return false;
1890
1891                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1892                         != vect_used_in_outer
1893                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1894                            != vect_used_in_outer_by_reduction)
1895                     return false;
1896                 }
1897
1898               continue;
1899             }
1900
1901           gcc_assert (stmt_info);
1902
1903           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1904                || STMT_VINFO_LIVE_P (stmt_info))
1905               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1906             {
1907               /* A scalar-dependence cycle that we don't support.  */
1908               if (dump_enabled_p ())
1909                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1910                                  "not vectorized: scalar dependence cycle.\n");
1911               return false;
1912             }
1913
1914           if (STMT_VINFO_RELEVANT_P (stmt_info))
1915             {
1916               need_to_vectorize = true;
1917               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1918                   && ! PURE_SLP_STMT (stmt_info))
1919                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1920               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1921                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1922                        && ! PURE_SLP_STMT (stmt_info))
1923                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1924             }
1925
1926           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1927           if (ok
1928               && STMT_VINFO_LIVE_P (stmt_info)
1929               && !PURE_SLP_STMT (stmt_info))
1930             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1931
1932           if (!ok)
1933             {
1934               if (dump_enabled_p ())
1935                 {
1936                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1937                                    "not vectorized: relevant phi not "
1938                                    "supported: ");
1939                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1940                 }
1941               return false;
1942             }
1943         }
1944
1945       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1946            gsi_next (&si))
1947         {
1948           gimple *stmt = gsi_stmt (si);
1949           if (!gimple_clobber_p (stmt)
1950               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1951             return false;
1952         }
1953     } /* bbs */
1954
1955   /* All operations in the loop are either irrelevant (deal with loop
1956      control, or dead), or only used outside the loop and can be moved
1957      out of the loop (e.g. invariants, inductions).  The loop can be
1958      optimized away by scalar optimizations.  We're better off not
1959      touching this loop.  */
1960   if (!need_to_vectorize)
1961     {
1962       if (dump_enabled_p ())
1963         dump_printf_loc (MSG_NOTE, vect_location,
1964                          "All the computation can be taken out of the loop.\n");
1965       if (dump_enabled_p ())
1966         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1967                          "not vectorized: redundant loop. no profit to "
1968                          "vectorize.\n");
1969       return false;
1970     }
1971
1972   return true;
1973 }
1974
1975 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1976    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1977    definitely no, or -1 if it's worth retrying.  */
1978
1979 static int
1980 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1981 {
1982   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1983   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1984
1985   /* Only fully-masked loops can have iteration counts less than the
1986      vectorization factor.  */
1987   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1988     {
1989       HOST_WIDE_INT max_niter;
1990
1991       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1992         max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1993       else
1994         max_niter = max_stmt_executions_int (loop);
1995
1996       if (max_niter != -1
1997           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1998         {
1999           if (dump_enabled_p ())
2000             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2001                              "not vectorized: iteration count smaller than "
2002                              "vectorization factor.\n");
2003           return 0;
2004         }
2005     }
2006
2007   int min_profitable_iters, min_profitable_estimate;
2008   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2009                                       &min_profitable_estimate);
2010
2011   if (min_profitable_iters < 0)
2012     {
2013       if (dump_enabled_p ())
2014         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2015                          "not vectorized: vectorization not profitable.\n");
2016       if (dump_enabled_p ())
2017         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018                          "not vectorized: vector version will never be "
2019                          "profitable.\n");
2020       return -1;
2021     }
2022
2023   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2024                                * assumed_vf);
2025
2026   /* Use the cost model only if it is more conservative than user specified
2027      threshold.  */
2028   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2029                                     min_profitable_iters);
2030
2031   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2032
2033   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2034       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2035     {
2036       if (dump_enabled_p ())
2037         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2038                          "not vectorized: vectorization not profitable.\n");
2039       if (dump_enabled_p ())
2040         dump_printf_loc (MSG_NOTE, vect_location,
2041                          "not vectorized: iteration count smaller than user "
2042                          "specified loop bound parameter or minimum profitable "
2043                          "iterations (whichever is more conservative).\n");
2044       return 0;
2045     }
2046
2047   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
2048   if (estimated_niter == -1)
2049     estimated_niter = likely_max_stmt_executions_int (loop);
2050   if (estimated_niter != -1
2051       && ((unsigned HOST_WIDE_INT) estimated_niter
2052           < MAX (th, (unsigned) min_profitable_estimate)))
2053     {
2054       if (dump_enabled_p ())
2055         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2056                          "not vectorized: estimated iteration count too "
2057                          "small.\n");
2058       if (dump_enabled_p ())
2059         dump_printf_loc (MSG_NOTE, vect_location,
2060                          "not vectorized: estimated iteration count smaller "
2061                          "than specified loop bound parameter or minimum "
2062                          "profitable iterations (whichever is more "
2063                          "conservative).\n");
2064       return -1;
2065     }
2066
2067   return 1;
2068 }
2069
2070
2071 /* Function vect_analyze_loop_2.
2072
2073    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2074    for it.  The different analyses will record information in the
2075    loop_vec_info struct.  */
2076 static bool
2077 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2078 {
2079   bool ok;
2080   int res;
2081   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2082   poly_uint64 min_vf = 2;
2083   unsigned int n_stmts = 0;
2084
2085   /* The first group of checks is independent of the vector size.  */
2086   fatal = true;
2087
2088   /* Find all data references in the loop (which correspond to vdefs/vuses)
2089      and analyze their evolution in the loop.  */
2090
2091   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2092
2093   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2094   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2095     {
2096       if (dump_enabled_p ())
2097         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098                          "not vectorized: loop nest containing two "
2099                          "or more consecutive inner loops cannot be "
2100                          "vectorized\n");
2101       return false;
2102     }
2103
2104   for (unsigned i = 0; i < loop->num_nodes; i++)
2105     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2106          !gsi_end_p (gsi); gsi_next (&gsi))
2107       {
2108         gimple *stmt = gsi_stmt (gsi);
2109         if (is_gimple_debug (stmt))
2110           continue;
2111         ++n_stmts;
2112         if (!find_data_references_in_stmt (loop, stmt,
2113                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
2114           {
2115             if (is_gimple_call (stmt) && loop->safelen)
2116               {
2117                 tree fndecl = gimple_call_fndecl (stmt), op;
2118                 if (fndecl != NULL_TREE)
2119                   {
2120                     cgraph_node *node = cgraph_node::get (fndecl);
2121                     if (node != NULL && node->simd_clones != NULL)
2122                       {
2123                         unsigned int j, n = gimple_call_num_args (stmt);
2124                         for (j = 0; j < n; j++)
2125                           {
2126                             op = gimple_call_arg (stmt, j);
2127                             if (DECL_P (op)
2128                                 || (REFERENCE_CLASS_P (op)
2129                                     && get_base_address (op)))
2130                               break;
2131                           }
2132                         op = gimple_call_lhs (stmt);
2133                         /* Ignore #pragma omp declare simd functions
2134                            if they don't have data references in the
2135                            call stmt itself.  */
2136                         if (j == n
2137                             && !(op
2138                                  && (DECL_P (op)
2139                                      || (REFERENCE_CLASS_P (op)
2140                                          && get_base_address (op)))))
2141                           continue;
2142                       }
2143                   }
2144               }
2145             if (dump_enabled_p ())
2146               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147                                "not vectorized: loop contains function "
2148                                "calls or data references that cannot "
2149                                "be analyzed\n");
2150             return false;
2151           }
2152       }
2153
2154   /* Analyze the data references and also adjust the minimal
2155      vectorization factor according to the loads and stores.  */
2156
2157   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2158   if (!ok)
2159     {
2160       if (dump_enabled_p ())
2161         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162                          "bad data references.\n");
2163       return false;
2164     }
2165
2166   /* Classify all cross-iteration scalar data-flow cycles.
2167      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2168   vect_analyze_scalar_cycles (loop_vinfo);
2169
2170   vect_pattern_recog (loop_vinfo);
2171
2172   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2173
2174   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2175      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2176
2177   ok = vect_analyze_data_ref_accesses (loop_vinfo);
2178   if (!ok)
2179     {
2180       if (dump_enabled_p ())
2181         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182                          "bad data access.\n");
2183       return false;
2184     }
2185
2186   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2187
2188   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2189   if (!ok)
2190     {
2191       if (dump_enabled_p ())
2192         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193                          "unexpected pattern.\n");
2194       return false;
2195     }
2196
2197   /* While the rest of the analysis below depends on it in some way.  */
2198   fatal = false;
2199
2200   /* Analyze data dependences between the data-refs in the loop
2201      and adjust the maximum vectorization factor according to
2202      the dependences.
2203      FORNOW: fail at the first data dependence that we encounter.  */
2204
2205   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2206   if (!ok
2207       || (max_vf != MAX_VECTORIZATION_FACTOR
2208           && maybe_lt (max_vf, min_vf)))
2209     {
2210       if (dump_enabled_p ())
2211             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2212                              "bad data dependence.\n");
2213       return false;
2214     }
2215   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2216
2217   ok = vect_determine_vectorization_factor (loop_vinfo);
2218   if (!ok)
2219     {
2220       if (dump_enabled_p ())
2221         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2222                          "can't determine vectorization factor.\n");
2223       return false;
2224     }
2225   if (max_vf != MAX_VECTORIZATION_FACTOR
2226       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2227     {
2228       if (dump_enabled_p ())
2229         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230                          "bad data dependence.\n");
2231       return false;
2232     }
2233
2234   /* Compute the scalar iteration cost.  */
2235   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2236
2237   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2238   unsigned th;
2239
2240   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2241   ok = vect_analyze_slp (loop_vinfo, n_stmts);
2242   if (!ok)
2243     return false;
2244
2245   /* If there are any SLP instances mark them as pure_slp.  */
2246   bool slp = vect_make_slp_decision (loop_vinfo);
2247   if (slp)
2248     {
2249       /* Find stmts that need to be both vectorized and SLPed.  */
2250       vect_detect_hybrid_slp (loop_vinfo);
2251
2252       /* Update the vectorization factor based on the SLP decision.  */
2253       vect_update_vf_for_slp (loop_vinfo);
2254     }
2255
2256   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2257
2258   /* We don't expect to have to roll back to anything other than an empty
2259      set of rgroups.  */
2260   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2261
2262   /* This is the point where we can re-start analysis with SLP forced off.  */
2263 start_over:
2264
2265   /* Now the vectorization factor is final.  */
2266   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2267   gcc_assert (known_ne (vectorization_factor, 0U));
2268
2269   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2270     {
2271       dump_printf_loc (MSG_NOTE, vect_location,
2272                        "vectorization_factor = ");
2273       dump_dec (MSG_NOTE, vectorization_factor);
2274       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2275                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2276     }
2277
2278   HOST_WIDE_INT max_niter
2279     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2280
2281   /* Analyze the alignment of the data-refs in the loop.
2282      Fail if a data reference is found that cannot be vectorized.  */
2283
2284   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2285   if (!ok)
2286     {
2287       if (dump_enabled_p ())
2288         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289                          "bad data alignment.\n");
2290       return false;
2291     }
2292
2293   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2294      It is important to call pruning after vect_analyze_data_ref_accesses,
2295      since we use grouping information gathered by interleaving analysis.  */
2296   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2297   if (!ok)
2298     return false;
2299
2300   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2301      vectorization.  */
2302   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2303     {
2304     /* This pass will decide on using loop versioning and/or loop peeling in
2305        order to enhance the alignment of data references in the loop.  */
2306     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2307     if (!ok)
2308       {
2309         if (dump_enabled_p ())
2310           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311                            "bad data alignment.\n");
2312         return false;
2313       }
2314     }
2315
2316   if (slp)
2317     {
2318       /* Analyze operations in the SLP instances.  Note this may
2319          remove unsupported SLP instances which makes the above
2320          SLP kind detection invalid.  */
2321       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2322       vect_slp_analyze_operations (loop_vinfo);
2323       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2324         goto again;
2325     }
2326
2327   /* Scan all the remaining operations in the loop that are not subject
2328      to SLP and make sure they are vectorizable.  */
2329   ok = vect_analyze_loop_operations (loop_vinfo);
2330   if (!ok)
2331     {
2332       if (dump_enabled_p ())
2333         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334                          "bad operation or unsupported loop bound.\n");
2335       return false;
2336     }
2337
2338   /* Decide whether to use a fully-masked loop for this vectorization
2339      factor.  */
2340   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2341     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2342        && vect_verify_full_masking (loop_vinfo));
2343   if (dump_enabled_p ())
2344     {
2345       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2346         dump_printf_loc (MSG_NOTE, vect_location,
2347                          "using a fully-masked loop.\n");
2348       else
2349         dump_printf_loc (MSG_NOTE, vect_location,
2350                          "not using a fully-masked loop.\n");
2351     }
2352
2353   /* If epilog loop is required because of data accesses with gaps,
2354      one additional iteration needs to be peeled.  Check if there is
2355      enough iterations for vectorization.  */
2356   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2357       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2358       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2359     {
2360       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2361       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2362
2363       if (known_lt (wi::to_widest (scalar_niters), vf))
2364         {
2365           if (dump_enabled_p ())
2366             dump_printf_loc (MSG_NOTE, vect_location,
2367                              "loop has no enough iterations to support"
2368                              " peeling for gaps.\n");
2369           return false;
2370         }
2371     }
2372
2373   /* Check the costings of the loop make vectorizing worthwhile.  */
2374   res = vect_analyze_loop_costing (loop_vinfo);
2375   if (res < 0)
2376     goto again;
2377   if (!res)
2378     {
2379       if (dump_enabled_p ())
2380         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381                          "Loop costings not worthwhile.\n");
2382       return false;
2383     }
2384
2385   /* Decide whether we need to create an epilogue loop to handle
2386      remaining scalar iterations.  */
2387   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2388
2389   unsigned HOST_WIDE_INT const_vf;
2390   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2391     /* The main loop handles all iterations.  */
2392     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2393   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2394            && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2395     {
2396       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2397                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2398                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2399         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2400     }
2401   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2402            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2403            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2404                 < (unsigned) exact_log2 (const_vf))
2405                /* In case of versioning, check if the maximum number of
2406                   iterations is greater than th.  If they are identical,
2407                   the epilogue is unnecessary.  */
2408                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2409                    || ((unsigned HOST_WIDE_INT) max_niter
2410                        > (th / const_vf) * const_vf))))
2411     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2412
2413   /* If an epilogue loop is required make sure we can create one.  */
2414   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2415       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2416     {
2417       if (dump_enabled_p ())
2418         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2419       if (!vect_can_advance_ivs_p (loop_vinfo)
2420           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2421                                            single_exit (LOOP_VINFO_LOOP
2422                                                          (loop_vinfo))))
2423         {
2424           if (dump_enabled_p ())
2425             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2426                              "not vectorized: can't create required "
2427                              "epilog loop\n");
2428           goto again;
2429         }
2430     }
2431
2432   /* During peeling, we need to check if number of loop iterations is
2433      enough for both peeled prolog loop and vector loop.  This check
2434      can be merged along with threshold check of loop versioning, so
2435      increase threshold for this case if necessary.  */
2436   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2437     {
2438       poly_uint64 niters_th = 0;
2439
2440       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2441         {
2442           /* Niters for peeled prolog loop.  */
2443           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2444             {
2445               struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2446               tree vectype
2447                 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2448               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2449             }
2450           else
2451             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2452         }
2453
2454       /* Niters for at least one iteration of vectorized loop.  */
2455       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2456         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2457       /* One additional iteration because of peeling for gap.  */
2458       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2459         niters_th += 1;
2460       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2461     }
2462
2463   gcc_assert (known_eq (vectorization_factor,
2464                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2465
2466   /* Ok to vectorize!  */
2467   return true;
2468
2469 again:
2470   /* Try again with SLP forced off but if we didn't do any SLP there is
2471      no point in re-trying.  */
2472   if (!slp)
2473     return false;
2474
2475   /* If there are reduction chains re-trying will fail anyway.  */
2476   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2477     return false;
2478
2479   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2480      via interleaving or lane instructions.  */
2481   slp_instance instance;
2482   slp_tree node;
2483   unsigned i, j;
2484   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2485     {
2486       stmt_vec_info vinfo;
2487       vinfo = vinfo_for_stmt
2488           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2489       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2490         continue;
2491       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2492       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2493       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2494       if (! vect_store_lanes_supported (vectype, size, false)
2495           && ! vect_grouped_store_supported (vectype, size))
2496         return false;
2497       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2498         {
2499           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2500           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2501           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2502           size = STMT_VINFO_GROUP_SIZE (vinfo);
2503           vectype = STMT_VINFO_VECTYPE (vinfo);
2504           if (! vect_load_lanes_supported (vectype, size, false)
2505               && ! vect_grouped_load_supported (vectype, single_element_p,
2506                                                 size))
2507             return false;
2508         }
2509     }
2510
2511   if (dump_enabled_p ())
2512     dump_printf_loc (MSG_NOTE, vect_location,
2513                      "re-trying with SLP disabled\n");
2514
2515   /* Roll back state appropriately.  No SLP this time.  */
2516   slp = false;
2517   /* Restore vectorization factor as it were without SLP.  */
2518   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2519   /* Free the SLP instances.  */
2520   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2521     vect_free_slp_instance (instance);
2522   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2523   /* Reset SLP type to loop_vect on all stmts.  */
2524   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2525     {
2526       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2527       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2528            !gsi_end_p (si); gsi_next (&si))
2529         {
2530           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2531           STMT_SLP_TYPE (stmt_info) = loop_vect;
2532         }
2533       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2534            !gsi_end_p (si); gsi_next (&si))
2535         {
2536           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2537           STMT_SLP_TYPE (stmt_info) = loop_vect;
2538           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2539             {
2540               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2541               STMT_SLP_TYPE (stmt_info) = loop_vect;
2542               for (gimple_stmt_iterator pi
2543                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2544                    !gsi_end_p (pi); gsi_next (&pi))
2545                 {
2546                   gimple *pstmt = gsi_stmt (pi);
2547                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2548                 }
2549             }
2550         }
2551     }
2552   /* Free optimized alias test DDRS.  */
2553   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2554   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2555   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2556   /* Reset target cost data.  */
2557   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2558   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2559     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2560   /* Reset accumulated rgroup information.  */
2561   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2562   /* Reset assorted flags.  */
2563   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2564   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2565   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2566   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2567   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2568
2569   goto start_over;
2570 }
2571
2572 /* Function vect_analyze_loop.
2573
2574    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2575    for it.  The different analyses will record information in the
2576    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2577    be vectorized.  */
2578 loop_vec_info
2579 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2580 {
2581   loop_vec_info loop_vinfo;
2582   auto_vector_sizes vector_sizes;
2583
2584   /* Autodetect first vector size we try.  */
2585   current_vector_size = 0;
2586   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2587   unsigned int next_size = 0;
2588
2589   if (dump_enabled_p ())
2590     dump_printf_loc (MSG_NOTE, vect_location,
2591                      "===== analyze_loop_nest =====\n");
2592
2593   if (loop_outer (loop)
2594       && loop_vec_info_for_loop (loop_outer (loop))
2595       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2596     {
2597       if (dump_enabled_p ())
2598         dump_printf_loc (MSG_NOTE, vect_location,
2599                          "outer-loop already vectorized.\n");
2600       return NULL;
2601     }
2602
2603   poly_uint64 autodetected_vector_size = 0;
2604   while (1)
2605     {
2606       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2607       loop_vinfo = vect_analyze_loop_form (loop);
2608       if (!loop_vinfo)
2609         {
2610           if (dump_enabled_p ())
2611             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2612                              "bad loop form.\n");
2613           return NULL;
2614         }
2615
2616       bool fatal = false;
2617
2618       if (orig_loop_vinfo)
2619         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2620
2621       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2622         {
2623           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2624
2625           return loop_vinfo;
2626         }
2627
2628       delete loop_vinfo;
2629
2630       if (next_size == 0)
2631         autodetected_vector_size = current_vector_size;
2632
2633       if (next_size < vector_sizes.length ()
2634           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2635         next_size += 1;
2636
2637       if (fatal
2638           || next_size == vector_sizes.length ()
2639           || known_eq (current_vector_size, 0U))
2640         return NULL;
2641
2642       /* Try the next biggest vector size.  */
2643       current_vector_size = vector_sizes[next_size++];
2644       if (dump_enabled_p ())
2645         {
2646           dump_printf_loc (MSG_NOTE, vect_location,
2647                            "***** Re-trying analysis with "
2648                            "vector size ");
2649           dump_dec (MSG_NOTE, current_vector_size);
2650           dump_printf (MSG_NOTE, "\n");
2651         }
2652     }
2653 }
2654
2655 /* Return true if there is an in-order reduction function for CODE, storing
2656    it in *REDUC_FN if so.  */
2657
2658 static bool
2659 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2660 {
2661   switch (code)
2662     {
2663     case PLUS_EXPR:
2664       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2665       return true;
2666
2667     default:
2668       return false;
2669     }
2670 }
2671
2672 /* Function reduction_fn_for_scalar_code
2673
2674    Input:
2675    CODE - tree_code of a reduction operations.
2676
2677    Output:
2678    REDUC_FN - the corresponding internal function to be used to reduce the
2679       vector of partial results into a single scalar result, or IFN_LAST
2680       if the operation is a supported reduction operation, but does not have
2681       such an internal function.
2682
2683    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2684
2685 static bool
2686 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2687 {
2688   switch (code)
2689     {
2690       case MAX_EXPR:
2691         *reduc_fn = IFN_REDUC_MAX;
2692         return true;
2693
2694       case MIN_EXPR:
2695         *reduc_fn = IFN_REDUC_MIN;
2696         return true;
2697
2698       case PLUS_EXPR:
2699         *reduc_fn = IFN_REDUC_PLUS;
2700         return true;
2701
2702       case BIT_AND_EXPR:
2703         *reduc_fn = IFN_REDUC_AND;
2704         return true;
2705
2706       case BIT_IOR_EXPR:
2707         *reduc_fn = IFN_REDUC_IOR;
2708         return true;
2709
2710       case BIT_XOR_EXPR:
2711         *reduc_fn = IFN_REDUC_XOR;
2712         return true;
2713
2714       case MULT_EXPR:
2715       case MINUS_EXPR:
2716         *reduc_fn = IFN_LAST;
2717         return true;
2718
2719       default:
2720        return false;
2721     }
2722 }
2723
2724 /* If there is a neutral value X such that SLP reduction NODE would not
2725    be affected by the introduction of additional X elements, return that X,
2726    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2727    is true if the SLP statements perform a single reduction, false if each
2728    statement performs an independent reduction.  */
2729
2730 static tree
2731 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2732                               bool reduc_chain)
2733 {
2734   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2735   gimple *stmt = stmts[0];
2736   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2737   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2738   tree scalar_type = TREE_TYPE (vector_type);
2739   struct loop *loop = gimple_bb (stmt)->loop_father;
2740   gcc_assert (loop);
2741
2742   switch (code)
2743     {
2744     case WIDEN_SUM_EXPR:
2745     case DOT_PROD_EXPR:
2746     case SAD_EXPR:
2747     case PLUS_EXPR:
2748     case MINUS_EXPR:
2749     case BIT_IOR_EXPR:
2750     case BIT_XOR_EXPR:
2751       return build_zero_cst (scalar_type);
2752
2753     case MULT_EXPR:
2754       return build_one_cst (scalar_type);
2755
2756     case BIT_AND_EXPR:
2757       return build_all_ones_cst (scalar_type);
2758
2759     case MAX_EXPR:
2760     case MIN_EXPR:
2761       /* For MIN/MAX the initial values are neutral.  A reduction chain
2762          has only a single initial value, so that value is neutral for
2763          all statements.  */
2764       if (reduc_chain)
2765         return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2766       return NULL_TREE;
2767
2768     default:
2769       return NULL_TREE;
2770     }
2771 }
2772
2773 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2774    STMT is printed with a message MSG. */
2775
2776 static void
2777 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2778 {
2779   dump_printf_loc (msg_type, vect_location, "%s", msg);
2780   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2781 }
2782
2783
2784 /* Detect SLP reduction of the form:
2785
2786    #a1 = phi <a5, a0>
2787    a2 = operation (a1)
2788    a3 = operation (a2)
2789    a4 = operation (a3)
2790    a5 = operation (a4)
2791
2792    #a = phi <a5>
2793
2794    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2795    FIRST_STMT is the first reduction stmt in the chain
2796    (a2 = operation (a1)).
2797
2798    Return TRUE if a reduction chain was detected.  */
2799
2800 static bool
2801 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2802                        gimple *first_stmt)
2803 {
2804   struct loop *loop = (gimple_bb (phi))->loop_father;
2805   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2806   enum tree_code code;
2807   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2808   stmt_vec_info use_stmt_info, current_stmt_info;
2809   tree lhs;
2810   imm_use_iterator imm_iter;
2811   use_operand_p use_p;
2812   int nloop_uses, size = 0, n_out_of_loop_uses;
2813   bool found = false;
2814
2815   if (loop != vect_loop)
2816     return false;
2817
2818   lhs = PHI_RESULT (phi);
2819   code = gimple_assign_rhs_code (first_stmt);
2820   while (1)
2821     {
2822       nloop_uses = 0;
2823       n_out_of_loop_uses = 0;
2824       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2825         {
2826           gimple *use_stmt = USE_STMT (use_p);
2827           if (is_gimple_debug (use_stmt))
2828             continue;
2829
2830           /* Check if we got back to the reduction phi.  */
2831           if (use_stmt == phi)
2832             {
2833               loop_use_stmt = use_stmt;
2834               found = true;
2835               break;
2836             }
2837
2838           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2839             {
2840               loop_use_stmt = use_stmt;
2841               nloop_uses++;
2842             }
2843            else
2844              n_out_of_loop_uses++;
2845
2846            /* There are can be either a single use in the loop or two uses in
2847               phi nodes.  */
2848            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2849              return false;
2850         }
2851
2852       if (found)
2853         break;
2854
2855       /* We reached a statement with no loop uses.  */
2856       if (nloop_uses == 0)
2857         return false;
2858
2859       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2860       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2861         return false;
2862
2863       if (!is_gimple_assign (loop_use_stmt)
2864           || code != gimple_assign_rhs_code (loop_use_stmt)
2865           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2866         return false;
2867
2868       /* Insert USE_STMT into reduction chain.  */
2869       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2870       if (current_stmt)
2871         {
2872           current_stmt_info = vinfo_for_stmt (current_stmt);
2873           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2874           GROUP_FIRST_ELEMENT (use_stmt_info)
2875             = GROUP_FIRST_ELEMENT (current_stmt_info);
2876         }
2877       else
2878         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2879
2880       lhs = gimple_assign_lhs (loop_use_stmt);
2881       current_stmt = loop_use_stmt;
2882       size++;
2883    }
2884
2885   if (!found || loop_use_stmt != phi || size < 2)
2886     return false;
2887
2888   /* Swap the operands, if needed, to make the reduction operand be the second
2889      operand.  */
2890   lhs = PHI_RESULT (phi);
2891   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2892   while (next_stmt)
2893     {
2894       if (gimple_assign_rhs2 (next_stmt) == lhs)
2895         {
2896           tree op = gimple_assign_rhs1 (next_stmt);
2897           gimple *def_stmt = NULL;
2898
2899           if (TREE_CODE (op) == SSA_NAME)
2900             def_stmt = SSA_NAME_DEF_STMT (op);
2901
2902           /* Check that the other def is either defined in the loop
2903              ("vect_internal_def"), or it's an induction (defined by a
2904              loop-header phi-node).  */
2905           if (def_stmt
2906               && gimple_bb (def_stmt)
2907               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2908               && (is_gimple_assign (def_stmt)
2909                   || is_gimple_call (def_stmt)
2910                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2911                            == vect_induction_def
2912                   || (gimple_code (def_stmt) == GIMPLE_PHI
2913                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2914                                   == vect_internal_def
2915                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2916             {
2917               lhs = gimple_assign_lhs (next_stmt);
2918               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2919               continue;
2920             }
2921
2922           return false;
2923         }
2924       else
2925         {
2926           tree op = gimple_assign_rhs2 (next_stmt);
2927           gimple *def_stmt = NULL;
2928
2929           if (TREE_CODE (op) == SSA_NAME)
2930             def_stmt = SSA_NAME_DEF_STMT (op);
2931
2932           /* Check that the other def is either defined in the loop
2933             ("vect_internal_def"), or it's an induction (defined by a
2934             loop-header phi-node).  */
2935           if (def_stmt
2936               && gimple_bb (def_stmt)
2937               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2938               && (is_gimple_assign (def_stmt)
2939                   || is_gimple_call (def_stmt)
2940                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2941                               == vect_induction_def
2942                   || (gimple_code (def_stmt) == GIMPLE_PHI
2943                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2944                                   == vect_internal_def
2945                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2946             {
2947               if (dump_enabled_p ())
2948                 {
2949                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2950                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2951                 }
2952
2953               swap_ssa_operands (next_stmt,
2954                                  gimple_assign_rhs1_ptr (next_stmt),
2955                                  gimple_assign_rhs2_ptr (next_stmt));
2956               update_stmt (next_stmt);
2957
2958               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2959                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2960             }
2961           else
2962             return false;
2963         }
2964
2965       lhs = gimple_assign_lhs (next_stmt);
2966       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2967     }
2968
2969   /* Save the chain for further analysis in SLP detection.  */
2970   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2971   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2972   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2973
2974   return true;
2975 }
2976
2977 /* Return true if we need an in-order reduction for operation CODE
2978    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2979    overflow must wrap.  */
2980
2981 static bool
2982 needs_fold_left_reduction_p (tree type, tree_code code,
2983                              bool need_wrapping_integral_overflow)
2984 {
2985   /* CHECKME: check for !flag_finite_math_only too?  */
2986   if (SCALAR_FLOAT_TYPE_P (type))
2987     switch (code)
2988       {
2989       case MIN_EXPR:
2990       case MAX_EXPR:
2991         return false;
2992
2993       default:
2994         return !flag_associative_math;
2995       }
2996
2997   if (INTEGRAL_TYPE_P (type))
2998     {
2999       if (!operation_no_trapping_overflow (type, code))
3000         return true;
3001       if (need_wrapping_integral_overflow
3002           && !TYPE_OVERFLOW_WRAPS (type)
3003           && operation_can_overflow (code))
3004         return true;
3005       return false;
3006     }
3007
3008   if (SAT_FIXED_POINT_TYPE_P (type))
3009     return true;
3010
3011   return false;
3012 }
3013
3014 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3015    reduction operation CODE has a handled computation expression.  */
3016
3017 bool
3018 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
3019                       enum tree_code code)
3020 {
3021   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3022   auto_bitmap visited;
3023   tree lookfor = PHI_RESULT (phi);
3024   ssa_op_iter curri;
3025   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3026   while (USE_FROM_PTR (curr) != loop_arg)
3027     curr = op_iter_next_use (&curri);
3028   curri.i = curri.numops;
3029   do
3030     {
3031       path.safe_push (std::make_pair (curri, curr));
3032       tree use = USE_FROM_PTR (curr);
3033       if (use == lookfor)
3034         break;
3035       gimple *def = SSA_NAME_DEF_STMT (use);
3036       if (gimple_nop_p (def)
3037           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3038         {
3039 pop:
3040           do
3041             {
3042               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3043               curri = x.first;
3044               curr = x.second;
3045               do
3046                 curr = op_iter_next_use (&curri);
3047               /* Skip already visited or non-SSA operands (from iterating
3048                  over PHI args).  */
3049               while (curr != NULL_USE_OPERAND_P
3050                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3051                          || ! bitmap_set_bit (visited,
3052                                               SSA_NAME_VERSION
3053                                                 (USE_FROM_PTR (curr)))));
3054             }
3055           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3056           if (curr == NULL_USE_OPERAND_P)
3057             break;
3058         }
3059       else
3060         {
3061           if (gimple_code (def) == GIMPLE_PHI)
3062             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3063           else
3064             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3065           while (curr != NULL_USE_OPERAND_P
3066                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3067                      || ! bitmap_set_bit (visited,
3068                                           SSA_NAME_VERSION
3069                                             (USE_FROM_PTR (curr)))))
3070             curr = op_iter_next_use (&curri);
3071           if (curr == NULL_USE_OPERAND_P)
3072             goto pop;
3073         }
3074     }
3075   while (1);
3076   if (dump_file && (dump_flags & TDF_DETAILS))
3077     {
3078       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3079       unsigned i;
3080       std::pair<ssa_op_iter, use_operand_p> *x;
3081       FOR_EACH_VEC_ELT (path, i, x)
3082         {
3083           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3084           dump_printf (MSG_NOTE, " ");
3085         }
3086       dump_printf (MSG_NOTE, "\n");
3087     }
3088
3089   /* Check whether the reduction path detected is valid.  */
3090   bool fail = path.length () == 0;
3091   bool neg = false;
3092   for (unsigned i = 1; i < path.length (); ++i)
3093     {
3094       gimple *use_stmt = USE_STMT (path[i].second);
3095       tree op = USE_FROM_PTR (path[i].second);
3096       if (! has_single_use (op)
3097           || ! is_gimple_assign (use_stmt))
3098         {
3099           fail = true;
3100           break;
3101         }
3102       if (gimple_assign_rhs_code (use_stmt) != code)
3103         {
3104           if (code == PLUS_EXPR
3105               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3106             {
3107               /* Track whether we negate the reduction value each iteration.  */
3108               if (gimple_assign_rhs2 (use_stmt) == op)
3109                 neg = ! neg;
3110             }
3111           else
3112             {
3113               fail = true;
3114               break;
3115             }
3116         }
3117     }
3118   return ! fail && ! neg;
3119 }
3120
3121
3122 /* Function vect_is_simple_reduction
3123
3124    (1) Detect a cross-iteration def-use cycle that represents a simple
3125    reduction computation.  We look for the following pattern:
3126
3127    loop_header:
3128      a1 = phi < a0, a2 >
3129      a3 = ...
3130      a2 = operation (a3, a1)
3131
3132    or
3133
3134    a3 = ...
3135    loop_header:
3136      a1 = phi < a0, a2 >
3137      a2 = operation (a3, a1)
3138
3139    such that:
3140    1. operation is commutative and associative and it is safe to
3141       change the order of the computation
3142    2. no uses for a2 in the loop (a2 is used out of the loop)
3143    3. no uses of a1 in the loop besides the reduction operation
3144    4. no uses of a1 outside the loop.
3145
3146    Conditions 1,4 are tested here.
3147    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3148
3149    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3150    nested cycles.
3151
3152    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3153    reductions:
3154
3155      a1 = phi < a0, a2 >
3156      inner loop (def of a3)
3157      a2 = phi < a3 >
3158
3159    (4) Detect condition expressions, ie:
3160      for (int i = 0; i < N; i++)
3161        if (a[i] < val)
3162         ret_val = a[i];
3163
3164 */
3165
3166 static gimple *
3167 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3168                           bool *double_reduc,
3169                           bool need_wrapping_integral_overflow,
3170                           enum vect_reduction_type *v_reduc_type)
3171 {
3172   struct loop *loop = (gimple_bb (phi))->loop_father;
3173   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3174   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3175   enum tree_code orig_code, code;
3176   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3177   tree type;
3178   int nloop_uses;
3179   tree name;
3180   imm_use_iterator imm_iter;
3181   use_operand_p use_p;
3182   bool phi_def;
3183
3184   *double_reduc = false;
3185   *v_reduc_type = TREE_CODE_REDUCTION;
3186
3187   tree phi_name = PHI_RESULT (phi);
3188   /* ???  If there are no uses of the PHI result the inner loop reduction
3189      won't be detected as possibly double-reduction by vectorizable_reduction
3190      because that tries to walk the PHI arg from the preheader edge which
3191      can be constant.  See PR60382.  */
3192   if (has_zero_uses (phi_name))
3193     return NULL;
3194   nloop_uses = 0;
3195   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3196     {
3197       gimple *use_stmt = USE_STMT (use_p);
3198       if (is_gimple_debug (use_stmt))
3199         continue;
3200
3201       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3202         {
3203           if (dump_enabled_p ())
3204             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3205                              "intermediate value used outside loop.\n");
3206
3207           return NULL;
3208         }
3209
3210       nloop_uses++;
3211       if (nloop_uses > 1)
3212         {
3213           if (dump_enabled_p ())
3214             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3215                              "reduction value used in loop.\n");
3216           return NULL;
3217         }
3218
3219       phi_use_stmt = use_stmt;
3220     }
3221
3222   edge latch_e = loop_latch_edge (loop);
3223   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3224   if (TREE_CODE (loop_arg) != SSA_NAME)
3225     {
3226       if (dump_enabled_p ())
3227         {
3228           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3229                            "reduction: not ssa_name: ");
3230           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3231           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3232         }
3233       return NULL;
3234     }
3235
3236   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3237   if (is_gimple_assign (def_stmt))
3238     {
3239       name = gimple_assign_lhs (def_stmt);
3240       phi_def = false;
3241     }
3242   else if (gimple_code (def_stmt) == GIMPLE_PHI)
3243     {
3244       name = PHI_RESULT (def_stmt);
3245       phi_def = true;
3246     }
3247   else
3248     {
3249       if (dump_enabled_p ())
3250         {
3251           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3252                            "reduction: unhandled reduction operation: ");
3253           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3254         }
3255       return NULL;
3256     }
3257
3258   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3259     return NULL;
3260
3261   nloop_uses = 0;
3262   auto_vec<gphi *, 3> lcphis;
3263   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3264     {
3265       gimple *use_stmt = USE_STMT (use_p);
3266       if (is_gimple_debug (use_stmt))
3267         continue;
3268       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3269         nloop_uses++;
3270       else
3271         /* We can have more than one loop-closed PHI.  */
3272         lcphis.safe_push (as_a <gphi *> (use_stmt));
3273       if (nloop_uses > 1)
3274         {
3275           if (dump_enabled_p ())
3276             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3277                              "reduction used in loop.\n");
3278           return NULL;
3279         }
3280     }
3281
3282   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3283      defined in the inner loop.  */
3284   if (phi_def)
3285     {
3286       op1 = PHI_ARG_DEF (def_stmt, 0);
3287
3288       if (gimple_phi_num_args (def_stmt) != 1
3289           || TREE_CODE (op1) != SSA_NAME)
3290         {
3291           if (dump_enabled_p ())
3292             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3293                              "unsupported phi node definition.\n");
3294
3295           return NULL;
3296         }
3297
3298       def1 = SSA_NAME_DEF_STMT (op1);
3299       if (gimple_bb (def1)
3300           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3301           && loop->inner
3302           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3303           && is_gimple_assign (def1)
3304           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3305         {
3306           if (dump_enabled_p ())
3307             report_vect_op (MSG_NOTE, def_stmt,
3308                             "detected double reduction: ");
3309
3310           *double_reduc = true;
3311           return def_stmt;
3312         }
3313
3314       return NULL;
3315     }
3316
3317   /* If we are vectorizing an inner reduction we are executing that
3318      in the original order only in case we are not dealing with a
3319      double reduction.  */
3320   bool check_reduction = true;
3321   if (flow_loop_nested_p (vect_loop, loop))
3322     {
3323       gphi *lcphi;
3324       unsigned i;
3325       check_reduction = false;
3326       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3327         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3328           {
3329             gimple *use_stmt = USE_STMT (use_p);
3330             if (is_gimple_debug (use_stmt))
3331               continue;
3332             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3333               check_reduction = true;
3334           }
3335     }
3336
3337   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3338   code = orig_code = gimple_assign_rhs_code (def_stmt);
3339
3340   /* We can handle "res -= x[i]", which is non-associative by
3341      simply rewriting this into "res += -x[i]".  Avoid changing
3342      gimple instruction for the first simple tests and only do this
3343      if we're allowed to change code at all.  */
3344   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3345     code = PLUS_EXPR;
3346
3347   if (code == COND_EXPR)
3348     {
3349       if (! nested_in_vect_loop)
3350         *v_reduc_type = COND_REDUCTION;
3351
3352       op3 = gimple_assign_rhs1 (def_stmt);
3353       if (COMPARISON_CLASS_P (op3))
3354         {
3355           op4 = TREE_OPERAND (op3, 1);
3356           op3 = TREE_OPERAND (op3, 0);
3357         }
3358       if (op3 == phi_name || op4 == phi_name)
3359         {
3360           if (dump_enabled_p ())
3361             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3362                             "reduction: condition depends on previous"
3363                             " iteration: ");
3364           return NULL;
3365         }
3366
3367       op1 = gimple_assign_rhs2 (def_stmt);
3368       op2 = gimple_assign_rhs3 (def_stmt);
3369     }
3370   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3371     {
3372       if (dump_enabled_p ())
3373         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3374                         "reduction: not commutative/associative: ");
3375       return NULL;
3376     }
3377   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3378     {
3379       op1 = gimple_assign_rhs1 (def_stmt);
3380       op2 = gimple_assign_rhs2 (def_stmt);
3381     }
3382   else
3383     {
3384       if (dump_enabled_p ())
3385         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3386                         "reduction: not handled operation: ");
3387       return NULL;
3388     }
3389
3390   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3391     {
3392       if (dump_enabled_p ())
3393         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3394                         "reduction: both uses not ssa_names: ");
3395
3396       return NULL;
3397     }
3398
3399   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3400   if ((TREE_CODE (op1) == SSA_NAME
3401        && !types_compatible_p (type,TREE_TYPE (op1)))
3402       || (TREE_CODE (op2) == SSA_NAME
3403           && !types_compatible_p (type, TREE_TYPE (op2)))
3404       || (op3 && TREE_CODE (op3) == SSA_NAME
3405           && !types_compatible_p (type, TREE_TYPE (op3)))
3406       || (op4 && TREE_CODE (op4) == SSA_NAME
3407           && !types_compatible_p (type, TREE_TYPE (op4))))
3408     {
3409       if (dump_enabled_p ())
3410         {
3411           dump_printf_loc (MSG_NOTE, vect_location,
3412                            "reduction: multiple types: operation type: ");
3413           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3414           dump_printf (MSG_NOTE, ", operands types: ");
3415           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3416                              TREE_TYPE (op1));
3417           dump_printf (MSG_NOTE, ",");
3418           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3419                              TREE_TYPE (op2));
3420           if (op3)
3421             {
3422               dump_printf (MSG_NOTE, ",");
3423               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3424                                  TREE_TYPE (op3));
3425             }
3426
3427           if (op4)
3428             {
3429               dump_printf (MSG_NOTE, ",");
3430               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3431                                  TREE_TYPE (op4));
3432             }
3433           dump_printf (MSG_NOTE, "\n");
3434         }
3435
3436       return NULL;
3437     }
3438
3439   /* Check whether it's ok to change the order of the computation.
3440      Generally, when vectorizing a reduction we change the order of the
3441      computation.  This may change the behavior of the program in some
3442      cases, so we need to check that this is ok.  One exception is when
3443      vectorizing an outer-loop: the inner-loop is executed sequentially,
3444      and therefore vectorizing reductions in the inner-loop during
3445      outer-loop vectorization is safe.  */
3446   if (check_reduction
3447       && *v_reduc_type == TREE_CODE_REDUCTION
3448       && needs_fold_left_reduction_p (type, code,
3449                                       need_wrapping_integral_overflow))
3450     *v_reduc_type = FOLD_LEFT_REDUCTION;
3451
3452   /* Reduction is safe. We're dealing with one of the following:
3453      1) integer arithmetic and no trapv
3454      2) floating point arithmetic, and special flags permit this optimization
3455      3) nested cycle (i.e., outer loop vectorization).  */
3456   if (TREE_CODE (op1) == SSA_NAME)
3457     def1 = SSA_NAME_DEF_STMT (op1);
3458
3459   if (TREE_CODE (op2) == SSA_NAME)
3460     def2 = SSA_NAME_DEF_STMT (op2);
3461
3462   if (code != COND_EXPR
3463       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3464     {
3465       if (dump_enabled_p ())
3466         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3467       return NULL;
3468     }
3469
3470   /* Check that one def is the reduction def, defined by PHI,
3471      the other def is either defined in the loop ("vect_internal_def"),
3472      or it's an induction (defined by a loop-header phi-node).  */
3473
3474   if (def2 && def2 == phi
3475       && (code == COND_EXPR
3476           || !def1 || gimple_nop_p (def1)
3477           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3478           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3479               && (is_gimple_assign (def1)
3480                   || is_gimple_call (def1)
3481                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3482                       == vect_induction_def
3483                   || (gimple_code (def1) == GIMPLE_PHI
3484                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3485                           == vect_internal_def
3486                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3487     {
3488       if (dump_enabled_p ())
3489         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3490       return def_stmt;
3491     }
3492
3493   if (def1 && def1 == phi
3494       && (code == COND_EXPR
3495           || !def2 || gimple_nop_p (def2)
3496           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3497           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3498               && (is_gimple_assign (def2)
3499                   || is_gimple_call (def2)
3500                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3501                        == vect_induction_def
3502                   || (gimple_code (def2) == GIMPLE_PHI
3503                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3504                            == vect_internal_def
3505                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3506     {
3507       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3508         {
3509           /* Check if we can swap operands (just for simplicity - so that
3510              the rest of the code can assume that the reduction variable
3511              is always the last (second) argument).  */
3512           if (code == COND_EXPR)
3513             {
3514               /* Swap cond_expr by inverting the condition.  */
3515               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3516               enum tree_code invert_code = ERROR_MARK;
3517               enum tree_code cond_code = TREE_CODE (cond_expr);
3518
3519               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3520                 {
3521                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3522                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3523                 }
3524               if (invert_code != ERROR_MARK)
3525                 {
3526                   TREE_SET_CODE (cond_expr, invert_code);
3527                   swap_ssa_operands (def_stmt,
3528                                      gimple_assign_rhs2_ptr (def_stmt),
3529                                      gimple_assign_rhs3_ptr (def_stmt));
3530                 }
3531               else
3532                 {
3533                   if (dump_enabled_p ())
3534                     report_vect_op (MSG_NOTE, def_stmt,
3535                                     "detected reduction: cannot swap operands "
3536                                     "for cond_expr");
3537                   return NULL;
3538                 }
3539             }
3540           else
3541             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3542                                gimple_assign_rhs2_ptr (def_stmt));
3543
3544           if (dump_enabled_p ())
3545             report_vect_op (MSG_NOTE, def_stmt,
3546                             "detected reduction: need to swap operands: ");
3547
3548           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3549             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3550         }
3551       else
3552         {
3553           if (dump_enabled_p ())
3554             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3555         }
3556
3557       return def_stmt;
3558     }
3559
3560   /* Try to find SLP reduction chain.  */
3561   if (! nested_in_vect_loop
3562       && code != COND_EXPR
3563       && orig_code != MINUS_EXPR
3564       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3565     {
3566       if (dump_enabled_p ())
3567         report_vect_op (MSG_NOTE, def_stmt,
3568                         "reduction: detected reduction chain: ");
3569
3570       return def_stmt;
3571     }
3572
3573   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3574   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3575   while (first)
3576     {
3577       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3578       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3579       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3580       first = next;
3581     }
3582
3583   /* Look for the expression computing loop_arg from loop PHI result.  */
3584   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3585                             code))
3586     return def_stmt;
3587
3588   if (dump_enabled_p ())
3589     {
3590       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3591                       "reduction: unknown pattern: ");
3592     }
3593
3594   return NULL;
3595 }
3596
3597 /* Wrapper around vect_is_simple_reduction, which will modify code
3598    in-place if it enables detection of more reductions.  Arguments
3599    as there.  */
3600
3601 gimple *
3602 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3603                              bool *double_reduc,
3604                              bool need_wrapping_integral_overflow)
3605 {
3606   enum vect_reduction_type v_reduc_type;
3607   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3608                                           need_wrapping_integral_overflow,
3609                                           &v_reduc_type);
3610   if (def)
3611     {
3612       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3613       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3614       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3615       reduc_def_info = vinfo_for_stmt (def);
3616       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3617       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3618     }
3619   return def;
3620 }
3621
3622 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3623 int
3624 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3625                              int *peel_iters_epilogue,
3626                              stmt_vector_for_cost *scalar_cost_vec,
3627                              stmt_vector_for_cost *prologue_cost_vec,
3628                              stmt_vector_for_cost *epilogue_cost_vec)
3629 {
3630   int retval = 0;
3631   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3632
3633   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3634     {
3635       *peel_iters_epilogue = assumed_vf / 2;
3636       if (dump_enabled_p ())
3637         dump_printf_loc (MSG_NOTE, vect_location,
3638                          "cost model: epilogue peel iters set to vf/2 "
3639                          "because loop iterations are unknown .\n");
3640
3641       /* If peeled iterations are known but number of scalar loop
3642          iterations are unknown, count a taken branch per peeled loop.  */
3643       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3644                                  NULL, 0, vect_prologue);
3645       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3646                                  NULL, 0, vect_epilogue);
3647     }
3648   else
3649     {
3650       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3651       peel_iters_prologue = niters < peel_iters_prologue ?
3652                             niters : peel_iters_prologue;
3653       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3654       /* If we need to peel for gaps, but no peeling is required, we have to
3655          peel VF iterations.  */
3656       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3657         *peel_iters_epilogue = assumed_vf;
3658     }
3659
3660   stmt_info_for_cost *si;
3661   int j;
3662   if (peel_iters_prologue)
3663     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3664         {
3665           stmt_vec_info stmt_info
3666             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3667           retval += record_stmt_cost (prologue_cost_vec,
3668                                       si->count * peel_iters_prologue,
3669                                       si->kind, stmt_info, si->misalign,
3670                                       vect_prologue);
3671         }
3672   if (*peel_iters_epilogue)
3673     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3674         {
3675           stmt_vec_info stmt_info
3676             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3677           retval += record_stmt_cost (epilogue_cost_vec,
3678                                       si->count * *peel_iters_epilogue,
3679                                       si->kind, stmt_info, si->misalign,
3680                                       vect_epilogue);
3681         }
3682
3683   return retval;
3684 }
3685
3686 /* Function vect_estimate_min_profitable_iters
3687
3688    Return the number of iterations required for the vector version of the
3689    loop to be profitable relative to the cost of the scalar version of the
3690    loop.
3691
3692    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3693    of iterations for vectorization.  -1 value means loop vectorization
3694    is not profitable.  This returned value may be used for dynamic
3695    profitability check.
3696
3697    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3698    for static check against estimated number of iterations.  */
3699
3700 static void
3701 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3702                                     int *ret_min_profitable_niters,
3703                                     int *ret_min_profitable_estimate)
3704 {
3705   int min_profitable_iters;
3706   int min_profitable_estimate;
3707   int peel_iters_prologue;
3708   int peel_iters_epilogue;
3709   unsigned vec_inside_cost = 0;
3710   int vec_outside_cost = 0;
3711   unsigned vec_prologue_cost = 0;
3712   unsigned vec_epilogue_cost = 0;
3713   int scalar_single_iter_cost = 0;
3714   int scalar_outside_cost = 0;
3715   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3716   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3717   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3718
3719   /* Cost model disabled.  */
3720   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3721     {
3722       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3723       *ret_min_profitable_niters = 0;
3724       *ret_min_profitable_estimate = 0;
3725       return;
3726     }
3727
3728   /* Requires loop versioning tests to handle misalignment.  */
3729   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3730     {
3731       /*  FIXME: Make cost depend on complexity of individual check.  */
3732       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3733       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3734                             vect_prologue);
3735       dump_printf (MSG_NOTE,
3736                    "cost model: Adding cost of checks for loop "
3737                    "versioning to treat misalignment.\n");
3738     }
3739
3740   /* Requires loop versioning with alias checks.  */
3741   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3742     {
3743       /*  FIXME: Make cost depend on complexity of individual check.  */
3744       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3745       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3746                             vect_prologue);
3747       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3748       if (len)
3749         /* Count LEN - 1 ANDs and LEN comparisons.  */
3750         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3751                               NULL, 0, vect_prologue);
3752       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3753       if (len)
3754         {
3755           /* Count LEN - 1 ANDs and LEN comparisons.  */
3756           unsigned int nstmts = len * 2 - 1;
3757           /* +1 for each bias that needs adding.  */
3758           for (unsigned int i = 0; i < len; ++i)
3759             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3760               nstmts += 1;
3761           (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3762                                 NULL, 0, vect_prologue);
3763         }
3764       dump_printf (MSG_NOTE,
3765                    "cost model: Adding cost of checks for loop "
3766                    "versioning aliasing.\n");
3767     }
3768
3769   /* Requires loop versioning with niter checks.  */
3770   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3771     {
3772       /*  FIXME: Make cost depend on complexity of individual check.  */
3773       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3774                             vect_prologue);
3775       dump_printf (MSG_NOTE,
3776                    "cost model: Adding cost of checks for loop "
3777                    "versioning niters.\n");
3778     }
3779
3780   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3781     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3782                           vect_prologue);
3783
3784   /* Count statements in scalar loop.  Using this as scalar cost for a single
3785      iteration for now.
3786
3787      TODO: Add outer loop support.
3788
3789      TODO: Consider assigning different costs to different scalar
3790      statements.  */
3791
3792   scalar_single_iter_cost
3793     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3794
3795   /* Add additional cost for the peeled instructions in prologue and epilogue
3796      loop.  (For fully-masked loops there will be no peeling.)
3797
3798      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3799      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3800
3801      TODO: Build an expression that represents peel_iters for prologue and
3802      epilogue to be used in a run-time test.  */
3803
3804   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3805     {
3806       peel_iters_prologue = 0;
3807       peel_iters_epilogue = 0;
3808
3809       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3810         {
3811           /* We need to peel exactly one iteration.  */
3812           peel_iters_epilogue += 1;
3813           stmt_info_for_cost *si;
3814           int j;
3815           FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3816                             j, si)
3817             {
3818               struct _stmt_vec_info *stmt_info
3819                 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3820               (void) add_stmt_cost (target_cost_data, si->count,
3821                                     si->kind, stmt_info, si->misalign,
3822                                     vect_epilogue);
3823             }
3824         }
3825     }
3826   else if (npeel < 0)
3827     {
3828       peel_iters_prologue = assumed_vf / 2;
3829       dump_printf (MSG_NOTE, "cost model: "
3830                    "prologue peel iters set to vf/2.\n");
3831
3832       /* If peeling for alignment is unknown, loop bound of main loop becomes
3833          unknown.  */
3834       peel_iters_epilogue = assumed_vf / 2;
3835       dump_printf (MSG_NOTE, "cost model: "
3836                    "epilogue peel iters set to vf/2 because "
3837                    "peeling for alignment is unknown.\n");
3838
3839       /* If peeled iterations are unknown, count a taken branch and a not taken
3840          branch per peeled loop. Even if scalar loop iterations are known,
3841          vector iterations are not known since peeled prologue iterations are
3842          not known. Hence guards remain the same.  */
3843       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3844                             NULL, 0, vect_prologue);
3845       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3846                             NULL, 0, vect_prologue);
3847       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3848                             NULL, 0, vect_epilogue);
3849       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3850                             NULL, 0, vect_epilogue);
3851       stmt_info_for_cost *si;
3852       int j;
3853       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3854         {
3855           struct _stmt_vec_info *stmt_info
3856             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3857           (void) add_stmt_cost (target_cost_data,
3858                                 si->count * peel_iters_prologue,
3859                                 si->kind, stmt_info, si->misalign,
3860                                 vect_prologue);
3861           (void) add_stmt_cost (target_cost_data,
3862                                 si->count * peel_iters_epilogue,
3863                                 si->kind, stmt_info, si->misalign,
3864                                 vect_epilogue);
3865         }
3866     }
3867   else
3868     {
3869       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3870       stmt_info_for_cost *si;
3871       int j;
3872       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3873
3874       prologue_cost_vec.create (2);
3875       epilogue_cost_vec.create (2);
3876       peel_iters_prologue = npeel;
3877
3878       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3879                                           &peel_iters_epilogue,
3880                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3881                                             (loop_vinfo),
3882                                           &prologue_cost_vec,
3883                                           &epilogue_cost_vec);
3884
3885       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3886         {
3887           struct _stmt_vec_info *stmt_info
3888             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3889           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3890                                 si->misalign, vect_prologue);
3891         }
3892
3893       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3894         {
3895           struct _stmt_vec_info *stmt_info
3896             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3897           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3898                                 si->misalign, vect_epilogue);
3899         }
3900
3901       prologue_cost_vec.release ();
3902       epilogue_cost_vec.release ();
3903     }
3904
3905   /* FORNOW: The scalar outside cost is incremented in one of the
3906      following ways:
3907
3908      1. The vectorizer checks for alignment and aliasing and generates
3909      a condition that allows dynamic vectorization.  A cost model
3910      check is ANDED with the versioning condition.  Hence scalar code
3911      path now has the added cost of the versioning check.
3912
3913        if (cost > th & versioning_check)
3914          jmp to vector code
3915
3916      Hence run-time scalar is incremented by not-taken branch cost.
3917
3918      2. The vectorizer then checks if a prologue is required.  If the
3919      cost model check was not done before during versioning, it has to
3920      be done before the prologue check.
3921
3922        if (cost <= th)
3923          prologue = scalar_iters
3924        if (prologue == 0)
3925          jmp to vector code
3926        else
3927          execute prologue
3928        if (prologue == num_iters)
3929          go to exit
3930
3931      Hence the run-time scalar cost is incremented by a taken branch,
3932      plus a not-taken branch, plus a taken branch cost.
3933
3934      3. The vectorizer then checks if an epilogue is required.  If the
3935      cost model check was not done before during prologue check, it
3936      has to be done with the epilogue check.
3937
3938        if (prologue == 0)
3939          jmp to vector code
3940        else
3941          execute prologue
3942        if (prologue == num_iters)
3943          go to exit
3944        vector code:
3945          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3946            jmp to epilogue
3947
3948      Hence the run-time scalar cost should be incremented by 2 taken
3949      branches.
3950
3951      TODO: The back end may reorder the BBS's differently and reverse
3952      conditions/branch directions.  Change the estimates below to
3953      something more reasonable.  */
3954
3955   /* If the number of iterations is known and we do not do versioning, we can
3956      decide whether to vectorize at compile time.  Hence the scalar version
3957      do not carry cost model guard costs.  */
3958   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3959       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3960     {
3961       /* Cost model check occurs at versioning.  */
3962       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3963         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3964       else
3965         {
3966           /* Cost model check occurs at prologue generation.  */
3967           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3968             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3969               + vect_get_stmt_cost (cond_branch_not_taken);
3970           /* Cost model check occurs at epilogue generation.  */
3971           else
3972             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3973         }
3974     }
3975
3976   /* Complete the target-specific cost calculations.  */
3977   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3978                &vec_inside_cost, &vec_epilogue_cost);
3979
3980   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3981
3982   if (dump_enabled_p ())
3983     {
3984       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3985       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3986                    vec_inside_cost);
3987       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3988                    vec_prologue_cost);
3989       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3990                    vec_epilogue_cost);
3991       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3992                    scalar_single_iter_cost);
3993       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3994                    scalar_outside_cost);
3995       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3996                    vec_outside_cost);
3997       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3998                    peel_iters_prologue);
3999       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
4000                    peel_iters_epilogue);
4001     }
4002
4003   /* Calculate number of iterations required to make the vector version
4004      profitable, relative to the loop bodies only.  The following condition
4005      must hold true:
4006      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
4007      where
4008      SIC = scalar iteration cost, VIC = vector iteration cost,
4009      VOC = vector outside cost, VF = vectorization factor,
4010      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
4011      SOC = scalar outside cost for run time cost model check.  */
4012
4013   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
4014     {
4015       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4016                               * assumed_vf
4017                               - vec_inside_cost * peel_iters_prologue
4018                               - vec_inside_cost * peel_iters_epilogue);
4019       if (min_profitable_iters <= 0)
4020         min_profitable_iters = 0;
4021       else
4022         {
4023           min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
4024                                    - vec_inside_cost);
4025
4026           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4027               <= (((int) vec_inside_cost * min_profitable_iters)
4028                   + (((int) vec_outside_cost - scalar_outside_cost)
4029                      * assumed_vf)))
4030             min_profitable_iters++;
4031         }
4032     }
4033   /* vector version will never be profitable.  */
4034   else
4035     {
4036       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4037         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
4038                     "did not happen for a simd loop");
4039
4040       if (dump_enabled_p ())
4041         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4042                          "cost model: the vector iteration cost = %d "
4043                          "divided by the scalar iteration cost = %d "
4044                          "is greater or equal to the vectorization factor = %d"
4045                          ".\n",
4046                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4047       *ret_min_profitable_niters = -1;
4048       *ret_min_profitable_estimate = -1;
4049       return;
4050     }
4051
4052   dump_printf (MSG_NOTE,
4053                "  Calculated minimum iters for profitability: %d\n",
4054                min_profitable_iters);
4055
4056   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4057       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4058     /* We want the vectorized loop to execute at least once.  */
4059     min_profitable_iters = assumed_vf + peel_iters_prologue;
4060
4061   if (dump_enabled_p ())
4062     dump_printf_loc (MSG_NOTE, vect_location,
4063                      "  Runtime profitability threshold = %d\n",
4064                      min_profitable_iters);
4065
4066   *ret_min_profitable_niters = min_profitable_iters;
4067
4068   /* Calculate number of iterations required to make the vector version
4069      profitable, relative to the loop bodies only.
4070
4071      Non-vectorized variant is SIC * niters and it must win over vector
4072      variant on the expected loop trip count.  The following condition must hold true:
4073      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
4074
4075   if (vec_outside_cost <= 0)
4076     min_profitable_estimate = 0;
4077   else
4078     {
4079       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4080                                  * assumed_vf
4081                                  - vec_inside_cost * peel_iters_prologue
4082                                  - vec_inside_cost * peel_iters_epilogue)
4083                                  / ((scalar_single_iter_cost * assumed_vf)
4084                                    - vec_inside_cost);
4085     }
4086   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4087   if (dump_enabled_p ())
4088     dump_printf_loc (MSG_NOTE, vect_location,
4089                      "  Static estimate profitability threshold = %d\n",
4090                      min_profitable_estimate);
4091
4092   *ret_min_profitable_estimate = min_profitable_estimate;
4093 }
4094
4095 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4096    vector elements (not bits) for a vector with NELT elements.  */
4097 static void
4098 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4099                               vec_perm_builder *sel)
4100 {
4101   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4102      by vec_perm_indices.  */
4103   sel->new_vector (nelt, 1, 3);
4104   for (unsigned int i = 0; i < 3; i++)
4105     sel->quick_push (i + offset);
4106 }
4107
4108 /* Checks whether the target supports whole-vector shifts for vectors of mode
4109    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4110    it supports vec_perm_const with masks for all necessary shift amounts.  */
4111 static bool
4112 have_whole_vector_shift (machine_mode mode)
4113 {
4114   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4115     return true;
4116
4117   /* Variable-length vectors should be handled via the optab.  */
4118   unsigned int nelt;
4119   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4120     return false;
4121
4122   vec_perm_builder sel;
4123   vec_perm_indices indices;
4124   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4125     {
4126       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4127       indices.new_vector (sel, 2, nelt);
4128       if (!can_vec_perm_const_p (mode, indices, false))
4129         return false;
4130     }
4131   return true;
4132 }
4133
4134 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4135    functions. Design better to avoid maintenance issues.  */
4136
4137 /* Function vect_model_reduction_cost.
4138
4139    Models cost for a reduction operation, including the vector ops
4140    generated within the strip-mine loop, the initial definition before
4141    the loop, and the epilogue code that must be generated.  */
4142
4143 static void
4144 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4145                            int ncopies)
4146 {
4147   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4148   enum tree_code code;
4149   optab optab;
4150   tree vectype;
4151   gimple *orig_stmt;
4152   machine_mode mode;
4153   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4154   struct loop *loop = NULL;
4155   void *target_cost_data;
4156
4157   if (loop_vinfo)
4158     {
4159       loop = LOOP_VINFO_LOOP (loop_vinfo);
4160       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4161     }
4162   else
4163     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4164
4165   /* Condition reductions generate two reductions in the loop.  */
4166   vect_reduction_type reduction_type
4167     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4168   if (reduction_type == COND_REDUCTION)
4169     ncopies *= 2;
4170
4171   vectype = STMT_VINFO_VECTYPE (stmt_info);
4172   mode = TYPE_MODE (vectype);
4173   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4174
4175   if (!orig_stmt)
4176     orig_stmt = STMT_VINFO_STMT (stmt_info);
4177
4178   code = gimple_assign_rhs_code (orig_stmt);
4179
4180   if (reduction_type == EXTRACT_LAST_REDUCTION
4181       || reduction_type == FOLD_LEFT_REDUCTION)
4182     {
4183       /* No extra instructions needed in the prologue.  */
4184       prologue_cost = 0;
4185
4186       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4187         /* Count one reduction-like operation per vector.  */
4188         inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4189                                      stmt_info, 0, vect_body);
4190       else
4191         {
4192           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4193           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4194           inside_cost = add_stmt_cost (target_cost_data,  nelements,
4195                                        vec_to_scalar, stmt_info, 0,
4196                                        vect_body);
4197           inside_cost += add_stmt_cost (target_cost_data,  nelements,
4198                                         scalar_stmt, stmt_info, 0,
4199                                         vect_body);
4200         }
4201     }
4202   else
4203     {
4204       /* Add in cost for initial definition.
4205          For cond reduction we have four vectors: initial index, step,
4206          initial result of the data reduction, initial value of the index
4207          reduction.  */
4208       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4209       prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4210                                       scalar_to_vec, stmt_info, 0,
4211                                       vect_prologue);
4212
4213       /* Cost of reduction op inside loop.  */
4214       inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4215                                    stmt_info, 0, vect_body);
4216     }
4217
4218   /* Determine cost of epilogue code.
4219
4220      We have a reduction operator that will reduce the vector in one statement.
4221      Also requires scalar extract.  */
4222
4223   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4224     {
4225       if (reduc_fn != IFN_LAST)
4226         {
4227           if (reduction_type == COND_REDUCTION)
4228             {
4229               /* An EQ stmt and an COND_EXPR stmt.  */
4230               epilogue_cost += add_stmt_cost (target_cost_data, 2,
4231                                               vector_stmt, stmt_info, 0,
4232                                               vect_epilogue);
4233               /* Reduction of the max index and a reduction of the found
4234                  values.  */
4235               epilogue_cost += add_stmt_cost (target_cost_data, 2,
4236                                               vec_to_scalar, stmt_info, 0,
4237                                               vect_epilogue);
4238               /* A broadcast of the max value.  */
4239               epilogue_cost += add_stmt_cost (target_cost_data, 1,
4240                                               scalar_to_vec, stmt_info, 0,
4241                                               vect_epilogue);
4242             }
4243           else
4244             {
4245               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4246                                               stmt_info, 0, vect_epilogue);
4247               epilogue_cost += add_stmt_cost (target_cost_data, 1,
4248                                               vec_to_scalar, stmt_info, 0,
4249                                               vect_epilogue);
4250             }
4251         }
4252       else if (reduction_type == COND_REDUCTION)
4253         {
4254           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4255           /* Extraction of scalar elements.  */
4256           epilogue_cost += add_stmt_cost (target_cost_data,
4257                                           2 * estimated_nunits,
4258                                           vec_to_scalar, stmt_info, 0,
4259                                           vect_epilogue);
4260           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4261           epilogue_cost += add_stmt_cost (target_cost_data,
4262                                           2 * estimated_nunits - 3,
4263                                           scalar_stmt, stmt_info, 0,
4264                                           vect_epilogue);
4265         }
4266       else if (reduction_type == EXTRACT_LAST_REDUCTION
4267                || reduction_type == FOLD_LEFT_REDUCTION)
4268         /* No extra instructions need in the epilogue.  */
4269         ;
4270       else
4271         {
4272           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4273           tree bitsize =
4274             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4275           int element_bitsize = tree_to_uhwi (bitsize);
4276           int nelements = vec_size_in_bits / element_bitsize;
4277
4278           if (code == COND_EXPR)
4279             code = MAX_EXPR;
4280
4281           optab = optab_for_tree_code (code, vectype, optab_default);
4282
4283           /* We have a whole vector shift available.  */
4284           if (optab != unknown_optab
4285               && VECTOR_MODE_P (mode)
4286               && optab_handler (optab, mode) != CODE_FOR_nothing
4287               && have_whole_vector_shift (mode))
4288             {
4289               /* Final reduction via vector shifts and the reduction operator.
4290                  Also requires scalar extract.  */
4291               epilogue_cost += add_stmt_cost (target_cost_data,
4292                                               exact_log2 (nelements) * 2,
4293                                               vector_stmt, stmt_info, 0,
4294                                               vect_epilogue);
4295               epilogue_cost += add_stmt_cost (target_cost_data, 1,
4296                                               vec_to_scalar, stmt_info, 0,
4297                                               vect_epilogue);
4298             }
4299           else
4300             /* Use extracts and reduction op for final reduction.  For N
4301                elements, we have N extracts and N-1 reduction ops.  */
4302             epilogue_cost += add_stmt_cost (target_cost_data,
4303                                             nelements + nelements - 1,
4304                                             vector_stmt, stmt_info, 0,
4305                                             vect_epilogue);
4306         }
4307     }
4308
4309   if (dump_enabled_p ())
4310     dump_printf (MSG_NOTE,
4311                  "vect_model_reduction_cost: inside_cost = %d, "
4312                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4313                  prologue_cost, epilogue_cost);
4314 }
4315
4316
4317 /* Function vect_model_induction_cost.
4318
4319    Models cost for induction operations.  */
4320
4321 static void
4322 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4323 {
4324   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4325   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4326   unsigned inside_cost, prologue_cost;
4327
4328   if (PURE_SLP_STMT (stmt_info))
4329     return;
4330
4331   /* loop cost for vec_loop.  */
4332   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4333                                stmt_info, 0, vect_body);
4334
4335   /* prologue cost for vec_init and vec_step.  */
4336   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4337                                  stmt_info, 0, vect_prologue);
4338
4339   if (dump_enabled_p ())
4340     dump_printf_loc (MSG_NOTE, vect_location,
4341                      "vect_model_induction_cost: inside_cost = %d, "
4342                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4343 }
4344
4345
4346
4347 /* Function get_initial_def_for_reduction
4348
4349    Input:
4350    STMT - a stmt that performs a reduction operation in the loop.
4351    INIT_VAL - the initial value of the reduction variable
4352
4353    Output:
4354    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4355         of the reduction (used for adjusting the epilog - see below).
4356    Return a vector variable, initialized according to the operation that STMT
4357         performs. This vector will be used as the initial value of the
4358         vector of partial results.
4359
4360    Option1 (adjust in epilog): Initialize the vector as follows:
4361      add/bit or/xor:    [0,0,...,0,0]
4362      mult/bit and:      [1,1,...,1,1]
4363      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4364    and when necessary (e.g. add/mult case) let the caller know
4365    that it needs to adjust the result by init_val.
4366
4367    Option2: Initialize the vector as follows:
4368      add/bit or/xor:    [init_val,0,0,...,0]
4369      mult/bit and:      [init_val,1,1,...,1]
4370      min/max/cond_expr: [init_val,init_val,...,init_val]
4371    and no adjustments are needed.
4372
4373    For example, for the following code:
4374
4375    s = init_val;
4376    for (i=0;i<n;i++)
4377      s = s + a[i];
4378
4379    STMT is 's = s + a[i]', and the reduction variable is 's'.
4380    For a vector of 4 units, we want to return either [0,0,0,init_val],
4381    or [0,0,0,0] and let the caller know that it needs to adjust
4382    the result at the end by 'init_val'.
4383
4384    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4385    initialization vector is simpler (same element in all entries), if
4386    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4387
4388    A cost model should help decide between these two schemes.  */
4389
4390 tree
4391 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4392                                tree *adjustment_def)
4393 {
4394   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4395   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4396   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4397   tree scalar_type = TREE_TYPE (init_val);
4398   tree vectype = get_vectype_for_scalar_type (scalar_type);
4399   enum tree_code code = gimple_assign_rhs_code (stmt);
4400   tree def_for_init;
4401   tree init_def;
4402   bool nested_in_vect_loop = false;
4403   REAL_VALUE_TYPE real_init_val = dconst0;
4404   int int_init_val = 0;
4405   gimple *def_stmt = NULL;
4406   gimple_seq stmts = NULL;
4407
4408   gcc_assert (vectype);
4409
4410   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4411               || SCALAR_FLOAT_TYPE_P (scalar_type));
4412
4413   if (nested_in_vect_loop_p (loop, stmt))
4414     nested_in_vect_loop = true;
4415   else
4416     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4417
4418   /* In case of double reduction we only create a vector variable to be put
4419      in the reduction phi node.  The actual statement creation is done in
4420      vect_create_epilog_for_reduction.  */
4421   if (adjustment_def && nested_in_vect_loop
4422       && TREE_CODE (init_val) == SSA_NAME
4423       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4424       && gimple_code (def_stmt) == GIMPLE_PHI
4425       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4426       && vinfo_for_stmt (def_stmt)
4427       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4428           == vect_double_reduction_def)
4429     {
4430       *adjustment_def = NULL;
4431       return vect_create_destination_var (init_val, vectype);
4432     }
4433
4434   vect_reduction_type reduction_type
4435     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4436
4437   /* In case of a nested reduction do not use an adjustment def as
4438      that case is not supported by the epilogue generation correctly
4439      if ncopies is not one.  */
4440   if (adjustment_def && nested_in_vect_loop)
4441     {
4442       *adjustment_def = NULL;
4443       return vect_get_vec_def_for_operand (init_val, stmt);
4444     }
4445
4446   switch (code)
4447     {
4448     case WIDEN_SUM_EXPR:
4449     case DOT_PROD_EXPR:
4450     case SAD_EXPR:
4451     case PLUS_EXPR:
4452     case MINUS_EXPR:
4453     case BIT_IOR_EXPR:
4454     case BIT_XOR_EXPR:
4455     case MULT_EXPR:
4456     case BIT_AND_EXPR:
4457       {
4458         /* ADJUSTMENT_DEF is NULL when called from
4459            vect_create_epilog_for_reduction to vectorize double reduction.  */
4460         if (adjustment_def)
4461           *adjustment_def = init_val;
4462
4463         if (code == MULT_EXPR)
4464           {
4465             real_init_val = dconst1;
4466             int_init_val = 1;
4467           }
4468
4469         if (code == BIT_AND_EXPR)
4470           int_init_val = -1;
4471
4472         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4473           def_for_init = build_real (scalar_type, real_init_val);
4474         else
4475           def_for_init = build_int_cst (scalar_type, int_init_val);
4476
4477         if (adjustment_def)
4478           /* Option1: the first element is '0' or '1' as well.  */
4479           init_def = gimple_build_vector_from_val (&stmts, vectype,
4480                                                    def_for_init);
4481         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4482           {
4483             /* Option2 (variable length): the first element is INIT_VAL.  */
4484             init_def = build_vector_from_val (vectype, def_for_init);
4485             gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4486                                                       2, init_def, init_val);
4487             init_def = make_ssa_name (vectype);
4488             gimple_call_set_lhs (call, init_def);
4489             gimple_seq_add_stmt (&stmts, call);
4490           }
4491         else
4492           {
4493             /* Option2: the first element is INIT_VAL.  */
4494             tree_vector_builder elts (vectype, 1, 2);
4495             elts.quick_push (init_val);
4496             elts.quick_push (def_for_init);
4497             init_def = gimple_build_vector (&stmts, &elts);
4498           }
4499       }
4500       break;
4501
4502     case MIN_EXPR:
4503     case MAX_EXPR:
4504     case COND_EXPR:
4505       {
4506         if (adjustment_def)
4507           {
4508             *adjustment_def = NULL_TREE;
4509             if (reduction_type != COND_REDUCTION
4510                 && reduction_type != EXTRACT_LAST_REDUCTION)
4511               {
4512                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4513                 break;
4514               }
4515           }
4516         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4517         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4518       }
4519       break;
4520
4521     default:
4522       gcc_unreachable ();
4523     }
4524
4525   if (stmts)
4526     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4527   return init_def;
4528 }
4529
4530 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4531    NUMBER_OF_VECTORS is the number of vector defs to create.
4532    If NEUTRAL_OP is nonnull, introducing extra elements of that
4533    value will not change the result.  */
4534
4535 static void
4536 get_initial_defs_for_reduction (slp_tree slp_node,
4537                                 vec<tree> *vec_oprnds,
4538                                 unsigned int number_of_vectors,
4539                                 bool reduc_chain, tree neutral_op)
4540 {
4541   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4542   gimple *stmt = stmts[0];
4543   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4544   unsigned HOST_WIDE_INT nunits;
4545   unsigned j, number_of_places_left_in_vector;
4546   tree vector_type;
4547   tree vop;
4548   int group_size = stmts.length ();
4549   unsigned int vec_num, i;
4550   unsigned number_of_copies = 1;
4551   vec<tree> voprnds;
4552   voprnds.create (number_of_vectors);
4553   struct loop *loop;
4554   auto_vec<tree, 16> permute_results;
4555
4556   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4557
4558   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4559
4560   loop = (gimple_bb (stmt))->loop_father;
4561   gcc_assert (loop);
4562   edge pe = loop_preheader_edge (loop);
4563
4564   gcc_assert (!reduc_chain || neutral_op);
4565
4566   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4567      created vectors. It is greater than 1 if unrolling is performed.
4568
4569      For example, we have two scalar operands, s1 and s2 (e.g., group of
4570      strided accesses of size two), while NUNITS is four (i.e., four scalars
4571      of this type can be packed in a vector).  The output vector will contain
4572      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4573      will be 2).
4574
4575      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4576      containing the operands.
4577
4578      For example, NUNITS is four as before, and the group size is 8
4579      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4580      {s5, s6, s7, s8}.  */
4581
4582   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4583     nunits = group_size;
4584
4585   number_of_copies = nunits * number_of_vectors / group_size;
4586
4587   number_of_places_left_in_vector = nunits;
4588   bool constant_p = true;
4589   tree_vector_builder elts (vector_type, nunits, 1);
4590   elts.quick_grow (nunits);
4591   for (j = 0; j < number_of_copies; j++)
4592     {
4593       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4594         {
4595           tree op;
4596           /* Get the def before the loop.  In reduction chain we have only
4597              one initial value.  */
4598           if ((j != (number_of_copies - 1)
4599                || (reduc_chain && i != 0))
4600               && neutral_op)
4601             op = neutral_op;
4602           else
4603             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4604
4605           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4606           number_of_places_left_in_vector--;
4607           elts[number_of_places_left_in_vector] = op;
4608           if (!CONSTANT_CLASS_P (op))
4609             constant_p = false;
4610
4611           if (number_of_places_left_in_vector == 0)
4612             {
4613               gimple_seq ctor_seq = NULL;
4614               tree init;
4615               if (constant_p && !neutral_op
4616                   ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4617                   : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4618                 /* Build the vector directly from ELTS.  */
4619                 init = gimple_build_vector (&ctor_seq, &elts);
4620               else if (neutral_op)
4621                 {
4622                   /* Build a vector of the neutral value and shift the
4623                      other elements into place.  */
4624                   init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4625                                                        neutral_op);
4626                   int k = nunits;
4627                   while (k > 0 && elts[k - 1] == neutral_op)
4628                     k -= 1;
4629                   while (k > 0)
4630                     {
4631                       k -= 1;
4632                       gcall *call = gimple_build_call_internal
4633                         (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4634                       init = make_ssa_name (vector_type);
4635                       gimple_call_set_lhs (call, init);
4636                       gimple_seq_add_stmt (&ctor_seq, call);
4637                     }
4638                 }
4639               else
4640                 {
4641                   /* First time round, duplicate ELTS to fill the
4642                      required number of vectors, then cherry pick the
4643                      appropriate result for each iteration.  */
4644                   if (vec_oprnds->is_empty ())
4645                     duplicate_and_interleave (&ctor_seq, vector_type, elts,
4646                                               number_of_vectors,
4647                                               permute_results);
4648                   init = permute_results[number_of_vectors - j - 1];
4649                 }
4650               if (ctor_seq != NULL)
4651                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4652               voprnds.quick_push (init);
4653
4654               number_of_places_left_in_vector = nunits;
4655               elts.new_vector (vector_type, nunits, 1);
4656               elts.quick_grow (nunits);
4657               constant_p = true;
4658             }
4659         }
4660     }
4661
4662   /* Since the vectors are created in the reverse order, we should invert
4663      them.  */
4664   vec_num = voprnds.length ();
4665   for (j = vec_num; j != 0; j--)
4666     {
4667       vop = voprnds[j - 1];
4668       vec_oprnds->quick_push (vop);
4669     }
4670
4671   voprnds.release ();
4672
4673   /* In case that VF is greater than the unrolling factor needed for the SLP
4674      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4675      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4676      to replicate the vectors.  */
4677   tree neutral_vec = NULL;
4678   while (number_of_vectors > vec_oprnds->length ())
4679     {
4680       if (neutral_op)
4681         {
4682           if (!neutral_vec)
4683             {
4684               gimple_seq ctor_seq = NULL;
4685               neutral_vec = gimple_build_vector_from_val
4686                 (&ctor_seq, vector_type, neutral_op);
4687               if (ctor_seq != NULL)
4688                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4689             }
4690           vec_oprnds->quick_push (neutral_vec);
4691         }
4692       else
4693         {
4694           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4695             vec_oprnds->quick_push (vop);
4696         }
4697     }
4698 }
4699
4700
4701 /* Function vect_create_epilog_for_reduction
4702
4703    Create code at the loop-epilog to finalize the result of a reduction
4704    computation.
4705
4706    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4707      reduction statements.
4708    STMT is the scalar reduction stmt that is being vectorized.
4709    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4710      number of elements that we can fit in a vectype (nunits).  In this case
4711      we have to generate more than one vector stmt - i.e - we need to "unroll"
4712      the vector stmt by a factor VF/nunits.  For more details see documentation
4713      in vectorizable_operation.
4714    REDUC_FN is the internal function for the epilog reduction.
4715    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4716      computation.
4717    REDUC_INDEX is the index of the operand in the right hand side of the
4718      statement that is defined by REDUCTION_PHI.
4719    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4720    SLP_NODE is an SLP node containing a group of reduction statements. The
4721      first one in this group is STMT.
4722    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4723      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4724      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4725      any value of the IV in the loop.
4726    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4727    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4728      null if this is not an SLP reduction
4729
4730    This function:
4731    1. Creates the reduction def-use cycles: sets the arguments for
4732       REDUCTION_PHIS:
4733       The loop-entry argument is the vectorized initial-value of the reduction.
4734       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4735       sums.
4736    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4737       by calling the function specified by REDUC_FN if available, or by
4738       other means (whole-vector shifts or a scalar loop).
4739       The function also creates a new phi node at the loop exit to preserve
4740       loop-closed form, as illustrated below.
4741
4742      The flow at the entry to this function:
4743
4744         loop:
4745           vec_def = phi <null, null>            # REDUCTION_PHI
4746           VECT_DEF = vector_stmt                # vectorized form of STMT
4747           s_loop = scalar_stmt                  # (scalar) STMT
4748         loop_exit:
4749           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4750           use <s_out0>
4751           use <s_out0>
4752
4753      The above is transformed by this function into:
4754
4755         loop:
4756           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4757           VECT_DEF = vector_stmt                # vectorized form of STMT
4758           s_loop = scalar_stmt                  # (scalar) STMT
4759         loop_exit:
4760           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4761           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4762           v_out2 = reduce <v_out1>
4763           s_out3 = extract_field <v_out2, 0>
4764           s_out4 = adjust_result <s_out3>
4765           use <s_out4>
4766           use <s_out4>
4767 */
4768
4769 static void
4770 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4771                                   gimple *reduc_def_stmt,
4772                                   int ncopies, internal_fn reduc_fn,
4773                                   vec<gimple *> reduction_phis,
4774                                   bool double_reduc,
4775                                   slp_tree slp_node,
4776                                   slp_instance slp_node_instance,
4777                                   tree induc_val, enum tree_code induc_code,
4778                                   tree neutral_op)
4779 {
4780   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4781   stmt_vec_info prev_phi_info;
4782   tree vectype;
4783   machine_mode mode;
4784   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4785   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4786   basic_block exit_bb;
4787   tree scalar_dest;
4788   tree scalar_type;
4789   gimple *new_phi = NULL, *phi;
4790   gimple_stmt_iterator exit_gsi;
4791   tree vec_dest;
4792   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4793   gimple *epilog_stmt = NULL;
4794   enum tree_code code = gimple_assign_rhs_code (stmt);
4795   gimple *exit_phi;
4796   tree bitsize;
4797   tree adjustment_def = NULL;
4798   tree vec_initial_def = NULL;
4799   tree expr, def, initial_def = NULL;
4800   tree orig_name, scalar_result;
4801   imm_use_iterator imm_iter, phi_imm_iter;
4802   use_operand_p use_p, phi_use_p;
4803   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4804   bool nested_in_vect_loop = false;
4805   auto_vec<gimple *> new_phis;
4806   auto_vec<gimple *> inner_phis;
4807   enum vect_def_type dt = vect_unknown_def_type;
4808   int j, i;
4809   auto_vec<tree> scalar_results;
4810   unsigned int group_size = 1, k, ratio;
4811   auto_vec<tree> vec_initial_defs;
4812   auto_vec<gimple *> phis;
4813   bool slp_reduc = false;
4814   bool direct_slp_reduc;
4815   tree new_phi_result;
4816   gimple *inner_phi = NULL;
4817   tree induction_index = NULL_TREE;
4818
4819   if (slp_node)
4820     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4821
4822   if (nested_in_vect_loop_p (loop, stmt))
4823     {
4824       outer_loop = loop;
4825       loop = loop->inner;
4826       nested_in_vect_loop = true;
4827       gcc_assert (!slp_node);
4828     }
4829
4830   vectype = STMT_VINFO_VECTYPE (stmt_info);
4831   gcc_assert (vectype);
4832   mode = TYPE_MODE (vectype);
4833
4834   /* 1. Create the reduction def-use cycle:
4835      Set the arguments of REDUCTION_PHIS, i.e., transform
4836
4837         loop:
4838           vec_def = phi <null, null>            # REDUCTION_PHI
4839           VECT_DEF = vector_stmt                # vectorized form of STMT
4840           ...
4841
4842      into:
4843
4844         loop:
4845           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4846           VECT_DEF = vector_stmt                # vectorized form of STMT
4847           ...
4848
4849      (in case of SLP, do it for all the phis). */
4850
4851   /* Get the loop-entry arguments.  */
4852   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4853   if (slp_node)
4854     {
4855       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4856       vec_initial_defs.reserve (vec_num);
4857       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4858                                       &vec_initial_defs, vec_num,
4859                                       GROUP_FIRST_ELEMENT (stmt_info),
4860                                       neutral_op);
4861     }
4862   else
4863     {
4864       /* Get at the scalar def before the loop, that defines the initial value
4865          of the reduction variable.  */
4866       gimple *def_stmt;
4867       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4868                                            loop_preheader_edge (loop));
4869       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4870          and we can't use zero for induc_val, use initial_def.  Similarly
4871          for REDUC_MIN and initial_def larger than the base.  */
4872       if (TREE_CODE (initial_def) == INTEGER_CST
4873           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4874               == INTEGER_INDUC_COND_REDUCTION)
4875           && !integer_zerop (induc_val)
4876           && ((induc_code == MAX_EXPR
4877                && tree_int_cst_lt (initial_def, induc_val))
4878               || (induc_code == MIN_EXPR
4879                   && tree_int_cst_lt (induc_val, initial_def))))
4880         induc_val = initial_def;
4881       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4882       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4883                                                        &adjustment_def);
4884       vec_initial_defs.create (1);
4885       vec_initial_defs.quick_push (vec_initial_def);
4886     }
4887
4888   /* Set phi nodes arguments.  */
4889   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4890     {
4891       tree vec_init_def = vec_initial_defs[i];
4892       tree def = vect_defs[i];
4893       for (j = 0; j < ncopies; j++)
4894         {
4895           if (j != 0)
4896             {
4897               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4898               if (nested_in_vect_loop)
4899                 vec_init_def
4900                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4901                                                     vec_init_def);
4902             }
4903
4904           /* Set the loop-entry arg of the reduction-phi.  */
4905
4906           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4907               == INTEGER_INDUC_COND_REDUCTION)
4908             {
4909               /* Initialise the reduction phi to zero.  This prevents initial
4910                  values of non-zero interferring with the reduction op.  */
4911               gcc_assert (ncopies == 1);
4912               gcc_assert (i == 0);
4913
4914               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4915               tree induc_val_vec
4916                 = build_vector_from_val (vec_init_def_type, induc_val);
4917
4918               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4919                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4920             }
4921           else
4922             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4923                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4924
4925           /* Set the loop-latch arg for the reduction-phi.  */
4926           if (j > 0)
4927             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4928
4929           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4930                        UNKNOWN_LOCATION);
4931
4932           if (dump_enabled_p ())
4933             {
4934               dump_printf_loc (MSG_NOTE, vect_location,
4935                                "transform reduction: created def-use cycle: ");
4936               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4937               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4938             }
4939         }
4940     }
4941
4942   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4943      which is updated with the current index of the loop for every match of
4944      the original loop's cond_expr (VEC_STMT).  This results in a vector
4945      containing the last time the condition passed for that vector lane.
4946      The first match will be a 1 to allow 0 to be used for non-matching
4947      indexes.  If there are no matches at all then the vector will be all
4948      zeroes.  */
4949   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4950     {
4951       tree indx_before_incr, indx_after_incr;
4952       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4953
4954       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4955       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4956
4957       int scalar_precision
4958         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4959       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4960       tree cr_index_vector_type = build_vector_type
4961         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4962
4963       /* First we create a simple vector induction variable which starts
4964          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4965          vector size (STEP).  */
4966
4967       /* Create a {1,2,3,...} vector.  */
4968       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4969
4970       /* Create a vector of the step value.  */
4971       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4972       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4973
4974       /* Create an induction variable.  */
4975       gimple_stmt_iterator incr_gsi;
4976       bool insert_after;
4977       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4978       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4979                  insert_after, &indx_before_incr, &indx_after_incr);
4980
4981       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4982          filled with zeros (VEC_ZERO).  */
4983
4984       /* Create a vector of 0s.  */
4985       tree zero = build_zero_cst (cr_index_scalar_type);
4986       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4987
4988       /* Create a vector phi node.  */
4989       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4990       new_phi = create_phi_node (new_phi_tree, loop->header);
4991       set_vinfo_for_stmt (new_phi,
4992                           new_stmt_vec_info (new_phi, loop_vinfo));
4993       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4994                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4995
4996       /* Now take the condition from the loops original cond_expr
4997          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4998          every match uses values from the induction variable
4999          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5000          (NEW_PHI_TREE).
5001          Finally, we update the phi (NEW_PHI_TREE) to take the value of
5002          the new cond_expr (INDEX_COND_EXPR).  */
5003
5004       /* Duplicate the condition from vec_stmt.  */
5005       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
5006
5007       /* Create a conditional, where the condition is taken from vec_stmt
5008          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
5009          else is the phi (NEW_PHI_TREE).  */
5010       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
5011                                      ccompare, indx_before_incr,
5012                                      new_phi_tree);
5013       induction_index = make_ssa_name (cr_index_vector_type);
5014       gimple *index_condition = gimple_build_assign (induction_index,
5015                                                      index_cond_expr);
5016       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
5017       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
5018                                                         loop_vinfo);
5019       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
5020       set_vinfo_for_stmt (index_condition, index_vec_info);
5021
5022       /* Update the phi with the vec cond.  */
5023       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5024                    loop_latch_edge (loop), UNKNOWN_LOCATION);
5025     }
5026
5027   /* 2. Create epilog code.
5028         The reduction epilog code operates across the elements of the vector
5029         of partial results computed by the vectorized loop.
5030         The reduction epilog code consists of:
5031
5032         step 1: compute the scalar result in a vector (v_out2)
5033         step 2: extract the scalar result (s_out3) from the vector (v_out2)
5034         step 3: adjust the scalar result (s_out3) if needed.
5035
5036         Step 1 can be accomplished using one the following three schemes:
5037           (scheme 1) using reduc_fn, if available.
5038           (scheme 2) using whole-vector shifts, if available.
5039           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5040                      combined.
5041
5042           The overall epilog code looks like this:
5043
5044           s_out0 = phi <s_loop>         # original EXIT_PHI
5045           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5046           v_out2 = reduce <v_out1>              # step 1
5047           s_out3 = extract_field <v_out2, 0>    # step 2
5048           s_out4 = adjust_result <s_out3>       # step 3
5049
5050           (step 3 is optional, and steps 1 and 2 may be combined).
5051           Lastly, the uses of s_out0 are replaced by s_out4.  */
5052
5053
5054   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5055          v_out1 = phi <VECT_DEF>
5056          Store them in NEW_PHIS.  */
5057
5058   exit_bb = single_exit (loop)->dest;
5059   prev_phi_info = NULL;
5060   new_phis.create (vect_defs.length ());
5061   FOR_EACH_VEC_ELT (vect_defs, i, def)
5062     {
5063       for (j = 0; j < ncopies; j++)
5064         {
5065           tree new_def = copy_ssa_name (def);
5066           phi = create_phi_node (new_def, exit_bb);
5067           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
5068           if (j == 0)
5069             new_phis.quick_push (phi);
5070           else
5071             {
5072               def = vect_get_vec_def_for_stmt_copy (dt, def);
5073               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5074             }
5075
5076           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5077           prev_phi_info = vinfo_for_stmt (phi);
5078         }
5079     }
5080
5081   /* The epilogue is created for the outer-loop, i.e., for the loop being
5082      vectorized.  Create exit phis for the outer loop.  */
5083   if (double_reduc)
5084     {
5085       loop = outer_loop;
5086       exit_bb = single_exit (loop)->dest;
5087       inner_phis.create (vect_defs.length ());
5088       FOR_EACH_VEC_ELT (new_phis, i, phi)
5089         {
5090           tree new_result = copy_ssa_name (PHI_RESULT (phi));
5091           gphi *outer_phi = create_phi_node (new_result, exit_bb);
5092           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5093                            PHI_RESULT (phi));
5094           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5095                                                             loop_vinfo));
5096           inner_phis.quick_push (phi);
5097           new_phis[i] = outer_phi;
5098           prev_phi_info = vinfo_for_stmt (outer_phi);
5099           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5100             {
5101               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5102               new_result = copy_ssa_name (PHI_RESULT (phi));
5103               outer_phi = create_phi_node (new_result, exit_bb);
5104               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5105                                PHI_RESULT (phi));
5106               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5107                                                                 loop_vinfo));
5108               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5109               prev_phi_info = vinfo_for_stmt (outer_phi);
5110             }
5111         }
5112     }
5113
5114   exit_gsi = gsi_after_labels (exit_bb);
5115
5116   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5117          (i.e. when reduc_fn is not available) and in the final adjustment
5118          code (if needed).  Also get the original scalar reduction variable as
5119          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5120          represents a reduction pattern), the tree-code and scalar-def are
5121          taken from the original stmt that the pattern-stmt (STMT) replaces.
5122          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5123          are taken from STMT.  */
5124
5125   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5126   if (!orig_stmt)
5127     {
5128       /* Regular reduction  */
5129       orig_stmt = stmt;
5130     }
5131   else
5132     {
5133       /* Reduction pattern  */
5134       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5135       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5136       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5137     }
5138
5139   code = gimple_assign_rhs_code (orig_stmt);
5140   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5141      partial results are added and not subtracted.  */
5142   if (code == MINUS_EXPR)
5143     code = PLUS_EXPR;
5144
5145   scalar_dest = gimple_assign_lhs (orig_stmt);
5146   scalar_type = TREE_TYPE (scalar_dest);
5147   scalar_results.create (group_size);
5148   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5149   bitsize = TYPE_SIZE (scalar_type);
5150
5151   /* In case this is a reduction in an inner-loop while vectorizing an outer
5152      loop - we don't need to extract a single scalar result at the end of the
5153      inner-loop (unless it is double reduction, i.e., the use of reduction is
5154      outside the outer-loop).  The final vector of partial results will be used
5155      in the vectorized outer-loop, or reduced to a scalar result at the end of
5156      the outer-loop.  */
5157   if (nested_in_vect_loop && !double_reduc)
5158     goto vect_finalize_reduction;
5159
5160   /* SLP reduction without reduction chain, e.g.,
5161      # a1 = phi <a2, a0>
5162      # b1 = phi <b2, b0>
5163      a2 = operation (a1)
5164      b2 = operation (b1)  */
5165   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5166
5167   /* True if we should implement SLP_REDUC using native reduction operations
5168      instead of scalar operations.  */
5169   direct_slp_reduc = (reduc_fn != IFN_LAST
5170                       && slp_reduc
5171                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5172
5173   /* In case of reduction chain, e.g.,
5174      # a1 = phi <a3, a0>
5175      a2 = operation (a1)
5176      a3 = operation (a2),
5177
5178      we may end up with more than one vector result.  Here we reduce them to
5179      one vector.  */
5180   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5181     {
5182       tree first_vect = PHI_RESULT (new_phis[0]);
5183       gassign *new_vec_stmt = NULL;
5184       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5185       for (k = 1; k < new_phis.length (); k++)
5186         {
5187           gimple *next_phi = new_phis[k];
5188           tree second_vect = PHI_RESULT (next_phi);
5189           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5190           new_vec_stmt = gimple_build_assign (tem, code,
5191                                               first_vect, second_vect);
5192           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5193           first_vect = tem;
5194         }
5195
5196       new_phi_result = first_vect;
5197       if (new_vec_stmt)
5198         {
5199           new_phis.truncate (0);
5200           new_phis.safe_push (new_vec_stmt);
5201         }
5202     }
5203   /* Likewise if we couldn't use a single defuse cycle.  */
5204   else if (ncopies > 1)
5205     {
5206       gcc_assert (new_phis.length () == 1);
5207       tree first_vect = PHI_RESULT (new_phis[0]);
5208       gassign *new_vec_stmt = NULL;
5209       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5210       gimple *next_phi = new_phis[0];
5211       for (int k = 1; k < ncopies; ++k)
5212         {
5213           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5214           tree second_vect = PHI_RESULT (next_phi);
5215           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5216           new_vec_stmt = gimple_build_assign (tem, code,
5217                                               first_vect, second_vect);
5218           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5219           first_vect = tem;
5220         }
5221       new_phi_result = first_vect;
5222       new_phis.truncate (0);
5223       new_phis.safe_push (new_vec_stmt);
5224     }
5225   else
5226     new_phi_result = PHI_RESULT (new_phis[0]);
5227
5228   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5229       && reduc_fn != IFN_LAST)
5230     {
5231       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5232          various data values where the condition matched and another vector
5233          (INDUCTION_INDEX) containing all the indexes of those matches.  We
5234          need to extract the last matching index (which will be the index with
5235          highest value) and use this to index into the data vector.
5236          For the case where there were no matches, the data vector will contain
5237          all default values and the index vector will be all zeros.  */
5238
5239       /* Get various versions of the type of the vector of indexes.  */
5240       tree index_vec_type = TREE_TYPE (induction_index);
5241       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5242       tree index_scalar_type = TREE_TYPE (index_vec_type);
5243       tree index_vec_cmp_type = build_same_sized_truth_vector_type
5244         (index_vec_type);
5245
5246       /* Get an unsigned integer version of the type of the data vector.  */
5247       int scalar_precision
5248         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5249       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5250       tree vectype_unsigned = build_vector_type
5251         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5252
5253       /* First we need to create a vector (ZERO_VEC) of zeros and another
5254          vector (MAX_INDEX_VEC) filled with the last matching index, which we
5255          can create using a MAX reduction and then expanding.
5256          In the case where the loop never made any matches, the max index will
5257          be zero.  */
5258
5259       /* Vector of {0, 0, 0,...}.  */
5260       tree zero_vec = make_ssa_name (vectype);
5261       tree zero_vec_rhs = build_zero_cst (vectype);
5262       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5263       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5264
5265       /* Find maximum value from the vector of found indexes.  */
5266       tree max_index = make_ssa_name (index_scalar_type);
5267       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5268                                                           1, induction_index);
5269       gimple_call_set_lhs (max_index_stmt, max_index);
5270       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5271
5272       /* Vector of {max_index, max_index, max_index,...}.  */
5273       tree max_index_vec = make_ssa_name (index_vec_type);
5274       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5275                                                       max_index);
5276       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5277                                                         max_index_vec_rhs);
5278       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5279
5280       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5281          with the vector (INDUCTION_INDEX) of found indexes, choosing values
5282          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5283          otherwise.  Only one value should match, resulting in a vector
5284          (VEC_COND) with one data value and the rest zeros.
5285          In the case where the loop never made any matches, every index will
5286          match, resulting in a vector with all data values (which will all be
5287          the default value).  */
5288
5289       /* Compare the max index vector to the vector of found indexes to find
5290          the position of the max value.  */
5291       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5292       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5293                                                       induction_index,
5294                                                       max_index_vec);
5295       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5296
5297       /* Use the compare to choose either values from the data vector or
5298          zero.  */
5299       tree vec_cond = make_ssa_name (vectype);
5300       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5301                                                    vec_compare, new_phi_result,
5302                                                    zero_vec);
5303       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5304
5305       /* Finally we need to extract the data value from the vector (VEC_COND)
5306          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5307          reduction, but because this doesn't exist, we can use a MAX reduction
5308          instead.  The data value might be signed or a float so we need to cast
5309          it first.
5310          In the case where the loop never made any matches, the data values are
5311          all identical, and so will reduce down correctly.  */
5312
5313       /* Make the matched data values unsigned.  */
5314       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5315       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5316                                        vec_cond);
5317       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5318                                                         VIEW_CONVERT_EXPR,
5319                                                         vec_cond_cast_rhs);
5320       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5321
5322       /* Reduce down to a scalar value.  */
5323       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5324       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5325                                                            1, vec_cond_cast);
5326       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5327       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5328
5329       /* Convert the reduced value back to the result type and set as the
5330          result.  */
5331       gimple_seq stmts = NULL;
5332       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5333                                data_reduc);
5334       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5335       scalar_results.safe_push (new_temp);
5336     }
5337   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5338            && reduc_fn == IFN_LAST)
5339     {
5340       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5341          idx = 0;
5342          idx_val = induction_index[0];
5343          val = data_reduc[0];
5344          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5345            if (induction_index[i] > idx_val)
5346              val = data_reduc[i], idx_val = induction_index[i];
5347          return val;  */
5348
5349       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5350       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5351       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5352       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5353       /* Enforced by vectorizable_reduction, which ensures we have target
5354          support before allowing a conditional reduction on variable-length
5355          vectors.  */
5356       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5357       tree idx_val = NULL_TREE, val = NULL_TREE;
5358       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5359         {
5360           tree old_idx_val = idx_val;
5361           tree old_val = val;
5362           idx_val = make_ssa_name (idx_eltype);
5363           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5364                                              build3 (BIT_FIELD_REF, idx_eltype,
5365                                                      induction_index,
5366                                                      bitsize_int (el_size),
5367                                                      bitsize_int (off)));
5368           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5369           val = make_ssa_name (data_eltype);
5370           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5371                                              build3 (BIT_FIELD_REF,
5372                                                      data_eltype,
5373                                                      new_phi_result,
5374                                                      bitsize_int (el_size),
5375                                                      bitsize_int (off)));
5376           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5377           if (off != 0)
5378             {
5379               tree new_idx_val = idx_val;
5380               tree new_val = val;
5381               if (off != v_size - el_size)
5382                 {
5383                   new_idx_val = make_ssa_name (idx_eltype);
5384                   epilog_stmt = gimple_build_assign (new_idx_val,
5385                                                      MAX_EXPR, idx_val,
5386                                                      old_idx_val);
5387                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5388                 }
5389               new_val = make_ssa_name (data_eltype);
5390               epilog_stmt = gimple_build_assign (new_val,
5391                                                  COND_EXPR,
5392                                                  build2 (GT_EXPR,
5393                                                          boolean_type_node,
5394                                                          idx_val,
5395                                                          old_idx_val),
5396                                                  val, old_val);
5397               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5398               idx_val = new_idx_val;
5399               val = new_val;
5400             }
5401         }
5402       /* Convert the reduced value back to the result type and set as the
5403          result.  */
5404       gimple_seq stmts = NULL;
5405       val = gimple_convert (&stmts, scalar_type, val);
5406       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5407       scalar_results.safe_push (val);
5408     }
5409
5410   /* 2.3 Create the reduction code, using one of the three schemes described
5411          above. In SLP we simply need to extract all the elements from the
5412          vector (without reducing them), so we use scalar shifts.  */
5413   else if (reduc_fn != IFN_LAST && !slp_reduc)
5414     {
5415       tree tmp;
5416       tree vec_elem_type;
5417
5418       /* Case 1:  Create:
5419          v_out2 = reduc_expr <v_out1>  */
5420
5421       if (dump_enabled_p ())
5422         dump_printf_loc (MSG_NOTE, vect_location,
5423                          "Reduce using direct vector reduction.\n");
5424
5425       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5426       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5427         {
5428           tree tmp_dest
5429             = vect_create_destination_var (scalar_dest, vec_elem_type);
5430           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5431                                                     new_phi_result);
5432           gimple_set_lhs (epilog_stmt, tmp_dest);
5433           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5434           gimple_set_lhs (epilog_stmt, new_temp);
5435           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5436
5437           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5438                                              new_temp);
5439         }
5440       else
5441         {
5442           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5443                                                     new_phi_result);
5444           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5445         }
5446
5447       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5448       gimple_set_lhs (epilog_stmt, new_temp);
5449       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5450
5451       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5452            == INTEGER_INDUC_COND_REDUCTION)
5453           && !operand_equal_p (initial_def, induc_val, 0))
5454         {
5455           /* Earlier we set the initial value to be a vector if induc_val
5456              values.  Check the result and if it is induc_val then replace
5457              with the original initial value, unless induc_val is
5458              the same as initial_def already.  */
5459           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5460                                   induc_val);
5461
5462           tmp = make_ssa_name (new_scalar_dest);
5463           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5464                                              initial_def, new_temp);
5465           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5466           new_temp = tmp;
5467         }
5468
5469       scalar_results.safe_push (new_temp);
5470     }
5471   else if (direct_slp_reduc)
5472     {
5473       /* Here we create one vector for each of the GROUP_SIZE results,
5474          with the elements for other SLP statements replaced with the
5475          neutral value.  We can then do a normal reduction on each vector.  */
5476
5477       /* Enforced by vectorizable_reduction.  */
5478       gcc_assert (new_phis.length () == 1);
5479       gcc_assert (pow2p_hwi (group_size));
5480
5481       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5482       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5483       gimple_seq seq = NULL;
5484
5485       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5486          and the same element size as VECTYPE.  */
5487       tree index = build_index_vector (vectype, 0, 1);
5488       tree index_type = TREE_TYPE (index);
5489       tree index_elt_type = TREE_TYPE (index_type);
5490       tree mask_type = build_same_sized_truth_vector_type (index_type);
5491
5492       /* Create a vector that, for each element, identifies which of
5493          the GROUP_SIZE results should use it.  */
5494       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5495       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5496                             build_vector_from_val (index_type, index_mask));
5497
5498       /* Get a neutral vector value.  This is simply a splat of the neutral
5499          scalar value if we have one, otherwise the initial scalar value
5500          is itself a neutral value.  */
5501       tree vector_identity = NULL_TREE;
5502       if (neutral_op)
5503         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5504                                                         neutral_op);
5505       for (unsigned int i = 0; i < group_size; ++i)
5506         {
5507           /* If there's no univeral neutral value, we can use the
5508              initial scalar value from the original PHI.  This is used
5509              for MIN and MAX reduction, for example.  */
5510           if (!neutral_op)
5511             {
5512               tree scalar_value
5513                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5514                                          loop_preheader_edge (loop));
5515               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5516                                                               scalar_value);
5517             }
5518
5519           /* Calculate the equivalent of:
5520
5521              sel[j] = (index[j] == i);
5522
5523              which selects the elements of NEW_PHI_RESULT that should
5524              be included in the result.  */
5525           tree compare_val = build_int_cst (index_elt_type, i);
5526           compare_val = build_vector_from_val (index_type, compare_val);
5527           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5528                                    index, compare_val);
5529
5530           /* Calculate the equivalent of:
5531
5532              vec = seq ? new_phi_result : vector_identity;
5533
5534              VEC is now suitable for a full vector reduction.  */
5535           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5536                                    sel, new_phi_result, vector_identity);
5537
5538           /* Do the reduction and convert it to the appropriate type.  */
5539           gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5540           tree scalar = make_ssa_name (TREE_TYPE (vectype));
5541           gimple_call_set_lhs (call, scalar);
5542           gimple_seq_add_stmt (&seq, call);
5543           scalar = gimple_convert (&seq, scalar_type, scalar);
5544           scalar_results.safe_push (scalar);
5545         }
5546       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5547     }
5548   else
5549     {
5550       bool reduce_with_shift;
5551       tree vec_temp;
5552
5553       /* COND reductions all do the final reduction with MAX_EXPR
5554          or MIN_EXPR.  */
5555       if (code == COND_EXPR)
5556         {
5557           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5558               == INTEGER_INDUC_COND_REDUCTION)
5559             code = induc_code;
5560           else
5561             code = MAX_EXPR;
5562         }
5563
5564       /* See if the target wants to do the final (shift) reduction
5565          in a vector mode of smaller size and first reduce upper/lower
5566          halves against each other.  */
5567       enum machine_mode mode1 = mode;
5568       tree vectype1 = vectype;
5569       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5570       unsigned sz1 = sz;
5571       if (!slp_reduc
5572           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5573         sz1 = GET_MODE_SIZE (mode1).to_constant ();
5574
5575       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5576       reduce_with_shift = have_whole_vector_shift (mode1);
5577       if (!VECTOR_MODE_P (mode1))
5578         reduce_with_shift = false;
5579       else
5580         {
5581           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5582           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5583             reduce_with_shift = false;
5584         }
5585
5586       /* First reduce the vector to the desired vector size we should
5587          do shift reduction on by combining upper and lower halves.  */
5588       new_temp = new_phi_result;
5589       while (sz > sz1)
5590         {
5591           gcc_assert (!slp_reduc);
5592           sz /= 2;
5593           vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5594
5595           /* The target has to make sure we support lowpart/highpart
5596              extraction, either via direct vector extract or through
5597              an integer mode punning.  */
5598           tree dst1, dst2;
5599           if (convert_optab_handler (vec_extract_optab,
5600                                      TYPE_MODE (TREE_TYPE (new_temp)),
5601                                      TYPE_MODE (vectype1))
5602               != CODE_FOR_nothing)
5603             {
5604               /* Extract sub-vectors directly once vec_extract becomes
5605                  a conversion optab.  */
5606               dst1 = make_ssa_name (vectype1);
5607               epilog_stmt
5608                   = gimple_build_assign (dst1, BIT_FIELD_REF,
5609                                          build3 (BIT_FIELD_REF, vectype1,
5610                                                  new_temp, TYPE_SIZE (vectype1),
5611                                                  bitsize_int (0)));
5612               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5613               dst2 =  make_ssa_name (vectype1);
5614               epilog_stmt
5615                   = gimple_build_assign (dst2, BIT_FIELD_REF,
5616                                          build3 (BIT_FIELD_REF, vectype1,
5617                                                  new_temp, TYPE_SIZE (vectype1),
5618                                                  bitsize_int (sz * BITS_PER_UNIT)));
5619               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5620             }
5621           else
5622             {
5623               /* Extract via punning to appropriately sized integer mode
5624                  vector.  */
5625               tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5626                                                             1);
5627               tree etype = build_vector_type (eltype, 2);
5628               gcc_assert (convert_optab_handler (vec_extract_optab,
5629                                                  TYPE_MODE (etype),
5630                                                  TYPE_MODE (eltype))
5631                           != CODE_FOR_nothing);
5632               tree tem = make_ssa_name (etype);
5633               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5634                                                  build1 (VIEW_CONVERT_EXPR,
5635                                                          etype, new_temp));
5636               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5637               new_temp = tem;
5638               tem = make_ssa_name (eltype);
5639               epilog_stmt
5640                   = gimple_build_assign (tem, BIT_FIELD_REF,
5641                                          build3 (BIT_FIELD_REF, eltype,
5642                                                  new_temp, TYPE_SIZE (eltype),
5643                                                  bitsize_int (0)));
5644               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5645               dst1 = make_ssa_name (vectype1);
5646               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5647                                                  build1 (VIEW_CONVERT_EXPR,
5648                                                          vectype1, tem));
5649               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5650               tem = make_ssa_name (eltype);
5651               epilog_stmt
5652                   = gimple_build_assign (tem, BIT_FIELD_REF,
5653                                          build3 (BIT_FIELD_REF, eltype,
5654                                                  new_temp, TYPE_SIZE (eltype),
5655                                                  bitsize_int (sz * BITS_PER_UNIT)));
5656               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5657               dst2 =  make_ssa_name (vectype1);
5658               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5659                                                  build1 (VIEW_CONVERT_EXPR,
5660                                                          vectype1, tem));
5661               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5662             }
5663
5664           new_temp = make_ssa_name (vectype1);
5665           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5666           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5667         }
5668
5669       if (reduce_with_shift && !slp_reduc)
5670         {
5671           int element_bitsize = tree_to_uhwi (bitsize);
5672           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5673              for variable-length vectors and also requires direct target support
5674              for loop reductions.  */
5675           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5676           int nelements = vec_size_in_bits / element_bitsize;
5677           vec_perm_builder sel;
5678           vec_perm_indices indices;
5679
5680           int elt_offset;
5681
5682           tree zero_vec = build_zero_cst (vectype1);
5683           /* Case 2: Create:
5684              for (offset = nelements/2; offset >= 1; offset/=2)
5685                 {
5686                   Create:  va' = vec_shift <va, offset>
5687                   Create:  va = vop <va, va'>
5688                 }  */
5689
5690           tree rhs;
5691
5692           if (dump_enabled_p ())
5693             dump_printf_loc (MSG_NOTE, vect_location,
5694                              "Reduce using vector shifts\n");
5695
5696           mode1 = TYPE_MODE (vectype1);
5697           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5698           for (elt_offset = nelements / 2;
5699                elt_offset >= 1;
5700                elt_offset /= 2)
5701             {
5702               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5703               indices.new_vector (sel, 2, nelements);
5704               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5705               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5706                                                  new_temp, zero_vec, mask);
5707               new_name = make_ssa_name (vec_dest, epilog_stmt);
5708               gimple_assign_set_lhs (epilog_stmt, new_name);
5709               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5710
5711               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5712                                                  new_temp);
5713               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5714               gimple_assign_set_lhs (epilog_stmt, new_temp);
5715               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5716             }
5717
5718           /* 2.4  Extract the final scalar result.  Create:
5719              s_out3 = extract_field <v_out2, bitpos>  */
5720
5721           if (dump_enabled_p ())
5722             dump_printf_loc (MSG_NOTE, vect_location,
5723                              "extract scalar result\n");
5724
5725           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5726                         bitsize, bitsize_zero_node);
5727           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5728           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5729           gimple_assign_set_lhs (epilog_stmt, new_temp);
5730           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5731           scalar_results.safe_push (new_temp);
5732         }
5733       else
5734         {
5735           /* Case 3: Create:
5736              s = extract_field <v_out2, 0>
5737              for (offset = element_size;
5738                   offset < vector_size;
5739                   offset += element_size;)
5740                {
5741                  Create:  s' = extract_field <v_out2, offset>
5742                  Create:  s = op <s, s'>  // For non SLP cases
5743                }  */
5744
5745           if (dump_enabled_p ())
5746             dump_printf_loc (MSG_NOTE, vect_location,
5747                              "Reduce using scalar code.\n");
5748
5749           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5750           int element_bitsize = tree_to_uhwi (bitsize);
5751           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5752             {
5753               int bit_offset;
5754               if (gimple_code (new_phi) == GIMPLE_PHI)
5755                 vec_temp = PHI_RESULT (new_phi);
5756               else
5757                 vec_temp = gimple_assign_lhs (new_phi);
5758               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5759                                  bitsize_zero_node);
5760               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5761               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5762               gimple_assign_set_lhs (epilog_stmt, new_temp);
5763               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5764
5765               /* In SLP we don't need to apply reduction operation, so we just
5766                  collect s' values in SCALAR_RESULTS.  */
5767               if (slp_reduc)
5768                 scalar_results.safe_push (new_temp);
5769
5770               for (bit_offset = element_bitsize;
5771                    bit_offset < vec_size_in_bits;
5772                    bit_offset += element_bitsize)
5773                 {
5774                   tree bitpos = bitsize_int (bit_offset);
5775                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5776                                      bitsize, bitpos);
5777
5778                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5779                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5780                   gimple_assign_set_lhs (epilog_stmt, new_name);
5781                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5782
5783                   if (slp_reduc)
5784                     {
5785                       /* In SLP we don't need to apply reduction operation, so
5786                          we just collect s' values in SCALAR_RESULTS.  */
5787                       new_temp = new_name;
5788                       scalar_results.safe_push (new_name);
5789                     }
5790                   else
5791                     {
5792                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5793                                                          new_name, new_temp);
5794                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5795                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5796                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5797                     }
5798                 }
5799             }
5800
5801           /* The only case where we need to reduce scalar results in SLP, is
5802              unrolling.  If the size of SCALAR_RESULTS is greater than
5803              GROUP_SIZE, we reduce them combining elements modulo
5804              GROUP_SIZE.  */
5805           if (slp_reduc)
5806             {
5807               tree res, first_res, new_res;
5808               gimple *new_stmt;
5809
5810               /* Reduce multiple scalar results in case of SLP unrolling.  */
5811               for (j = group_size; scalar_results.iterate (j, &res);
5812                    j++)
5813                 {
5814                   first_res = scalar_results[j % group_size];
5815                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5816                                                   first_res, res);
5817                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5818                   gimple_assign_set_lhs (new_stmt, new_res);
5819                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5820                   scalar_results[j % group_size] = new_res;
5821                 }
5822             }
5823           else
5824             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5825             scalar_results.safe_push (new_temp);
5826         }
5827
5828       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5829            == INTEGER_INDUC_COND_REDUCTION)
5830           && !operand_equal_p (initial_def, induc_val, 0))
5831         {
5832           /* Earlier we set the initial value to be a vector if induc_val
5833              values.  Check the result and if it is induc_val then replace
5834              with the original initial value, unless induc_val is
5835              the same as initial_def already.  */
5836           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5837                                   induc_val);
5838
5839           tree tmp = make_ssa_name (new_scalar_dest);
5840           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5841                                              initial_def, new_temp);
5842           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5843           scalar_results[0] = tmp;
5844         }
5845     }
5846
5847 vect_finalize_reduction:
5848
5849   if (double_reduc)
5850     loop = loop->inner;
5851
5852   /* 2.5 Adjust the final result by the initial value of the reduction
5853          variable. (When such adjustment is not needed, then
5854          'adjustment_def' is zero).  For example, if code is PLUS we create:
5855          new_temp = loop_exit_def + adjustment_def  */
5856
5857   if (adjustment_def)
5858     {
5859       gcc_assert (!slp_reduc);
5860       if (nested_in_vect_loop)
5861         {
5862           new_phi = new_phis[0];
5863           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5864           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5865           new_dest = vect_create_destination_var (scalar_dest, vectype);
5866         }
5867       else
5868         {
5869           new_temp = scalar_results[0];
5870           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5871           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5872           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5873         }
5874
5875       epilog_stmt = gimple_build_assign (new_dest, expr);
5876       new_temp = make_ssa_name (new_dest, epilog_stmt);
5877       gimple_assign_set_lhs (epilog_stmt, new_temp);
5878       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5879       if (nested_in_vect_loop)
5880         {
5881           set_vinfo_for_stmt (epilog_stmt,
5882                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5883           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5884                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5885
5886           if (!double_reduc)
5887             scalar_results.quick_push (new_temp);
5888           else
5889             scalar_results[0] = new_temp;
5890         }
5891       else
5892         scalar_results[0] = new_temp;
5893
5894       new_phis[0] = epilog_stmt;
5895     }
5896
5897   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5898           phis with new adjusted scalar results, i.e., replace use <s_out0>
5899           with use <s_out4>.
5900
5901      Transform:
5902         loop_exit:
5903           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5904           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5905           v_out2 = reduce <v_out1>
5906           s_out3 = extract_field <v_out2, 0>
5907           s_out4 = adjust_result <s_out3>
5908           use <s_out0>
5909           use <s_out0>
5910
5911      into:
5912
5913         loop_exit:
5914           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5915           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5916           v_out2 = reduce <v_out1>
5917           s_out3 = extract_field <v_out2, 0>
5918           s_out4 = adjust_result <s_out3>
5919           use <s_out4>
5920           use <s_out4> */
5921
5922
5923   /* In SLP reduction chain we reduce vector results into one vector if
5924      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5925      the last stmt in the reduction chain, since we are looking for the loop
5926      exit phi node.  */
5927   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5928     {
5929       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5930       /* Handle reduction patterns.  */
5931       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5932         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5933
5934       scalar_dest = gimple_assign_lhs (dest_stmt);
5935       group_size = 1;
5936     }
5937
5938   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5939      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5940      need to match SCALAR_RESULTS with corresponding statements.  The first
5941      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5942      the first vector stmt, etc.
5943      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5944   if (group_size > new_phis.length ())
5945     {
5946       ratio = group_size / new_phis.length ();
5947       gcc_assert (!(group_size % new_phis.length ()));
5948     }
5949   else
5950     ratio = 1;
5951
5952   for (k = 0; k < group_size; k++)
5953     {
5954       if (k % ratio == 0)
5955         {
5956           epilog_stmt = new_phis[k / ratio];
5957           reduction_phi = reduction_phis[k / ratio];
5958           if (double_reduc)
5959             inner_phi = inner_phis[k / ratio];
5960         }
5961
5962       if (slp_reduc)
5963         {
5964           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5965
5966           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5967           /* SLP statements can't participate in patterns.  */
5968           gcc_assert (!orig_stmt);
5969           scalar_dest = gimple_assign_lhs (current_stmt);
5970         }
5971
5972       phis.create (3);
5973       /* Find the loop-closed-use at the loop exit of the original scalar
5974          result.  (The reduction result is expected to have two immediate uses -
5975          one at the latch block, and one at the loop exit).  */
5976       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5977         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5978             && !is_gimple_debug (USE_STMT (use_p)))
5979           phis.safe_push (USE_STMT (use_p));
5980
5981       /* While we expect to have found an exit_phi because of loop-closed-ssa
5982          form we can end up without one if the scalar cycle is dead.  */
5983
5984       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5985         {
5986           if (outer_loop)
5987             {
5988               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5989               gphi *vect_phi;
5990
5991               /* FORNOW. Currently not supporting the case that an inner-loop
5992                  reduction is not used in the outer-loop (but only outside the
5993                  outer-loop), unless it is double reduction.  */
5994               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5995                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5996                           || double_reduc);
5997
5998               if (double_reduc)
5999                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
6000               else
6001                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
6002               if (!double_reduc
6003                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
6004                       != vect_double_reduction_def)
6005                 continue;
6006
6007               /* Handle double reduction:
6008
6009                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
6010                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
6011                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
6012                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
6013
6014                  At that point the regular reduction (stmt2 and stmt3) is
6015                  already vectorized, as well as the exit phi node, stmt4.
6016                  Here we vectorize the phi node of double reduction, stmt1, and
6017                  update all relevant statements.  */
6018
6019               /* Go through all the uses of s2 to find double reduction phi
6020                  node, i.e., stmt1 above.  */
6021               orig_name = PHI_RESULT (exit_phi);
6022               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6023                 {
6024                   stmt_vec_info use_stmt_vinfo;
6025                   stmt_vec_info new_phi_vinfo;
6026                   tree vect_phi_init, preheader_arg, vect_phi_res;
6027                   basic_block bb = gimple_bb (use_stmt);
6028                   gimple *use;
6029
6030                   /* Check that USE_STMT is really double reduction phi
6031                      node.  */
6032                   if (gimple_code (use_stmt) != GIMPLE_PHI
6033                       || gimple_phi_num_args (use_stmt) != 2
6034                       || bb->loop_father != outer_loop)
6035                     continue;
6036                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
6037                   if (!use_stmt_vinfo
6038                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
6039                           != vect_double_reduction_def)
6040                     continue;
6041
6042                   /* Create vector phi node for double reduction:
6043                      vs1 = phi <vs0, vs2>
6044                      vs1 was created previously in this function by a call to
6045                        vect_get_vec_def_for_operand and is stored in
6046                        vec_initial_def;
6047                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
6048                      vs0 is created here.  */
6049
6050                   /* Create vector phi node.  */
6051                   vect_phi = create_phi_node (vec_initial_def, bb);
6052                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
6053                                     loop_vec_info_for_loop (outer_loop));
6054                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
6055
6056                   /* Create vs0 - initial def of the double reduction phi.  */
6057                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
6058                                              loop_preheader_edge (outer_loop));
6059                   vect_phi_init = get_initial_def_for_reduction
6060                     (stmt, preheader_arg, NULL);
6061
6062                   /* Update phi node arguments with vs0 and vs2.  */
6063                   add_phi_arg (vect_phi, vect_phi_init,
6064                                loop_preheader_edge (outer_loop),
6065                                UNKNOWN_LOCATION);
6066                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
6067                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6068                   if (dump_enabled_p ())
6069                     {
6070                       dump_printf_loc (MSG_NOTE, vect_location,
6071                                        "created double reduction phi node: ");
6072                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6073                     }
6074
6075                   vect_phi_res = PHI_RESULT (vect_phi);
6076
6077                   /* Replace the use, i.e., set the correct vs1 in the regular
6078                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
6079                      loop is redundant.  */
6080                   use = reduction_phi;
6081                   for (j = 0; j < ncopies; j++)
6082                     {
6083                       edge pr_edge = loop_preheader_edge (loop);
6084                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6085                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6086                     }
6087                 }
6088             }
6089         }
6090
6091       phis.release ();
6092       if (nested_in_vect_loop)
6093         {
6094           if (double_reduc)
6095             loop = outer_loop;
6096           else
6097             continue;
6098         }
6099
6100       phis.create (3);
6101       /* Find the loop-closed-use at the loop exit of the original scalar
6102          result.  (The reduction result is expected to have two immediate uses,
6103          one at the latch block, and one at the loop exit).  For double
6104          reductions we are looking for exit phis of the outer loop.  */
6105       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6106         {
6107           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6108             {
6109               if (!is_gimple_debug (USE_STMT (use_p)))
6110                 phis.safe_push (USE_STMT (use_p));
6111             }
6112           else
6113             {
6114               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6115                 {
6116                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6117
6118                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6119                     {
6120                       if (!flow_bb_inside_loop_p (loop,
6121                                              gimple_bb (USE_STMT (phi_use_p)))
6122                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6123                         phis.safe_push (USE_STMT (phi_use_p));
6124                     }
6125                 }
6126             }
6127         }
6128
6129       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6130         {
6131           /* Replace the uses:  */
6132           orig_name = PHI_RESULT (exit_phi);
6133           scalar_result = scalar_results[k];
6134           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6135             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6136               SET_USE (use_p, scalar_result);
6137         }
6138
6139       phis.release ();
6140     }
6141 }
6142
6143 /* Return a vector of type VECTYPE that is equal to the vector select
6144    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6145    before GSI.  */
6146
6147 static tree
6148 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6149                      tree vec, tree identity)
6150 {
6151   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6152   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6153                                           mask, vec, identity);
6154   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6155   return cond;
6156 }
6157
6158 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6159    order, starting with LHS.  Insert the extraction statements before GSI and
6160    associate the new scalar SSA names with variable SCALAR_DEST.
6161    Return the SSA name for the result.  */
6162
6163 static tree
6164 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6165                        tree_code code, tree lhs, tree vector_rhs)
6166 {
6167   tree vectype = TREE_TYPE (vector_rhs);
6168   tree scalar_type = TREE_TYPE (vectype);
6169   tree bitsize = TYPE_SIZE (scalar_type);
6170   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6171   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6172
6173   for (unsigned HOST_WIDE_INT bit_offset = 0;
6174        bit_offset < vec_size_in_bits;
6175        bit_offset += element_bitsize)
6176     {
6177       tree bitpos = bitsize_int (bit_offset);
6178       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6179                          bitsize, bitpos);
6180
6181       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6182       rhs = make_ssa_name (scalar_dest, stmt);
6183       gimple_assign_set_lhs (stmt, rhs);
6184       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6185
6186       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6187       tree new_name = make_ssa_name (scalar_dest, stmt);
6188       gimple_assign_set_lhs (stmt, new_name);
6189       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6190       lhs = new_name;
6191     }
6192   return lhs;
6193 }
6194
6195 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
6196    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6197    statement.  CODE is the operation performed by STMT and OPS are
6198    its scalar operands.  REDUC_INDEX is the index of the operand in
6199    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6200    implements in-order reduction, or IFN_LAST if we should open-code it.
6201    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6202    that should be used to control the operation in a fully-masked loop.  */
6203
6204 static bool
6205 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6206                                gimple **vec_stmt, slp_tree slp_node,
6207                                gimple *reduc_def_stmt,
6208                                tree_code code, internal_fn reduc_fn,
6209                                tree ops[3], tree vectype_in,
6210                                int reduc_index, vec_loop_masks *masks)
6211 {
6212   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6213   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6214   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6215   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6216   gimple *new_stmt = NULL;
6217
6218   int ncopies;
6219   if (slp_node)
6220     ncopies = 1;
6221   else
6222     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6223
6224   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6225   gcc_assert (ncopies == 1);
6226   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6227   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6228   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6229               == FOLD_LEFT_REDUCTION);
6230
6231   if (slp_node)
6232     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6233                           TYPE_VECTOR_SUBPARTS (vectype_in)));
6234
6235   tree op0 = ops[1 - reduc_index];
6236
6237   int group_size = 1;
6238   gimple *scalar_dest_def;
6239   auto_vec<tree> vec_oprnds0;
6240   if (slp_node)
6241     {
6242       vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6243       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6244       scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6245     }
6246   else
6247     {
6248       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6249       vec_oprnds0.create (1);
6250       vec_oprnds0.quick_push (loop_vec_def0);
6251       scalar_dest_def = stmt;
6252     }
6253
6254   tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6255   tree scalar_type = TREE_TYPE (scalar_dest);
6256   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6257
6258   int vec_num = vec_oprnds0.length ();
6259   gcc_assert (vec_num == 1 || slp_node);
6260   tree vec_elem_type = TREE_TYPE (vectype_out);
6261   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6262
6263   tree vector_identity = NULL_TREE;
6264   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6265     vector_identity = build_zero_cst (vectype_out);
6266
6267   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6268   int i;
6269   tree def0;
6270   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6271     {
6272       tree mask = NULL_TREE;
6273       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6274         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6275
6276       /* Handle MINUS by adding the negative.  */
6277       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6278         {
6279           tree negated = make_ssa_name (vectype_out);
6280           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6281           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6282           def0 = negated;
6283         }
6284
6285       if (mask)
6286         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6287                                     vector_identity);
6288
6289       /* On the first iteration the input is simply the scalar phi
6290          result, and for subsequent iterations it is the output of
6291          the preceding operation.  */
6292       if (reduc_fn != IFN_LAST)
6293         {
6294           new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6295           /* For chained SLP reductions the output of the previous reduction
6296              operation serves as the input of the next. For the final statement
6297              the output cannot be a temporary - we reuse the original
6298              scalar destination of the last statement.  */
6299           if (i != vec_num - 1)
6300             {
6301               gimple_set_lhs (new_stmt, scalar_dest_var);
6302               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6303               gimple_set_lhs (new_stmt, reduc_var);
6304             }
6305         }
6306       else
6307         {
6308           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6309                                              reduc_var, def0);
6310           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6311           /* Remove the statement, so that we can use the same code paths
6312              as for statements that we've just created.  */
6313           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6314           gsi_remove (&tmp_gsi, false);
6315         }
6316
6317       if (i == vec_num - 1)
6318         {
6319           gimple_set_lhs (new_stmt, scalar_dest);
6320           vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6321         }
6322       else
6323         vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6324
6325       if (slp_node)
6326         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6327     }
6328
6329   if (!slp_node)
6330     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6331
6332   return true;
6333 }
6334
6335 /* Function is_nonwrapping_integer_induction.
6336
6337    Check if STMT (which is part of loop LOOP) both increments and
6338    does not cause overflow.  */
6339
6340 static bool
6341 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6342 {
6343   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6344   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6345   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6346   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6347   widest_int ni, max_loop_value, lhs_max;
6348   bool overflow = false;
6349
6350   /* Make sure the loop is integer based.  */
6351   if (TREE_CODE (base) != INTEGER_CST
6352       || TREE_CODE (step) != INTEGER_CST)
6353     return false;
6354
6355   /* Check that the max size of the loop will not wrap.  */
6356
6357   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6358     return true;
6359
6360   if (! max_stmt_executions (loop, &ni))
6361     return false;
6362
6363   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6364                             &overflow);
6365   if (overflow)
6366     return false;
6367
6368   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6369                             TYPE_SIGN (lhs_type), &overflow);
6370   if (overflow)
6371     return false;
6372
6373   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6374           <= TYPE_PRECISION (lhs_type));
6375 }
6376
6377 /* Function vectorizable_reduction.
6378
6379    Check if STMT performs a reduction operation that can be vectorized.
6380    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6381    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6382    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6383
6384    This function also handles reduction idioms (patterns) that have been
6385    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6386    of this form:
6387      X = pattern_expr (arg0, arg1, ..., X)
6388    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
6389    sequence that had been detected and replaced by the pattern-stmt (STMT).
6390
6391    This function also handles reduction of condition expressions, for example:
6392      for (int i = 0; i < N; i++)
6393        if (a[i] < value)
6394          last = a[i];
6395    This is handled by vectorising the loop and creating an additional vector
6396    containing the loop indexes for which "a[i] < value" was true.  In the
6397    function epilogue this is reduced to a single max value and then used to
6398    index into the vector of results.
6399
6400    In some cases of reduction patterns, the type of the reduction variable X is
6401    different than the type of the other arguments of STMT.
6402    In such cases, the vectype that is used when transforming STMT into a vector
6403    stmt is different than the vectype that is used to determine the
6404    vectorization factor, because it consists of a different number of elements
6405    than the actual number of elements that are being operated upon in parallel.
6406
6407    For example, consider an accumulation of shorts into an int accumulator.
6408    On some targets it's possible to vectorize this pattern operating on 8
6409    shorts at a time (hence, the vectype for purposes of determining the
6410    vectorization factor should be V8HI); on the other hand, the vectype that
6411    is used to create the vector form is actually V4SI (the type of the result).
6412
6413    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6414    indicates what is the actual level of parallelism (V8HI in the example), so
6415    that the right vectorization factor would be derived.  This vectype
6416    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6417    be used to create the vectorized stmt.  The right vectype for the vectorized
6418    stmt is obtained from the type of the result X:
6419         get_vectype_for_scalar_type (TREE_TYPE (X))
6420
6421    This means that, contrary to "regular" reductions (or "regular" stmts in
6422    general), the following equation:
6423       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6424    does *NOT* necessarily hold for reduction patterns.  */
6425
6426 bool
6427 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6428                         gimple **vec_stmt, slp_tree slp_node,
6429                         slp_instance slp_node_instance)
6430 {
6431   tree vec_dest;
6432   tree scalar_dest;
6433   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6434   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6435   tree vectype_in = NULL_TREE;
6436   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6437   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6438   enum tree_code code, orig_code;
6439   internal_fn reduc_fn;
6440   machine_mode vec_mode;
6441   int op_type;
6442   optab optab;
6443   tree new_temp = NULL_TREE;
6444   gimple *def_stmt;
6445   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6446   gimple *cond_reduc_def_stmt = NULL;
6447   enum tree_code cond_reduc_op_code = ERROR_MARK;
6448   tree scalar_type;
6449   bool is_simple_use;
6450   gimple *orig_stmt;
6451   stmt_vec_info orig_stmt_info = NULL;
6452   int i;
6453   int ncopies;
6454   int epilog_copies;
6455   stmt_vec_info prev_stmt_info, prev_phi_info;
6456   bool single_defuse_cycle = false;
6457   gimple *new_stmt = NULL;
6458   int j;
6459   tree ops[3];
6460   enum vect_def_type dts[3];
6461   bool nested_cycle = false, found_nested_cycle_def = false;
6462   bool double_reduc = false;
6463   basic_block def_bb;
6464   struct loop * def_stmt_loop, *outer_loop = NULL;
6465   tree def_arg;
6466   gimple *def_arg_stmt;
6467   auto_vec<tree> vec_oprnds0;
6468   auto_vec<tree> vec_oprnds1;
6469   auto_vec<tree> vec_oprnds2;
6470   auto_vec<tree> vect_defs;
6471   auto_vec<gimple *> phis;
6472   int vec_num;
6473   tree def0, tem;
6474   bool first_p = true;
6475   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6476   tree cond_reduc_val = NULL_TREE;
6477
6478   /* Make sure it was already recognized as a reduction computation.  */
6479   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6480       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6481     return false;
6482
6483   if (nested_in_vect_loop_p (loop, stmt))
6484     {
6485       outer_loop = loop;
6486       loop = loop->inner;
6487       nested_cycle = true;
6488     }
6489
6490   /* In case of reduction chain we switch to the first stmt in the chain, but
6491      we don't update STMT_INFO, since only the last stmt is marked as reduction
6492      and has reduction properties.  */
6493   if (GROUP_FIRST_ELEMENT (stmt_info)
6494       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6495     {
6496       stmt = GROUP_FIRST_ELEMENT (stmt_info);
6497       first_p = false;
6498     }
6499
6500   if (gimple_code (stmt) == GIMPLE_PHI)
6501     {
6502       /* Analysis is fully done on the reduction stmt invocation.  */
6503       if (! vec_stmt)
6504         {
6505           if (slp_node)
6506             slp_node_instance->reduc_phis = slp_node;
6507
6508           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6509           return true;
6510         }
6511
6512       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6513         /* Leave the scalar phi in place.  Note that checking
6514            STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6515            for reductions involving a single statement.  */
6516         return true;
6517
6518       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6519       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6520         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6521
6522       if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6523           == EXTRACT_LAST_REDUCTION)
6524         /* Leave the scalar phi in place.  */
6525         return true;
6526
6527       gcc_assert (is_gimple_assign (reduc_stmt));
6528       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6529         {
6530           tree op = gimple_op (reduc_stmt, k);
6531           if (op == gimple_phi_result (stmt))
6532             continue;
6533           if (k == 1
6534               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6535             continue;
6536           if (!vectype_in
6537               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6538                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6539             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6540           break;
6541         }
6542       gcc_assert (vectype_in);
6543
6544       if (slp_node)
6545         ncopies = 1;
6546       else
6547         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6548
6549       use_operand_p use_p;
6550       gimple *use_stmt;
6551       if (ncopies > 1
6552           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6553               <= vect_used_only_live)
6554           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6555           && (use_stmt == reduc_stmt
6556               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6557                   == reduc_stmt)))
6558         single_defuse_cycle = true;
6559
6560       /* Create the destination vector  */
6561       scalar_dest = gimple_assign_lhs (reduc_stmt);
6562       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6563
6564       if (slp_node)
6565         /* The size vect_schedule_slp_instance computes is off for us.  */
6566         vec_num = vect_get_num_vectors
6567           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6568            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6569            vectype_in);
6570       else
6571         vec_num = 1;
6572
6573       /* Generate the reduction PHIs upfront.  */
6574       prev_phi_info = NULL;
6575       for (j = 0; j < ncopies; j++)
6576         {
6577           if (j == 0 || !single_defuse_cycle)
6578             {
6579               for (i = 0; i < vec_num; i++)
6580                 {
6581                   /* Create the reduction-phi that defines the reduction
6582                      operand.  */
6583                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
6584                   set_vinfo_for_stmt (new_phi,
6585                                       new_stmt_vec_info (new_phi, loop_vinfo));
6586
6587                   if (slp_node)
6588                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6589                   else
6590                     {
6591                       if (j == 0)
6592                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6593                       else
6594                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6595                       prev_phi_info = vinfo_for_stmt (new_phi);
6596                     }
6597                 }
6598             }
6599         }
6600
6601       return true;
6602     }
6603
6604   /* 1. Is vectorizable reduction?  */
6605   /* Not supportable if the reduction variable is used in the loop, unless
6606      it's a reduction chain.  */
6607   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6608       && !GROUP_FIRST_ELEMENT (stmt_info))
6609     return false;
6610
6611   /* Reductions that are not used even in an enclosing outer-loop,
6612      are expected to be "live" (used out of the loop).  */
6613   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6614       && !STMT_VINFO_LIVE_P (stmt_info))
6615     return false;
6616
6617   /* 2. Has this been recognized as a reduction pattern?
6618
6619      Check if STMT represents a pattern that has been recognized
6620      in earlier analysis stages.  For stmts that represent a pattern,
6621      the STMT_VINFO_RELATED_STMT field records the last stmt in
6622      the original sequence that constitutes the pattern.  */
6623
6624   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6625   if (orig_stmt)
6626     {
6627       orig_stmt_info = vinfo_for_stmt (orig_stmt);
6628       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6629       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6630     }
6631
6632   /* 3. Check the operands of the operation.  The first operands are defined
6633         inside the loop body. The last operand is the reduction variable,
6634         which is defined by the loop-header-phi.  */
6635
6636   gcc_assert (is_gimple_assign (stmt));
6637
6638   /* Flatten RHS.  */
6639   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6640     {
6641     case GIMPLE_BINARY_RHS:
6642       code = gimple_assign_rhs_code (stmt);
6643       op_type = TREE_CODE_LENGTH (code);
6644       gcc_assert (op_type == binary_op);
6645       ops[0] = gimple_assign_rhs1 (stmt);
6646       ops[1] = gimple_assign_rhs2 (stmt);
6647       break;
6648
6649     case GIMPLE_TERNARY_RHS:
6650       code = gimple_assign_rhs_code (stmt);
6651       op_type = TREE_CODE_LENGTH (code);
6652       gcc_assert (op_type == ternary_op);
6653       ops[0] = gimple_assign_rhs1 (stmt);
6654       ops[1] = gimple_assign_rhs2 (stmt);
6655       ops[2] = gimple_assign_rhs3 (stmt);
6656       break;
6657
6658     case GIMPLE_UNARY_RHS:
6659       return false;
6660
6661     default:
6662       gcc_unreachable ();
6663     }
6664
6665   if (code == COND_EXPR && slp_node)
6666     return false;
6667
6668   scalar_dest = gimple_assign_lhs (stmt);
6669   scalar_type = TREE_TYPE (scalar_dest);
6670   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6671       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6672     return false;
6673
6674   /* Do not try to vectorize bit-precision reductions.  */
6675   if (!type_has_mode_precision_p (scalar_type))
6676     return false;
6677
6678   /* All uses but the last are expected to be defined in the loop.
6679      The last use is the reduction variable.  In case of nested cycle this
6680      assumption is not true: we use reduc_index to record the index of the
6681      reduction variable.  */
6682   gimple *reduc_def_stmt = NULL;
6683   int reduc_index = -1;
6684   for (i = 0; i < op_type; i++)
6685     {
6686       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6687       if (i == 0 && code == COND_EXPR)
6688         continue;
6689
6690       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6691                                           &def_stmt, &dts[i], &tem);
6692       dt = dts[i];
6693       gcc_assert (is_simple_use);
6694       if (dt == vect_reduction_def)
6695         {
6696           reduc_def_stmt = def_stmt;
6697           reduc_index = i;
6698           continue;
6699         }
6700       else if (tem)
6701         {
6702           /* To properly compute ncopies we are interested in the widest
6703              input type in case we're looking at a widening accumulation.  */
6704           if (!vectype_in
6705               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6706                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6707             vectype_in = tem;
6708         }
6709
6710       if (dt != vect_internal_def
6711           && dt != vect_external_def
6712           && dt != vect_constant_def
6713           && dt != vect_induction_def
6714           && !(dt == vect_nested_cycle && nested_cycle))
6715         return false;
6716
6717       if (dt == vect_nested_cycle)
6718         {
6719           found_nested_cycle_def = true;
6720           reduc_def_stmt = def_stmt;
6721           reduc_index = i;
6722         }
6723
6724       if (i == 1 && code == COND_EXPR)
6725         {
6726           /* Record how value of COND_EXPR is defined.  */
6727           if (dt == vect_constant_def)
6728             {
6729               cond_reduc_dt = dt;
6730               cond_reduc_val = ops[i];
6731             }
6732           if (dt == vect_induction_def
6733               && def_stmt != NULL
6734               && is_nonwrapping_integer_induction (def_stmt, loop))
6735             {
6736               cond_reduc_dt = dt;
6737               cond_reduc_def_stmt = def_stmt;
6738             }
6739         }
6740     }
6741
6742   if (!vectype_in)
6743     vectype_in = vectype_out;
6744
6745   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6746      directy used in stmt.  */
6747   if (reduc_index == -1)
6748     {
6749       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6750         {
6751           if (dump_enabled_p ())
6752             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6753                              "in-order reduction chain without SLP.\n");
6754           return false;
6755         }
6756
6757       if (orig_stmt)
6758         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6759       else
6760         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6761     }
6762
6763   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6764     return false;
6765
6766   if (!(reduc_index == -1
6767         || dts[reduc_index] == vect_reduction_def
6768         || dts[reduc_index] == vect_nested_cycle
6769         || ((dts[reduc_index] == vect_internal_def
6770              || dts[reduc_index] == vect_external_def
6771              || dts[reduc_index] == vect_constant_def
6772              || dts[reduc_index] == vect_induction_def)
6773             && nested_cycle && found_nested_cycle_def)))
6774     {
6775       /* For pattern recognized stmts, orig_stmt might be a reduction,
6776          but some helper statements for the pattern might not, or
6777          might be COND_EXPRs with reduction uses in the condition.  */
6778       gcc_assert (orig_stmt);
6779       return false;
6780     }
6781
6782   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6783   enum vect_reduction_type v_reduc_type
6784     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6785   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6786
6787   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6788   /* If we have a condition reduction, see if we can simplify it further.  */
6789   if (v_reduc_type == COND_REDUCTION)
6790     {
6791       /* TODO: We can't yet handle reduction chains, since we need to treat
6792          each COND_EXPR in the chain specially, not just the last one.
6793          E.g. for:
6794
6795             x_1 = PHI <x_3, ...>
6796             x_2 = a_2 ? ... : x_1;
6797             x_3 = a_3 ? ... : x_2;
6798
6799          we're interested in the last element in x_3 for which a_2 || a_3
6800          is true, whereas the current reduction chain handling would
6801          vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6802          as a reduction operation.  */
6803       if (reduc_index == -1)
6804         {
6805           if (dump_enabled_p ())
6806             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6807                              "conditional reduction chains not supported\n");
6808           return false;
6809         }
6810
6811       /* vect_is_simple_reduction ensured that operand 2 is the
6812          loop-carried operand.  */
6813       gcc_assert (reduc_index == 2);
6814
6815       /* Loop peeling modifies initial value of reduction PHI, which
6816          makes the reduction stmt to be transformed different to the
6817          original stmt analyzed.  We need to record reduction code for
6818          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6819          it can be used directly at transform stage.  */
6820       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6821           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6822         {
6823           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6824           gcc_assert (cond_reduc_dt == vect_constant_def);
6825           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6826         }
6827       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6828                                                vectype_in, OPTIMIZE_FOR_SPEED))
6829         {
6830           if (dump_enabled_p ())
6831             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6832                              "optimizing condition reduction with"
6833                              " FOLD_EXTRACT_LAST.\n");
6834           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6835         }
6836       else if (cond_reduc_dt == vect_induction_def)
6837         {
6838           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6839           tree base
6840             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6841           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6842
6843           gcc_assert (TREE_CODE (base) == INTEGER_CST
6844                       && TREE_CODE (step) == INTEGER_CST);
6845           cond_reduc_val = NULL_TREE;
6846           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6847              above base; punt if base is the minimum value of the type for
6848              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6849           if (tree_int_cst_sgn (step) == -1)
6850             {
6851               cond_reduc_op_code = MIN_EXPR;
6852               if (tree_int_cst_sgn (base) == -1)
6853                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6854               else if (tree_int_cst_lt (base,
6855                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6856                 cond_reduc_val
6857                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6858             }
6859           else
6860             {
6861               cond_reduc_op_code = MAX_EXPR;
6862               if (tree_int_cst_sgn (base) == 1)
6863                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6864               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6865                                         base))
6866                 cond_reduc_val
6867                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6868             }
6869           if (cond_reduc_val)
6870             {
6871               if (dump_enabled_p ())
6872                 dump_printf_loc (MSG_NOTE, vect_location,
6873                                  "condition expression based on "
6874                                  "integer induction.\n");
6875               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6876                 = INTEGER_INDUC_COND_REDUCTION;
6877             }
6878         }
6879       else if (cond_reduc_dt == vect_constant_def)
6880         {
6881           enum vect_def_type cond_initial_dt;
6882           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6883           tree cond_initial_val
6884             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6885
6886           gcc_assert (cond_reduc_val != NULL_TREE);
6887           vect_is_simple_use (cond_initial_val, loop_vinfo,
6888                               &def_stmt, &cond_initial_dt);
6889           if (cond_initial_dt == vect_constant_def
6890               && types_compatible_p (TREE_TYPE (cond_initial_val),
6891                                      TREE_TYPE (cond_reduc_val)))
6892             {
6893               tree e = fold_binary (LE_EXPR, boolean_type_node,
6894                                     cond_initial_val, cond_reduc_val);
6895               if (e && (integer_onep (e) || integer_zerop (e)))
6896                 {
6897                   if (dump_enabled_p ())
6898                     dump_printf_loc (MSG_NOTE, vect_location,
6899                                      "condition expression based on "
6900                                      "compile time constant.\n");
6901                   /* Record reduction code at analysis stage.  */
6902                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6903                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6904                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6905                     = CONST_COND_REDUCTION;
6906                 }
6907             }
6908         }
6909     }
6910
6911   if (orig_stmt)
6912     gcc_assert (tmp == orig_stmt
6913                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6914   else
6915     /* We changed STMT to be the first stmt in reduction chain, hence we
6916        check that in this case the first element in the chain is STMT.  */
6917     gcc_assert (stmt == tmp
6918                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6919
6920   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6921     return false;
6922
6923   if (slp_node)
6924     ncopies = 1;
6925   else
6926     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6927
6928   gcc_assert (ncopies >= 1);
6929
6930   vec_mode = TYPE_MODE (vectype_in);
6931   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6932
6933   if (code == COND_EXPR)
6934     {
6935       /* Only call during the analysis stage, otherwise we'll lose
6936          STMT_VINFO_TYPE.  */
6937       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6938                                                 ops[reduc_index], 0, NULL))
6939         {
6940           if (dump_enabled_p ())
6941             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6942                              "unsupported condition in reduction\n");
6943           return false;
6944         }
6945     }
6946   else
6947     {
6948       /* 4. Supportable by target?  */
6949
6950       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6951           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6952         {
6953           /* Shifts and rotates are only supported by vectorizable_shifts,
6954              not vectorizable_reduction.  */
6955           if (dump_enabled_p ())
6956             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6957                              "unsupported shift or rotation.\n");
6958           return false;
6959         }
6960
6961       /* 4.1. check support for the operation in the loop  */
6962       optab = optab_for_tree_code (code, vectype_in, optab_default);
6963       if (!optab)
6964         {
6965           if (dump_enabled_p ())
6966             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6967                              "no optab.\n");
6968
6969           return false;
6970         }
6971
6972       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6973         {
6974           if (dump_enabled_p ())
6975             dump_printf (MSG_NOTE, "op not supported by target.\n");
6976
6977           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6978               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6979             return false;
6980
6981           if (dump_enabled_p ())
6982             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6983         }
6984
6985       /* Worthwhile without SIMD support?  */
6986       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6987           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6988         {
6989           if (dump_enabled_p ())
6990             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6991                              "not worthwhile without SIMD support.\n");
6992
6993           return false;
6994         }
6995     }
6996
6997   /* 4.2. Check support for the epilog operation.
6998
6999           If STMT represents a reduction pattern, then the type of the
7000           reduction variable may be different than the type of the rest
7001           of the arguments.  For example, consider the case of accumulation
7002           of shorts into an int accumulator; The original code:
7003                         S1: int_a = (int) short_a;
7004           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
7005
7006           was replaced with:
7007                         STMT: int_acc = widen_sum <short_a, int_acc>
7008
7009           This means that:
7010           1. The tree-code that is used to create the vector operation in the
7011              epilog code (that reduces the partial results) is not the
7012              tree-code of STMT, but is rather the tree-code of the original
7013              stmt from the pattern that STMT is replacing.  I.e, in the example
7014              above we want to use 'widen_sum' in the loop, but 'plus' in the
7015              epilog.
7016           2. The type (mode) we use to check available target support
7017              for the vector operation to be created in the *epilog*, is
7018              determined by the type of the reduction variable (in the example
7019              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7020              However the type (mode) we use to check available target support
7021              for the vector operation to be created *inside the loop*, is
7022              determined by the type of the other arguments to STMT (in the
7023              example we'd check this: optab_handler (widen_sum_optab,
7024              vect_short_mode)).
7025
7026           This is contrary to "regular" reductions, in which the types of all
7027           the arguments are the same as the type of the reduction variable.
7028           For "regular" reductions we can therefore use the same vector type
7029           (and also the same tree-code) when generating the epilog code and
7030           when generating the code inside the loop.  */
7031
7032   vect_reduction_type reduction_type
7033     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
7034   if (orig_stmt
7035       && (reduction_type == TREE_CODE_REDUCTION
7036           || reduction_type == FOLD_LEFT_REDUCTION))
7037     {
7038       /* This is a reduction pattern: get the vectype from the type of the
7039          reduction variable, and get the tree-code from orig_stmt.  */
7040       orig_code = gimple_assign_rhs_code (orig_stmt);
7041       gcc_assert (vectype_out);
7042       vec_mode = TYPE_MODE (vectype_out);
7043     }
7044   else
7045     {
7046       /* Regular reduction: use the same vectype and tree-code as used for
7047          the vector code inside the loop can be used for the epilog code. */
7048       orig_code = code;
7049
7050       if (code == MINUS_EXPR)
7051         orig_code = PLUS_EXPR;
7052
7053       /* For simple condition reductions, replace with the actual expression
7054          we want to base our reduction around.  */
7055       if (reduction_type == CONST_COND_REDUCTION)
7056         {
7057           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
7058           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
7059         }
7060       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
7061         orig_code = cond_reduc_op_code;
7062     }
7063
7064   if (nested_cycle)
7065     {
7066       def_bb = gimple_bb (reduc_def_stmt);
7067       def_stmt_loop = def_bb->loop_father;
7068       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
7069                                        loop_preheader_edge (def_stmt_loop));
7070       if (TREE_CODE (def_arg) == SSA_NAME
7071           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
7072           && gimple_code (def_arg_stmt) == GIMPLE_PHI
7073           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
7074           && vinfo_for_stmt (def_arg_stmt)
7075           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
7076               == vect_double_reduction_def)
7077         double_reduc = true;
7078     }
7079
7080   reduc_fn = IFN_LAST;
7081
7082   if (reduction_type == TREE_CODE_REDUCTION
7083       || reduction_type == FOLD_LEFT_REDUCTION
7084       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7085       || reduction_type == CONST_COND_REDUCTION)
7086     {
7087       if (reduction_type == FOLD_LEFT_REDUCTION
7088           ? fold_left_reduction_fn (orig_code, &reduc_fn)
7089           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7090         {
7091           if (reduc_fn != IFN_LAST
7092               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7093                                                   OPTIMIZE_FOR_SPEED))
7094             {
7095               if (dump_enabled_p ())
7096                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7097                                  "reduc op not supported by target.\n");
7098
7099               reduc_fn = IFN_LAST;
7100             }
7101         }
7102       else
7103         {
7104           if (!nested_cycle || double_reduc)
7105             {
7106               if (dump_enabled_p ())
7107                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7108                                  "no reduc code for scalar code.\n");
7109
7110               return false;
7111             }
7112         }
7113     }
7114   else if (reduction_type == COND_REDUCTION)
7115     {
7116       int scalar_precision
7117         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7118       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7119       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7120                                                 nunits_out);
7121
7122       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7123                                           OPTIMIZE_FOR_SPEED))
7124         reduc_fn = IFN_REDUC_MAX;
7125     }
7126
7127   if (reduction_type != EXTRACT_LAST_REDUCTION
7128       && reduc_fn == IFN_LAST
7129       && !nunits_out.is_constant ())
7130     {
7131       if (dump_enabled_p ())
7132         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7133                          "missing target support for reduction on"
7134                          " variable-length vectors.\n");
7135       return false;
7136     }
7137
7138   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7139       && ncopies > 1)
7140     {
7141       if (dump_enabled_p ())
7142         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7143                          "multiple types in double reduction or condition "
7144                          "reduction.\n");
7145       return false;
7146     }
7147
7148   /* For SLP reductions, see if there is a neutral value we can use.  */
7149   tree neutral_op = NULL_TREE;
7150   if (slp_node)
7151     neutral_op
7152       = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7153                                       GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7154
7155   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7156     {
7157       /* We can't support in-order reductions of code such as this:
7158
7159            for (int i = 0; i < n1; ++i)
7160              for (int j = 0; j < n2; ++j)
7161                l += a[j];
7162
7163          since GCC effectively transforms the loop when vectorizing:
7164
7165            for (int i = 0; i < n1 / VF; ++i)
7166              for (int j = 0; j < n2; ++j)
7167                for (int k = 0; k < VF; ++k)
7168                  l += a[j];
7169
7170          which is a reassociation of the original operation.  */
7171       if (dump_enabled_p ())
7172         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7173                          "in-order double reduction not supported.\n");
7174
7175       return false;
7176     }
7177
7178   if (reduction_type == FOLD_LEFT_REDUCTION
7179       && slp_node
7180       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7181     {
7182       /* We cannot use in-order reductions in this case because there is
7183          an implicit reassociation of the operations involved.  */
7184       if (dump_enabled_p ())
7185         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7186                          "in-order unchained SLP reductions not supported.\n");
7187       return false;
7188     }
7189
7190   /* For double reductions, and for SLP reductions with a neutral value,
7191      we construct a variable-length initial vector by loading a vector
7192      full of the neutral value and then shift-and-inserting the start
7193      values into the low-numbered elements.  */
7194   if ((double_reduc || neutral_op)
7195       && !nunits_out.is_constant ()
7196       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7197                                           vectype_out, OPTIMIZE_FOR_SPEED))
7198     {
7199       if (dump_enabled_p ())
7200         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7201                          "reduction on variable-length vectors requires"
7202                          " target support for a vector-shift-and-insert"
7203                          " operation.\n");
7204       return false;
7205     }
7206
7207   /* Check extra constraints for variable-length unchained SLP reductions.  */
7208   if (STMT_SLP_TYPE (stmt_info)
7209       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7210       && !nunits_out.is_constant ())
7211     {
7212       /* We checked above that we could build the initial vector when
7213          there's a neutral element value.  Check here for the case in
7214          which each SLP statement has its own initial value and in which
7215          that value needs to be repeated for every instance of the
7216          statement within the initial vector.  */
7217       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7218       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7219       if (!neutral_op
7220           && !can_duplicate_and_interleave_p (group_size, elt_mode))
7221         {
7222           if (dump_enabled_p ())
7223             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7224                              "unsupported form of SLP reduction for"
7225                              " variable-length vectors: cannot build"
7226                              " initial vector.\n");
7227           return false;
7228         }
7229       /* The epilogue code relies on the number of elements being a multiple
7230          of the group size.  The duplicate-and-interleave approach to setting
7231          up the the initial vector does too.  */
7232       if (!multiple_p (nunits_out, group_size))
7233         {
7234           if (dump_enabled_p ())
7235             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7236                              "unsupported form of SLP reduction for"
7237                              " variable-length vectors: the vector size"
7238                              " is not a multiple of the number of results.\n");
7239           return false;
7240         }
7241     }
7242
7243   /* In case of widenning multiplication by a constant, we update the type
7244      of the constant to be the type of the other operand.  We check that the
7245      constant fits the type in the pattern recognition pass.  */
7246   if (code == DOT_PROD_EXPR
7247       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7248     {
7249       if (TREE_CODE (ops[0]) == INTEGER_CST)
7250         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7251       else if (TREE_CODE (ops[1]) == INTEGER_CST)
7252         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7253       else
7254         {
7255           if (dump_enabled_p ())
7256             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7257                              "invalid types in dot-prod\n");
7258
7259           return false;
7260         }
7261     }
7262
7263   if (reduction_type == COND_REDUCTION)
7264     {
7265       widest_int ni;
7266
7267       if (! max_loop_iterations (loop, &ni))
7268         {
7269           if (dump_enabled_p ())
7270             dump_printf_loc (MSG_NOTE, vect_location,
7271                              "loop count not known, cannot create cond "
7272                              "reduction.\n");
7273           return false;
7274         }
7275       /* Convert backedges to iterations.  */
7276       ni += 1;
7277
7278       /* The additional index will be the same type as the condition.  Check
7279          that the loop can fit into this less one (because we'll use up the
7280          zero slot for when there are no matches).  */
7281       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7282       if (wi::geu_p (ni, wi::to_widest (max_index)))
7283         {
7284           if (dump_enabled_p ())
7285             dump_printf_loc (MSG_NOTE, vect_location,
7286                              "loop size is greater than data size.\n");
7287           return false;
7288         }
7289     }
7290
7291   /* In case the vectorization factor (VF) is bigger than the number
7292      of elements that we can fit in a vectype (nunits), we have to generate
7293      more than one vector stmt - i.e - we need to "unroll" the
7294      vector stmt by a factor VF/nunits.  For more details see documentation
7295      in vectorizable_operation.  */
7296
7297   /* If the reduction is used in an outer loop we need to generate
7298      VF intermediate results, like so (e.g. for ncopies=2):
7299         r0 = phi (init, r0)
7300         r1 = phi (init, r1)
7301         r0 = x0 + r0;
7302         r1 = x1 + r1;
7303     (i.e. we generate VF results in 2 registers).
7304     In this case we have a separate def-use cycle for each copy, and therefore
7305     for each copy we get the vector def for the reduction variable from the
7306     respective phi node created for this copy.
7307
7308     Otherwise (the reduction is unused in the loop nest), we can combine
7309     together intermediate results, like so (e.g. for ncopies=2):
7310         r = phi (init, r)
7311         r = x0 + r;
7312         r = x1 + r;
7313    (i.e. we generate VF/2 results in a single register).
7314    In this case for each copy we get the vector def for the reduction variable
7315    from the vectorized reduction operation generated in the previous iteration.
7316
7317    This only works when we see both the reduction PHI and its only consumer
7318    in vectorizable_reduction and there are no intermediate stmts
7319    participating.  */
7320   use_operand_p use_p;
7321   gimple *use_stmt;
7322   if (ncopies > 1
7323       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7324       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7325       && (use_stmt == stmt
7326           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7327     {
7328       single_defuse_cycle = true;
7329       epilog_copies = 1;
7330     }
7331   else
7332     epilog_copies = ncopies;
7333
7334   /* If the reduction stmt is one of the patterns that have lane
7335      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7336   if ((ncopies > 1
7337        && ! single_defuse_cycle)
7338       && (code == DOT_PROD_EXPR
7339           || code == WIDEN_SUM_EXPR
7340           || code == SAD_EXPR))
7341     {
7342       if (dump_enabled_p ())
7343         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7344                          "multi def-use cycle not possible for lane-reducing "
7345                          "reduction operation\n");
7346       return false;
7347     }
7348
7349   if (slp_node)
7350     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7351   else
7352     vec_num = 1;
7353
7354   internal_fn cond_fn = get_conditional_internal_fn (code);
7355   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7356
7357   if (!vec_stmt) /* transformation not required.  */
7358     {
7359       if (first_p)
7360         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7361       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7362         {
7363           if (reduction_type != FOLD_LEFT_REDUCTION
7364               && (cond_fn == IFN_LAST
7365                   || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7366                                                       OPTIMIZE_FOR_SPEED)))
7367             {
7368               if (dump_enabled_p ())
7369                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7370                                  "can't use a fully-masked loop because no"
7371                                  " conditional operation is available.\n");
7372               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7373             }
7374           else if (reduc_index == -1)
7375             {
7376               if (dump_enabled_p ())
7377                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7378                                  "can't use a fully-masked loop for chained"
7379                                  " reductions.\n");
7380               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7381             }
7382           else
7383             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7384                                    vectype_in);
7385         }
7386       if (dump_enabled_p ()
7387           && reduction_type == FOLD_LEFT_REDUCTION)
7388         dump_printf_loc (MSG_NOTE, vect_location,
7389                          "using an in-order (fold-left) reduction.\n");
7390       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7391       return true;
7392     }
7393
7394   /* Transform.  */
7395
7396   if (dump_enabled_p ())
7397     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7398
7399   /* FORNOW: Multiple types are not supported for condition.  */
7400   if (code == COND_EXPR)
7401     gcc_assert (ncopies == 1);
7402
7403   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7404
7405   if (reduction_type == FOLD_LEFT_REDUCTION)
7406     return vectorize_fold_left_reduction
7407       (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7408        reduc_fn, ops, vectype_in, reduc_index, masks);
7409
7410   if (reduction_type == EXTRACT_LAST_REDUCTION)
7411     {
7412       gcc_assert (!slp_node);
7413       return vectorizable_condition (stmt, gsi, vec_stmt,
7414                                      NULL, reduc_index, NULL);
7415     }
7416
7417   /* Create the destination vector  */
7418   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7419
7420   prev_stmt_info = NULL;
7421   prev_phi_info = NULL;
7422   if (!slp_node)
7423     {
7424       vec_oprnds0.create (1);
7425       vec_oprnds1.create (1);
7426       if (op_type == ternary_op)
7427         vec_oprnds2.create (1);
7428     }
7429
7430   phis.create (vec_num);
7431   vect_defs.create (vec_num);
7432   if (!slp_node)
7433     vect_defs.quick_push (NULL_TREE);
7434
7435   if (slp_node)
7436     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7437   else
7438     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7439
7440   for (j = 0; j < ncopies; j++)
7441     {
7442       if (code == COND_EXPR)
7443         {
7444           gcc_assert (!slp_node);
7445           vectorizable_condition (stmt, gsi, vec_stmt,
7446                                   PHI_RESULT (phis[0]),
7447                                   reduc_index, NULL);
7448           /* Multiple types are not supported for condition.  */
7449           break;
7450         }
7451
7452       /* Handle uses.  */
7453       if (j == 0)
7454         {
7455           if (slp_node)
7456             {
7457               /* Get vec defs for all the operands except the reduction index,
7458                  ensuring the ordering of the ops in the vector is kept.  */
7459               auto_vec<tree, 3> slp_ops;
7460               auto_vec<vec<tree>, 3> vec_defs;
7461
7462               slp_ops.quick_push (ops[0]);
7463               slp_ops.quick_push (ops[1]);
7464               if (op_type == ternary_op)
7465                 slp_ops.quick_push (ops[2]);
7466
7467               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7468
7469               vec_oprnds0.safe_splice (vec_defs[0]);
7470               vec_defs[0].release ();
7471               vec_oprnds1.safe_splice (vec_defs[1]);
7472               vec_defs[1].release ();
7473               if (op_type == ternary_op)
7474                 {
7475                   vec_oprnds2.safe_splice (vec_defs[2]);
7476                   vec_defs[2].release ();
7477                 }
7478             }
7479           else
7480             {
7481               vec_oprnds0.quick_push
7482                 (vect_get_vec_def_for_operand (ops[0], stmt));
7483               vec_oprnds1.quick_push
7484                 (vect_get_vec_def_for_operand (ops[1], stmt));
7485               if (op_type == ternary_op)
7486                 vec_oprnds2.quick_push
7487                   (vect_get_vec_def_for_operand (ops[2], stmt));
7488             }
7489         }
7490       else
7491         {
7492           if (!slp_node)
7493             {
7494               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7495
7496               if (single_defuse_cycle && reduc_index == 0)
7497                 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7498               else
7499                 vec_oprnds0[0]
7500                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7501               if (single_defuse_cycle && reduc_index == 1)
7502                 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7503               else
7504                 vec_oprnds1[0]
7505                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7506               if (op_type == ternary_op)
7507                 {
7508                   if (single_defuse_cycle && reduc_index == 2)
7509                     vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7510                   else
7511                     vec_oprnds2[0]
7512                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7513                 }
7514             }
7515         }
7516
7517       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7518         {
7519           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7520           if (masked_loop_p)
7521             {
7522               /* Make sure that the reduction accumulator is vop[0].  */
7523               if (reduc_index == 1)
7524                 {
7525                   gcc_assert (commutative_tree_code (code));
7526                   std::swap (vop[0], vop[1]);
7527                 }
7528               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7529                                               vectype_in, i * ncopies + j);
7530               gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7531                                                         vop[0], vop[1]);
7532               new_temp = make_ssa_name (vec_dest, call);
7533               gimple_call_set_lhs (call, new_temp);
7534               gimple_call_set_nothrow (call, true);
7535               new_stmt = call;
7536             }
7537           else
7538             {
7539               if (op_type == ternary_op)
7540                 vop[2] = vec_oprnds2[i];
7541
7542               new_temp = make_ssa_name (vec_dest, new_stmt);
7543               new_stmt = gimple_build_assign (new_temp, code,
7544                                               vop[0], vop[1], vop[2]);
7545             }
7546           vect_finish_stmt_generation (stmt, new_stmt, gsi);
7547
7548           if (slp_node)
7549             {
7550               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7551               vect_defs.quick_push (new_temp);
7552             }
7553           else
7554             vect_defs[0] = new_temp;
7555         }
7556
7557       if (slp_node)
7558         continue;
7559
7560       if (j == 0)
7561         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7562       else
7563         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7564
7565       prev_stmt_info = vinfo_for_stmt (new_stmt);
7566     }
7567
7568   /* Finalize the reduction-phi (set its arguments) and create the
7569      epilog reduction code.  */
7570   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7571     vect_defs[0] = gimple_get_lhs (*vec_stmt);
7572
7573   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7574                                     epilog_copies, reduc_fn, phis,
7575                                     double_reduc, slp_node, slp_node_instance,
7576                                     cond_reduc_val, cond_reduc_op_code,
7577                                     neutral_op);
7578
7579   return true;
7580 }
7581
7582 /* Function vect_min_worthwhile_factor.
7583
7584    For a loop where we could vectorize the operation indicated by CODE,
7585    return the minimum vectorization factor that makes it worthwhile
7586    to use generic vectors.  */
7587 static unsigned int
7588 vect_min_worthwhile_factor (enum tree_code code)
7589 {
7590   switch (code)
7591     {
7592     case PLUS_EXPR:
7593     case MINUS_EXPR:
7594     case NEGATE_EXPR:
7595       return 4;
7596
7597     case BIT_AND_EXPR:
7598     case BIT_IOR_EXPR:
7599     case BIT_XOR_EXPR:
7600     case BIT_NOT_EXPR:
7601       return 2;
7602
7603     default:
7604       return INT_MAX;
7605     }
7606 }
7607
7608 /* Return true if VINFO indicates we are doing loop vectorization and if
7609    it is worth decomposing CODE operations into scalar operations for
7610    that loop's vectorization factor.  */
7611
7612 bool
7613 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7614 {
7615   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7616   unsigned HOST_WIDE_INT value;
7617   return (loop_vinfo
7618           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7619           && value >= vect_min_worthwhile_factor (code));
7620 }
7621
7622 /* Function vectorizable_induction
7623
7624    Check if PHI performs an induction computation that can be vectorized.
7625    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7626    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7627    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
7628
7629 bool
7630 vectorizable_induction (gimple *phi,
7631                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7632                         gimple **vec_stmt, slp_tree slp_node)
7633 {
7634   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7635   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7636   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7637   unsigned ncopies;
7638   bool nested_in_vect_loop = false;
7639   struct loop *iv_loop;
7640   tree vec_def;
7641   edge pe = loop_preheader_edge (loop);
7642   basic_block new_bb;
7643   tree new_vec, vec_init, vec_step, t;
7644   tree new_name;
7645   gimple *new_stmt;
7646   gphi *induction_phi;
7647   tree induc_def, vec_dest;
7648   tree init_expr, step_expr;
7649   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7650   unsigned i;
7651   tree expr;
7652   gimple_seq stmts;
7653   imm_use_iterator imm_iter;
7654   use_operand_p use_p;
7655   gimple *exit_phi;
7656   edge latch_e;
7657   tree loop_arg;
7658   gimple_stmt_iterator si;
7659   basic_block bb = gimple_bb (phi);
7660
7661   if (gimple_code (phi) != GIMPLE_PHI)
7662     return false;
7663
7664   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7665     return false;
7666
7667   /* Make sure it was recognized as induction computation.  */
7668   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7669     return false;
7670
7671   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7672   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7673
7674   if (slp_node)
7675     ncopies = 1;
7676   else
7677     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7678   gcc_assert (ncopies >= 1);
7679
7680   /* FORNOW. These restrictions should be relaxed.  */
7681   if (nested_in_vect_loop_p (loop, phi))
7682     {
7683       imm_use_iterator imm_iter;
7684       use_operand_p use_p;
7685       gimple *exit_phi;
7686       edge latch_e;
7687       tree loop_arg;
7688
7689       if (ncopies > 1)
7690         {
7691           if (dump_enabled_p ())
7692             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7693                              "multiple types in nested loop.\n");
7694           return false;
7695         }
7696
7697       /* FORNOW: outer loop induction with SLP not supported.  */
7698       if (STMT_SLP_TYPE (stmt_info))
7699         return false;
7700
7701       exit_phi = NULL;
7702       latch_e = loop_latch_edge (loop->inner);
7703       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7704       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7705         {
7706           gimple *use_stmt = USE_STMT (use_p);
7707           if (is_gimple_debug (use_stmt))
7708             continue;
7709
7710           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7711             {
7712               exit_phi = use_stmt;
7713               break;
7714             }
7715         }
7716       if (exit_phi)
7717         {
7718           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
7719           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7720                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7721             {
7722               if (dump_enabled_p ())
7723                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7724                                  "inner-loop induction only used outside "
7725                                  "of the outer vectorized loop.\n");
7726               return false;
7727             }
7728         }
7729
7730       nested_in_vect_loop = true;
7731       iv_loop = loop->inner;
7732     }
7733   else
7734     iv_loop = loop;
7735   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7736
7737   if (slp_node && !nunits.is_constant ())
7738     {
7739       /* The current SLP code creates the initial value element-by-element.  */
7740       if (dump_enabled_p ())
7741         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7742                          "SLP induction not supported for variable-length"
7743                          " vectors.\n");
7744       return false;
7745     }
7746
7747   if (!vec_stmt) /* transformation not required.  */
7748     {
7749       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7750       if (dump_enabled_p ())
7751         dump_printf_loc (MSG_NOTE, vect_location,
7752                          "=== vectorizable_induction ===\n");
7753       vect_model_induction_cost (stmt_info, ncopies);
7754       return true;
7755     }
7756
7757   /* Transform.  */
7758
7759   /* Compute a vector variable, initialized with the first VF values of
7760      the induction variable.  E.g., for an iv with IV_PHI='X' and
7761      evolution S, for a vector of 4 units, we want to compute:
7762      [X, X + S, X + 2*S, X + 3*S].  */
7763
7764   if (dump_enabled_p ())
7765     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7766
7767   latch_e = loop_latch_edge (iv_loop);
7768   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7769
7770   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7771   gcc_assert (step_expr != NULL_TREE);
7772
7773   pe = loop_preheader_edge (iv_loop);
7774   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7775                                      loop_preheader_edge (iv_loop));
7776
7777   stmts = NULL;
7778   if (!nested_in_vect_loop)
7779     {
7780       /* Convert the initial value to the desired type.  */
7781       tree new_type = TREE_TYPE (vectype);
7782       init_expr = gimple_convert (&stmts, new_type, init_expr);
7783
7784       /* If we are using the loop mask to "peel" for alignment then we need
7785          to adjust the start value here.  */
7786       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7787       if (skip_niters != NULL_TREE)
7788         {
7789           if (FLOAT_TYPE_P (vectype))
7790             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7791                                         skip_niters);
7792           else
7793             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7794           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7795                                          skip_niters, step_expr);
7796           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7797                                     init_expr, skip_step);
7798         }
7799     }
7800
7801   /* Convert the step to the desired type.  */
7802   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7803
7804   if (stmts)
7805     {
7806       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7807       gcc_assert (!new_bb);
7808     }
7809
7810   /* Find the first insertion point in the BB.  */
7811   si = gsi_after_labels (bb);
7812
7813   /* For SLP induction we have to generate several IVs as for example
7814      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7815      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7816      [VF*S, VF*S, VF*S, VF*S] for all.  */
7817   if (slp_node)
7818     {
7819       /* Enforced above.  */
7820       unsigned int const_nunits = nunits.to_constant ();
7821
7822       /* Generate [VF*S, VF*S, ... ].  */
7823       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7824         {
7825           expr = build_int_cst (integer_type_node, vf);
7826           expr = fold_convert (TREE_TYPE (step_expr), expr);
7827         }
7828       else
7829         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7830       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7831                               expr, step_expr);
7832       if (! CONSTANT_CLASS_P (new_name))
7833         new_name = vect_init_vector (phi, new_name,
7834                                      TREE_TYPE (step_expr), NULL);
7835       new_vec = build_vector_from_val (vectype, new_name);
7836       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7837
7838       /* Now generate the IVs.  */
7839       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7840       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7841       unsigned elts = const_nunits * nvects;
7842       unsigned nivs = least_common_multiple (group_size,
7843                                              const_nunits) / const_nunits;
7844       gcc_assert (elts % group_size == 0);
7845       tree elt = init_expr;
7846       unsigned ivn;
7847       for (ivn = 0; ivn < nivs; ++ivn)
7848         {
7849           tree_vector_builder elts (vectype, const_nunits, 1);
7850           stmts = NULL;
7851           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7852             {
7853               if (ivn*const_nunits + eltn >= group_size
7854                   && (ivn * const_nunits + eltn) % group_size == 0)
7855                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7856                                     elt, step_expr);
7857               elts.quick_push (elt);
7858             }
7859           vec_init = gimple_build_vector (&stmts, &elts);
7860           if (stmts)
7861             {
7862               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7863               gcc_assert (!new_bb);
7864             }
7865
7866           /* Create the induction-phi that defines the induction-operand.  */
7867           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7868           induction_phi = create_phi_node (vec_dest, iv_loop->header);
7869           set_vinfo_for_stmt (induction_phi,
7870                               new_stmt_vec_info (induction_phi, loop_vinfo));
7871           induc_def = PHI_RESULT (induction_phi);
7872
7873           /* Create the iv update inside the loop  */
7874           vec_def = make_ssa_name (vec_dest);
7875           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7876           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7877           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7878
7879           /* Set the arguments of the phi node:  */
7880           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7881           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7882                        UNKNOWN_LOCATION);
7883
7884           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7885         }
7886
7887       /* Re-use IVs when we can.  */
7888       if (ivn < nvects)
7889         {
7890           unsigned vfp
7891             = least_common_multiple (group_size, const_nunits) / group_size;
7892           /* Generate [VF'*S, VF'*S, ... ].  */
7893           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7894             {
7895               expr = build_int_cst (integer_type_node, vfp);
7896               expr = fold_convert (TREE_TYPE (step_expr), expr);
7897             }
7898           else
7899             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7900           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7901                                   expr, step_expr);
7902           if (! CONSTANT_CLASS_P (new_name))
7903             new_name = vect_init_vector (phi, new_name,
7904                                          TREE_TYPE (step_expr), NULL);
7905           new_vec = build_vector_from_val (vectype, new_name);
7906           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7907           for (; ivn < nvects; ++ivn)
7908             {
7909               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7910               tree def;
7911               if (gimple_code (iv) == GIMPLE_PHI)
7912                 def = gimple_phi_result (iv);
7913               else
7914                 def = gimple_assign_lhs (iv);
7915               new_stmt = gimple_build_assign (make_ssa_name (vectype),
7916                                               PLUS_EXPR,
7917                                               def, vec_step);
7918               if (gimple_code (iv) == GIMPLE_PHI)
7919                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7920               else
7921                 {
7922                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7923                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7924                 }
7925               set_vinfo_for_stmt (new_stmt,
7926                                   new_stmt_vec_info (new_stmt, loop_vinfo));
7927               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7928             }
7929         }
7930
7931       return true;
7932     }
7933
7934   /* Create the vector that holds the initial_value of the induction.  */
7935   if (nested_in_vect_loop)
7936     {
7937       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7938          been created during vectorization of previous stmts.  We obtain it
7939          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7940       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7941       /* If the initial value is not of proper type, convert it.  */
7942       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7943         {
7944           new_stmt
7945             = gimple_build_assign (vect_get_new_ssa_name (vectype,
7946                                                           vect_simple_var,
7947                                                           "vec_iv_"),
7948                                    VIEW_CONVERT_EXPR,
7949                                    build1 (VIEW_CONVERT_EXPR, vectype,
7950                                            vec_init));
7951           vec_init = gimple_assign_lhs (new_stmt);
7952           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7953                                                  new_stmt);
7954           gcc_assert (!new_bb);
7955           set_vinfo_for_stmt (new_stmt,
7956                               new_stmt_vec_info (new_stmt, loop_vinfo));
7957         }
7958     }
7959   else
7960     {
7961       /* iv_loop is the loop to be vectorized. Create:
7962          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7963       stmts = NULL;
7964       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7965
7966       unsigned HOST_WIDE_INT const_nunits;
7967       if (nunits.is_constant (&const_nunits))
7968         {
7969           tree_vector_builder elts (vectype, const_nunits, 1);
7970           elts.quick_push (new_name);
7971           for (i = 1; i < const_nunits; i++)
7972             {
7973               /* Create: new_name_i = new_name + step_expr  */
7974               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7975                                        new_name, step_expr);
7976               elts.quick_push (new_name);
7977             }
7978           /* Create a vector from [new_name_0, new_name_1, ...,
7979              new_name_nunits-1]  */
7980           vec_init = gimple_build_vector (&stmts, &elts);
7981         }
7982       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7983         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7984         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7985                                  new_name, step_expr);
7986       else
7987         {
7988           /* Build:
7989                 [base, base, base, ...]
7990                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7991           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7992           gcc_assert (flag_associative_math);
7993           tree index = build_index_vector (vectype, 0, 1);
7994           tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7995                                                         new_name);
7996           tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7997                                                         step_expr);
7998           vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7999           vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
8000                                    vec_init, step_vec);
8001           vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
8002                                    vec_init, base_vec);
8003         }
8004
8005       if (stmts)
8006         {
8007           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8008           gcc_assert (!new_bb);
8009         }
8010     }
8011
8012
8013   /* Create the vector that holds the step of the induction.  */
8014   if (nested_in_vect_loop)
8015     /* iv_loop is nested in the loop to be vectorized. Generate:
8016        vec_step = [S, S, S, S]  */
8017     new_name = step_expr;
8018   else
8019     {
8020       /* iv_loop is the loop to be vectorized. Generate:
8021           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
8022       gimple_seq seq = NULL;
8023       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8024         {
8025           expr = build_int_cst (integer_type_node, vf);
8026           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8027         }
8028       else
8029         expr = build_int_cst (TREE_TYPE (step_expr), vf);
8030       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8031                                expr, step_expr);
8032       if (seq)
8033         {
8034           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8035           gcc_assert (!new_bb);
8036         }
8037     }
8038
8039   t = unshare_expr (new_name);
8040   gcc_assert (CONSTANT_CLASS_P (new_name)
8041               || TREE_CODE (new_name) == SSA_NAME);
8042   new_vec = build_vector_from_val (vectype, t);
8043   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8044
8045
8046   /* Create the following def-use cycle:
8047      loop prolog:
8048          vec_init = ...
8049          vec_step = ...
8050      loop:
8051          vec_iv = PHI <vec_init, vec_loop>
8052          ...
8053          STMT
8054          ...
8055          vec_loop = vec_iv + vec_step;  */
8056
8057   /* Create the induction-phi that defines the induction-operand.  */
8058   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8059   induction_phi = create_phi_node (vec_dest, iv_loop->header);
8060   set_vinfo_for_stmt (induction_phi,
8061                       new_stmt_vec_info (induction_phi, loop_vinfo));
8062   induc_def = PHI_RESULT (induction_phi);
8063
8064   /* Create the iv update inside the loop  */
8065   vec_def = make_ssa_name (vec_dest);
8066   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
8067   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8068   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
8069
8070   /* Set the arguments of the phi node:  */
8071   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8072   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8073                UNKNOWN_LOCATION);
8074
8075   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
8076
8077   /* In case that vectorization factor (VF) is bigger than the number
8078      of elements that we can fit in a vectype (nunits), we have to generate
8079      more than one vector stmt - i.e - we need to "unroll" the
8080      vector stmt by a factor VF/nunits.  For more details see documentation
8081      in vectorizable_operation.  */
8082
8083   if (ncopies > 1)
8084     {
8085       gimple_seq seq = NULL;
8086       stmt_vec_info prev_stmt_vinfo;
8087       /* FORNOW. This restriction should be relaxed.  */
8088       gcc_assert (!nested_in_vect_loop);
8089
8090       /* Create the vector that holds the step of the induction.  */
8091       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8092         {
8093           expr = build_int_cst (integer_type_node, nunits);
8094           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8095         }
8096       else
8097         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8098       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8099                                expr, step_expr);
8100       if (seq)
8101         {
8102           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8103           gcc_assert (!new_bb);
8104         }
8105
8106       t = unshare_expr (new_name);
8107       gcc_assert (CONSTANT_CLASS_P (new_name)
8108                   || TREE_CODE (new_name) == SSA_NAME);
8109       new_vec = build_vector_from_val (vectype, t);
8110       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8111
8112       vec_def = induc_def;
8113       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8114       for (i = 1; i < ncopies; i++)
8115         {
8116           /* vec_i = vec_prev + vec_step  */
8117           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8118                                           vec_def, vec_step);
8119           vec_def = make_ssa_name (vec_dest, new_stmt);
8120           gimple_assign_set_lhs (new_stmt, vec_def);
8121
8122           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8123           set_vinfo_for_stmt (new_stmt,
8124                               new_stmt_vec_info (new_stmt, loop_vinfo));
8125           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8126           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8127         }
8128     }
8129
8130   if (nested_in_vect_loop)
8131     {
8132       /* Find the loop-closed exit-phi of the induction, and record
8133          the final vector of induction results:  */
8134       exit_phi = NULL;
8135       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8136         {
8137           gimple *use_stmt = USE_STMT (use_p);
8138           if (is_gimple_debug (use_stmt))
8139             continue;
8140
8141           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8142             {
8143               exit_phi = use_stmt;
8144               break;
8145             }
8146         }
8147       if (exit_phi)
8148         {
8149           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8150           /* FORNOW. Currently not supporting the case that an inner-loop induction
8151              is not used in the outer-loop (i.e. only outside the outer-loop).  */
8152           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8153                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
8154
8155           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8156           if (dump_enabled_p ())
8157             {
8158               dump_printf_loc (MSG_NOTE, vect_location,
8159                                "vector of inductions after inner-loop:");
8160               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8161             }
8162         }
8163     }
8164
8165
8166   if (dump_enabled_p ())
8167     {
8168       dump_printf_loc (MSG_NOTE, vect_location,
8169                        "transform induction: created def-use cycle: ");
8170       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8171       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8172                         SSA_NAME_DEF_STMT (vec_def), 0);
8173     }
8174
8175   return true;
8176 }
8177
8178 /* Function vectorizable_live_operation.
8179
8180    STMT computes a value that is used outside the loop.  Check if
8181    it can be supported.  */
8182
8183 bool
8184 vectorizable_live_operation (gimple *stmt,
8185                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8186                              slp_tree slp_node, int slp_index,
8187                              gimple **vec_stmt)
8188 {
8189   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8190   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8191   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8192   imm_use_iterator imm_iter;
8193   tree lhs, lhs_type, bitsize, vec_bitsize;
8194   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8195   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8196   int ncopies;
8197   gimple *use_stmt;
8198   auto_vec<tree> vec_oprnds;
8199   int vec_entry = 0;
8200   poly_uint64 vec_index = 0;
8201
8202   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8203
8204   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8205     return false;
8206
8207   /* FORNOW.  CHECKME.  */
8208   if (nested_in_vect_loop_p (loop, stmt))
8209     return false;
8210
8211   /* If STMT is not relevant and it is a simple assignment and its inputs are
8212      invariant then it can remain in place, unvectorized.  The original last
8213      scalar value that it computes will be used.  */
8214   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8215     {
8216       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8217       if (dump_enabled_p ())
8218         dump_printf_loc (MSG_NOTE, vect_location,
8219                          "statement is simple and uses invariant.  Leaving in "
8220                          "place.\n");
8221       return true;
8222     }
8223
8224   if (slp_node)
8225     ncopies = 1;
8226   else
8227     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8228
8229   if (slp_node)
8230     {
8231       gcc_assert (slp_index >= 0);
8232
8233       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8234       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8235
8236       /* Get the last occurrence of the scalar index from the concatenation of
8237          all the slp vectors. Calculate which slp vector it is and the index
8238          within.  */
8239       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8240
8241       /* Calculate which vector contains the result, and which lane of
8242          that vector we need.  */
8243       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8244         {
8245           if (dump_enabled_p ())
8246             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8247                              "Cannot determine which vector holds the"
8248                              " final result.\n");
8249           return false;
8250         }
8251     }
8252
8253   if (!vec_stmt)
8254     {
8255       /* No transformation required.  */
8256       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8257         {
8258           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8259                                                OPTIMIZE_FOR_SPEED))
8260             {
8261               if (dump_enabled_p ())
8262                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8263                                  "can't use a fully-masked loop because "
8264                                  "the target doesn't support extract last "
8265                                  "reduction.\n");
8266               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8267             }
8268           else if (slp_node)
8269             {
8270               if (dump_enabled_p ())
8271                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8272                                  "can't use a fully-masked loop because an "
8273                                  "SLP statement is live after the loop.\n");
8274               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8275             }
8276           else if (ncopies > 1)
8277             {
8278               if (dump_enabled_p ())
8279                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8280                                  "can't use a fully-masked loop because"
8281                                  " ncopies is greater than 1.\n");
8282               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8283             }
8284           else
8285             {
8286               gcc_assert (ncopies == 1 && !slp_node);
8287               vect_record_loop_mask (loop_vinfo,
8288                                      &LOOP_VINFO_MASKS (loop_vinfo),
8289                                      1, vectype);
8290             }
8291         }
8292       return true;
8293     }
8294
8295   /* If stmt has a related stmt, then use that for getting the lhs.  */
8296   if (is_pattern_stmt_p (stmt_info))
8297     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8298
8299   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8300         : gimple_get_lhs (stmt);
8301   lhs_type = TREE_TYPE (lhs);
8302
8303   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8304              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8305              : TYPE_SIZE (TREE_TYPE (vectype)));
8306   vec_bitsize = TYPE_SIZE (vectype);
8307
8308   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8309   tree vec_lhs, bitstart;
8310   if (slp_node)
8311     {
8312       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8313
8314       /* Get the correct slp vectorized stmt.  */
8315       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8316       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8317         vec_lhs = gimple_phi_result (phi);
8318       else
8319         vec_lhs = gimple_get_lhs (vec_stmt);
8320
8321       /* Get entry to use.  */
8322       bitstart = bitsize_int (vec_index);
8323       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8324     }
8325   else
8326     {
8327       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8328       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8329       gcc_checking_assert (ncopies == 1
8330                            || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8331
8332       /* For multiple copies, get the last copy.  */
8333       for (int i = 1; i < ncopies; ++i)
8334         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8335                                                   vec_lhs);
8336
8337       /* Get the last lane in the vector.  */
8338       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8339     }
8340
8341   gimple_seq stmts = NULL;
8342   tree new_tree;
8343   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8344     {
8345       /* Emit:
8346
8347            SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8348
8349          where VEC_LHS is the vectorized live-out result and MASK is
8350          the loop mask for the final iteration.  */
8351       gcc_assert (ncopies == 1 && !slp_node);
8352       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8353       tree scalar_res = make_ssa_name (scalar_type);
8354       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8355                                       1, vectype, 0);
8356       gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8357                                                     2, mask, vec_lhs);
8358       gimple_call_set_lhs (new_stmt, scalar_res);
8359       gimple_seq_add_stmt (&stmts, new_stmt);
8360
8361       /* Convert the extracted vector element to the required scalar type.  */
8362       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8363     }
8364   else
8365     {
8366       tree bftype = TREE_TYPE (vectype);
8367       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8368         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8369       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8370       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8371                                        &stmts, true, NULL_TREE);
8372     }
8373
8374   if (stmts)
8375     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8376
8377   /* Replace use of lhs with newly computed result.  If the use stmt is a
8378      single arg PHI, just replace all uses of PHI result.  It's necessary
8379      because lcssa PHI defining lhs may be before newly inserted stmt.  */
8380   use_operand_p use_p;
8381   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8382     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8383         && !is_gimple_debug (use_stmt))
8384     {
8385       if (gimple_code (use_stmt) == GIMPLE_PHI
8386           && gimple_phi_num_args (use_stmt) == 1)
8387         {
8388           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8389         }
8390       else
8391         {
8392           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8393             SET_USE (use_p, new_tree);
8394         }
8395       update_stmt (use_stmt);
8396     }
8397
8398   return true;
8399 }
8400
8401 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
8402
8403 static void
8404 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8405 {
8406   ssa_op_iter op_iter;
8407   imm_use_iterator imm_iter;
8408   def_operand_p def_p;
8409   gimple *ustmt;
8410
8411   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8412     {
8413       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8414         {
8415           basic_block bb;
8416
8417           if (!is_gimple_debug (ustmt))
8418             continue;
8419
8420           bb = gimple_bb (ustmt);
8421
8422           if (!flow_bb_inside_loop_p (loop, bb))
8423             {
8424               if (gimple_debug_bind_p (ustmt))
8425                 {
8426                   if (dump_enabled_p ())
8427                     dump_printf_loc (MSG_NOTE, vect_location,
8428                                      "killing debug use\n");
8429
8430                   gimple_debug_bind_reset_value (ustmt);
8431                   update_stmt (ustmt);
8432                 }
8433               else
8434                 gcc_unreachable ();
8435             }
8436         }
8437     }
8438 }
8439
8440 /* Given loop represented by LOOP_VINFO, return true if computation of
8441    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8442    otherwise.  */
8443
8444 static bool
8445 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8446 {
8447   /* Constant case.  */
8448   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8449     {
8450       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8451       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8452
8453       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8454       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8455       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8456         return true;
8457     }
8458
8459   widest_int max;
8460   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8461   /* Check the upper bound of loop niters.  */
8462   if (get_max_loop_iterations (loop, &max))
8463     {
8464       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8465       signop sgn = TYPE_SIGN (type);
8466       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8467       if (max < type_max)
8468         return true;
8469     }
8470   return false;
8471 }
8472
8473 /* Return a mask type with half the number of elements as TYPE.  */
8474
8475 tree
8476 vect_halve_mask_nunits (tree type)
8477 {
8478   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8479   return build_truth_vector_type (nunits, current_vector_size);
8480 }
8481
8482 /* Return a mask type with twice as many elements as TYPE.  */
8483
8484 tree
8485 vect_double_mask_nunits (tree type)
8486 {
8487   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8488   return build_truth_vector_type (nunits, current_vector_size);
8489 }
8490
8491 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8492    contain a sequence of NVECTORS masks that each control a vector of type
8493    VECTYPE.  */
8494
8495 void
8496 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8497                        unsigned int nvectors, tree vectype)
8498 {
8499   gcc_assert (nvectors != 0);
8500   if (masks->length () < nvectors)
8501     masks->safe_grow_cleared (nvectors);
8502   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8503   /* The number of scalars per iteration and the number of vectors are
8504      both compile-time constants.  */
8505   unsigned int nscalars_per_iter
8506     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8507                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8508   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8509     {
8510       rgm->max_nscalars_per_iter = nscalars_per_iter;
8511       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8512     }
8513 }
8514
8515 /* Given a complete set of masks MASKS, extract mask number INDEX
8516    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8517    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8518
8519    See the comment above vec_loop_masks for more details about the mask
8520    arrangement.  */
8521
8522 tree
8523 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8524                     unsigned int nvectors, tree vectype, unsigned int index)
8525 {
8526   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8527   tree mask_type = rgm->mask_type;
8528
8529   /* Populate the rgroup's mask array, if this is the first time we've
8530      used it.  */
8531   if (rgm->masks.is_empty ())
8532     {
8533       rgm->masks.safe_grow_cleared (nvectors);
8534       for (unsigned int i = 0; i < nvectors; ++i)
8535         {
8536           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8537           /* Provide a dummy definition until the real one is available.  */
8538           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8539           rgm->masks[i] = mask;
8540         }
8541     }
8542
8543   tree mask = rgm->masks[index];
8544   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8545                 TYPE_VECTOR_SUBPARTS (vectype)))
8546     {
8547       /* A loop mask for data type X can be reused for data type Y
8548          if X has N times more elements than Y and if Y's elements
8549          are N times bigger than X's.  In this case each sequence
8550          of N elements in the loop mask will be all-zero or all-one.
8551          We can then view-convert the mask so that each sequence of
8552          N elements is replaced by a single element.  */
8553       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8554                               TYPE_VECTOR_SUBPARTS (vectype)));
8555       gimple_seq seq = NULL;
8556       mask_type = build_same_sized_truth_vector_type (vectype);
8557       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8558       if (seq)
8559         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8560     }
8561   return mask;
8562 }
8563
8564 /* Scale profiling counters by estimation for LOOP which is vectorized
8565    by factor VF.  */
8566
8567 static void
8568 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8569 {
8570   edge preheader = loop_preheader_edge (loop);
8571   /* Reduce loop iterations by the vectorization factor.  */
8572   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8573   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8574
8575   if (freq_h.nonzero_p ())
8576     {
8577       profile_probability p;
8578
8579       /* Avoid dropping loop body profile counter to 0 because of zero count
8580          in loop's preheader.  */
8581       if (!(freq_e == profile_count::zero ()))
8582         freq_e = freq_e.force_nonzero ();
8583       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8584       scale_loop_frequencies (loop, p);
8585     }
8586
8587   edge exit_e = single_exit (loop);
8588   exit_e->probability = profile_probability::always ()
8589                                  .apply_scale (1, new_est_niter + 1);
8590
8591   edge exit_l = single_pred_edge (loop->latch);
8592   profile_probability prob = exit_l->probability;
8593   exit_l->probability = exit_e->probability.invert ();
8594   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8595     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8596 }
8597
8598 /* Function vect_transform_loop.
8599
8600    The analysis phase has determined that the loop is vectorizable.
8601    Vectorize the loop - created vectorized stmts to replace the scalar
8602    stmts in the loop, and update the loop exit condition.
8603    Returns scalar epilogue loop if any.  */
8604
8605 struct loop *
8606 vect_transform_loop (loop_vec_info loop_vinfo)
8607 {
8608   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8609   struct loop *epilogue = NULL;
8610   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8611   int nbbs = loop->num_nodes;
8612   int i;
8613   tree niters_vector = NULL_TREE;
8614   tree step_vector = NULL_TREE;
8615   tree niters_vector_mult_vf = NULL_TREE;
8616   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8617   unsigned int lowest_vf = constant_lower_bound (vf);
8618   bool grouped_store;
8619   bool slp_scheduled = false;
8620   gimple *stmt, *pattern_stmt;
8621   gimple_seq pattern_def_seq = NULL;
8622   gimple_stmt_iterator pattern_def_si = gsi_none ();
8623   bool transform_pattern_stmt = false;
8624   bool check_profitability = false;
8625   unsigned int th;
8626
8627   if (dump_enabled_p ())
8628     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8629
8630   /* Use the more conservative vectorization threshold.  If the number
8631      of iterations is constant assume the cost check has been performed
8632      by our caller.  If the threshold makes all loops profitable that
8633      run at least the (estimated) vectorization factor number of times
8634      checking is pointless, too.  */
8635   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8636   if (th >= vect_vf_for_cost (loop_vinfo)
8637       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8638     {
8639       if (dump_enabled_p ())
8640         dump_printf_loc (MSG_NOTE, vect_location,
8641                          "Profitability threshold is %d loop iterations.\n",
8642                          th);
8643       check_profitability = true;
8644     }
8645
8646   /* Make sure there exists a single-predecessor exit bb.  Do this before
8647      versioning.   */
8648   edge e = single_exit (loop);
8649   if (! single_pred_p (e->dest))
8650     {
8651       split_loop_exit_edge (e);
8652       if (dump_enabled_p ())
8653         dump_printf (MSG_NOTE, "split exit edge\n");
8654     }
8655
8656   /* Version the loop first, if required, so the profitability check
8657      comes first.  */
8658
8659   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8660     {
8661       poly_uint64 versioning_threshold
8662         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8663       if (check_profitability
8664           && ordered_p (poly_uint64 (th), versioning_threshold))
8665         {
8666           versioning_threshold = ordered_max (poly_uint64 (th),
8667                                               versioning_threshold);
8668           check_profitability = false;
8669         }
8670       vect_loop_versioning (loop_vinfo, th, check_profitability,
8671                             versioning_threshold);
8672       check_profitability = false;
8673     }
8674
8675   /* Make sure there exists a single-predecessor exit bb also on the
8676      scalar loop copy.  Do this after versioning but before peeling
8677      so CFG structure is fine for both scalar and if-converted loop
8678      to make slpeel_duplicate_current_defs_from_edges face matched
8679      loop closed PHI nodes on the exit.  */
8680   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8681     {
8682       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8683       if (! single_pred_p (e->dest))
8684         {
8685           split_loop_exit_edge (e);
8686           if (dump_enabled_p ())
8687             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8688         }
8689     }
8690
8691   tree niters = vect_build_loop_niters (loop_vinfo);
8692   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8693   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8694   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8695   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8696                               &step_vector, &niters_vector_mult_vf, th,
8697                               check_profitability, niters_no_overflow);
8698
8699   if (niters_vector == NULL_TREE)
8700     {
8701       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8702           && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8703           && known_eq (lowest_vf, vf))
8704         {
8705           niters_vector
8706             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8707                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8708           step_vector = build_one_cst (TREE_TYPE (niters));
8709         }
8710       else
8711         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8712                                      &step_vector, niters_no_overflow);
8713     }
8714
8715   /* 1) Make sure the loop header has exactly two entries
8716      2) Make sure we have a preheader basic block.  */
8717
8718   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8719
8720   split_edge (loop_preheader_edge (loop));
8721
8722   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8723       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8724     /* This will deal with any possible peeling.  */
8725     vect_prepare_for_masked_peels (loop_vinfo);
8726
8727   /* FORNOW: the vectorizer supports only loops which body consist
8728      of one basic block (header + empty latch). When the vectorizer will
8729      support more involved loop forms, the order by which the BBs are
8730      traversed need to be reconsidered.  */
8731
8732   for (i = 0; i < nbbs; i++)
8733     {
8734       basic_block bb = bbs[i];
8735       stmt_vec_info stmt_info;
8736
8737       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8738            gsi_next (&si))
8739         {
8740           gphi *phi = si.phi ();
8741           if (dump_enabled_p ())
8742             {
8743               dump_printf_loc (MSG_NOTE, vect_location,
8744                                "------>vectorizing phi: ");
8745               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8746             }
8747           stmt_info = vinfo_for_stmt (phi);
8748           if (!stmt_info)
8749             continue;
8750
8751           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8752             vect_loop_kill_debug_uses (loop, phi);
8753
8754           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8755               && !STMT_VINFO_LIVE_P (stmt_info))
8756             continue;
8757
8758           if (STMT_VINFO_VECTYPE (stmt_info)
8759               && (maybe_ne
8760                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8761               && dump_enabled_p ())
8762             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8763
8764           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8765                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8766                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8767               && ! PURE_SLP_STMT (stmt_info))
8768             {
8769               if (dump_enabled_p ())
8770                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8771               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8772             }
8773         }
8774
8775       pattern_stmt = NULL;
8776       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8777            !gsi_end_p (si) || transform_pattern_stmt;)
8778         {
8779           bool is_store;
8780
8781           if (transform_pattern_stmt)
8782             stmt = pattern_stmt;
8783           else
8784             {
8785               stmt = gsi_stmt (si);
8786               /* During vectorization remove existing clobber stmts.  */
8787               if (gimple_clobber_p (stmt))
8788                 {
8789                   unlink_stmt_vdef (stmt);
8790                   gsi_remove (&si, true);
8791                   release_defs (stmt);
8792                   continue;
8793                 }
8794             }
8795
8796           if (dump_enabled_p ())
8797             {
8798               dump_printf_loc (MSG_NOTE, vect_location,
8799                                "------>vectorizing statement: ");
8800               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8801             }
8802
8803           stmt_info = vinfo_for_stmt (stmt);
8804
8805           /* vector stmts created in the outer-loop during vectorization of
8806              stmts in an inner-loop may not have a stmt_info, and do not
8807              need to be vectorized.  */
8808           if (!stmt_info)
8809             {
8810               gsi_next (&si);
8811               continue;
8812             }
8813
8814           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8815             vect_loop_kill_debug_uses (loop, stmt);
8816
8817           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8818               && !STMT_VINFO_LIVE_P (stmt_info))
8819             {
8820               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8821                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8822                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8823                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8824                 {
8825                   stmt = pattern_stmt;
8826                   stmt_info = vinfo_for_stmt (stmt);
8827                 }
8828               else
8829                 {
8830                   gsi_next (&si);
8831                   continue;
8832                 }
8833             }
8834           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8835                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8836                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8837                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8838             transform_pattern_stmt = true;
8839
8840           /* If pattern statement has def stmts, vectorize them too.  */
8841           if (is_pattern_stmt_p (stmt_info))
8842             {
8843               if (pattern_def_seq == NULL)
8844                 {
8845                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8846                   pattern_def_si = gsi_start (pattern_def_seq);
8847                 }
8848               else if (!gsi_end_p (pattern_def_si))
8849                 gsi_next (&pattern_def_si);
8850               if (pattern_def_seq != NULL)
8851                 {
8852                   gimple *pattern_def_stmt = NULL;
8853                   stmt_vec_info pattern_def_stmt_info = NULL;
8854
8855                   while (!gsi_end_p (pattern_def_si))
8856                     {
8857                       pattern_def_stmt = gsi_stmt (pattern_def_si);
8858                       pattern_def_stmt_info
8859                         = vinfo_for_stmt (pattern_def_stmt);
8860                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8861                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8862                         break;
8863                       gsi_next (&pattern_def_si);
8864                     }
8865
8866                   if (!gsi_end_p (pattern_def_si))
8867                     {
8868                       if (dump_enabled_p ())
8869                         {
8870                           dump_printf_loc (MSG_NOTE, vect_location,
8871                                            "==> vectorizing pattern def "
8872                                            "stmt: ");
8873                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8874                                             pattern_def_stmt, 0);
8875                         }
8876
8877                       stmt = pattern_def_stmt;
8878                       stmt_info = pattern_def_stmt_info;
8879                     }
8880                   else
8881                     {
8882                       pattern_def_si = gsi_none ();
8883                       transform_pattern_stmt = false;
8884                     }
8885                 }
8886               else
8887                 transform_pattern_stmt = false;
8888             }
8889
8890           if (STMT_VINFO_VECTYPE (stmt_info))
8891             {
8892               poly_uint64 nunits
8893                 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8894               if (!STMT_SLP_TYPE (stmt_info)
8895                   && maybe_ne (nunits, vf)
8896                   && dump_enabled_p ())
8897                   /* For SLP VF is set according to unrolling factor, and not
8898                      to vector size, hence for SLP this print is not valid.  */
8899                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8900             }
8901
8902           /* SLP. Schedule all the SLP instances when the first SLP stmt is
8903              reached.  */
8904           if (STMT_SLP_TYPE (stmt_info))
8905             {
8906               if (!slp_scheduled)
8907                 {
8908                   slp_scheduled = true;
8909
8910                   if (dump_enabled_p ())
8911                     dump_printf_loc (MSG_NOTE, vect_location,
8912                                      "=== scheduling SLP instances ===\n");
8913
8914                   vect_schedule_slp (loop_vinfo);
8915                 }
8916
8917               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8918               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8919                 {
8920                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8921                     {
8922                       pattern_def_seq = NULL;
8923                       gsi_next (&si);
8924                     }
8925                   continue;
8926                 }
8927             }
8928
8929           /* -------- vectorize statement ------------ */
8930           if (dump_enabled_p ())
8931             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8932
8933           grouped_store = false;
8934           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8935           if (is_store)
8936             {
8937               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8938                 {
8939                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8940                      interleaving chain was completed - free all the stores in
8941                      the chain.  */
8942                   gsi_next (&si);
8943                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8944                 }
8945               else
8946                 {
8947                   /* Free the attached stmt_vec_info and remove the stmt.  */
8948                   gimple *store = gsi_stmt (si);
8949                   free_stmt_vec_info (store);
8950                   unlink_stmt_vdef (store);
8951                   gsi_remove (&si, true);
8952                   release_defs (store);
8953                 }
8954
8955               /* Stores can only appear at the end of pattern statements.  */
8956               gcc_assert (!transform_pattern_stmt);
8957               pattern_def_seq = NULL;
8958             }
8959           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8960             {
8961               pattern_def_seq = NULL;
8962               gsi_next (&si);
8963             }
8964         }                       /* stmts in BB */
8965
8966       /* Stub out scalar statements that must not survive vectorization.
8967          Doing this here helps with grouped statements, or statements that
8968          are involved in patterns.  */
8969       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8970            !gsi_end_p (gsi); gsi_next (&gsi))
8971         {
8972           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8973           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8974             {
8975               tree lhs = gimple_get_lhs (call);
8976               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8977                 {
8978                   tree zero = build_zero_cst (TREE_TYPE (lhs));
8979                   gimple *new_stmt = gimple_build_assign (lhs, zero);
8980                   gsi_replace (&gsi, new_stmt, true);
8981                 }
8982             }
8983         }
8984     }                           /* BBs in loop */
8985
8986   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8987      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8988   if (integer_onep (step_vector))
8989     niters_no_overflow = true;
8990   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8991                            niters_vector_mult_vf, !niters_no_overflow);
8992
8993   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8994   scale_profile_for_vect_loop (loop, assumed_vf);
8995
8996   /* True if the final iteration might not handle a full vector's
8997      worth of scalar iterations.  */
8998   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8999   /* The minimum number of iterations performed by the epilogue.  This
9000      is 1 when peeling for gaps because we always need a final scalar
9001      iteration.  */
9002   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
9003   /* +1 to convert latch counts to loop iteration counts,
9004      -min_epilogue_iters to remove iterations that cannot be performed
9005        by the vector code.  */
9006   int bias_for_lowest = 1 - min_epilogue_iters;
9007   int bias_for_assumed = bias_for_lowest;
9008   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9009   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9010     {
9011       /* When the amount of peeling is known at compile time, the first
9012          iteration will have exactly alignment_npeels active elements.
9013          In the worst case it will have at least one.  */
9014       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9015       bias_for_lowest += lowest_vf - min_first_active;
9016       bias_for_assumed += assumed_vf - min_first_active;
9017     }
9018   /* In these calculations the "- 1" converts loop iteration counts
9019      back to latch counts.  */
9020   if (loop->any_upper_bound)
9021     loop->nb_iterations_upper_bound
9022       = (final_iter_may_be_partial
9023          ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9024                           lowest_vf) - 1
9025          : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9026                            lowest_vf) - 1);
9027   if (loop->any_likely_upper_bound)
9028     loop->nb_iterations_likely_upper_bound
9029       = (final_iter_may_be_partial
9030          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9031                           + bias_for_lowest, lowest_vf) - 1
9032          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9033                            + bias_for_lowest, lowest_vf) - 1);
9034   if (loop->any_estimate)
9035     loop->nb_iterations_estimate
9036       = (final_iter_may_be_partial
9037          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9038                           assumed_vf) - 1
9039          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9040                            assumed_vf) - 1);
9041
9042   if (dump_enabled_p ())
9043     {
9044       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9045         {
9046           dump_printf_loc (MSG_NOTE, vect_location,
9047                            "LOOP VECTORIZED\n");
9048           if (loop->inner)
9049             dump_printf_loc (MSG_NOTE, vect_location,
9050                              "OUTER LOOP VECTORIZED\n");
9051           dump_printf (MSG_NOTE, "\n");
9052         }
9053       else
9054         {
9055           dump_printf_loc (MSG_NOTE, vect_location,
9056                            "LOOP EPILOGUE VECTORIZED (VS=");
9057           dump_dec (MSG_NOTE, current_vector_size);
9058           dump_printf (MSG_NOTE, ")\n");
9059         }
9060     }
9061
9062   /* Free SLP instances here because otherwise stmt reference counting
9063      won't work.  */
9064   slp_instance instance;
9065   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9066     vect_free_slp_instance (instance);
9067   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9068   /* Clear-up safelen field since its value is invalid after vectorization
9069      since vectorized loop can have loop-carried dependencies.  */
9070   loop->safelen = 0;
9071
9072   /* Don't vectorize epilogue for epilogue.  */
9073   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9074     epilogue = NULL;
9075
9076   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
9077     epilogue = NULL;
9078
9079   if (epilogue)
9080     {
9081       auto_vector_sizes vector_sizes;
9082       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
9083       unsigned int next_size = 0;
9084
9085       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9086           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
9087           && known_eq (vf, lowest_vf))
9088         {
9089           unsigned int eiters
9090             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
9091                - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
9092           eiters = eiters % lowest_vf;
9093           epilogue->nb_iterations_upper_bound = eiters - 1;
9094
9095           unsigned int ratio;
9096           while (next_size < vector_sizes.length ()
9097                  && !(constant_multiple_p (current_vector_size,
9098                                            vector_sizes[next_size], &ratio)
9099                       && eiters >= lowest_vf / ratio))
9100             next_size += 1;
9101         }
9102       else
9103         while (next_size < vector_sizes.length ()
9104                && maybe_lt (current_vector_size, vector_sizes[next_size]))
9105           next_size += 1;
9106
9107       if (next_size == vector_sizes.length ())
9108         epilogue = NULL;
9109     }
9110
9111   if (epilogue)
9112     {
9113       epilogue->force_vectorize = loop->force_vectorize;
9114       epilogue->safelen = loop->safelen;
9115       epilogue->dont_vectorize = false;
9116
9117       /* We may need to if-convert epilogue to vectorize it.  */
9118       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9119         tree_if_conversion (epilogue);
9120     }
9121
9122   return epilogue;
9123 }
9124
9125 /* The code below is trying to perform simple optimization - revert
9126    if-conversion for masked stores, i.e. if the mask of a store is zero
9127    do not perform it and all stored value producers also if possible.
9128    For example,
9129      for (i=0; i<n; i++)
9130        if (c[i])
9131         {
9132           p1[i] += 1;
9133           p2[i] = p3[i] +2;
9134         }
9135    this transformation will produce the following semi-hammock:
9136
9137    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9138      {
9139        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9140        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9141        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9142        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9143        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9144        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9145      }
9146 */
9147
9148 void
9149 optimize_mask_stores (struct loop *loop)
9150 {
9151   basic_block *bbs = get_loop_body (loop);
9152   unsigned nbbs = loop->num_nodes;
9153   unsigned i;
9154   basic_block bb;
9155   struct loop *bb_loop;
9156   gimple_stmt_iterator gsi;
9157   gimple *stmt;
9158   auto_vec<gimple *> worklist;
9159
9160   vect_location = find_loop_location (loop);
9161   /* Pick up all masked stores in loop if any.  */
9162   for (i = 0; i < nbbs; i++)
9163     {
9164       bb = bbs[i];
9165       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9166            gsi_next (&gsi))
9167         {
9168           stmt = gsi_stmt (gsi);
9169           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9170             worklist.safe_push (stmt);
9171         }
9172     }
9173
9174   free (bbs);
9175   if (worklist.is_empty ())
9176     return;
9177
9178   /* Loop has masked stores.  */
9179   while (!worklist.is_empty ())
9180     {
9181       gimple *last, *last_store;
9182       edge e, efalse;
9183       tree mask;
9184       basic_block store_bb, join_bb;
9185       gimple_stmt_iterator gsi_to;
9186       tree vdef, new_vdef;
9187       gphi *phi;
9188       tree vectype;
9189       tree zero;
9190
9191       last = worklist.pop ();
9192       mask = gimple_call_arg (last, 2);
9193       bb = gimple_bb (last);
9194       /* Create then_bb and if-then structure in CFG, then_bb belongs to
9195          the same loop as if_bb.  It could be different to LOOP when two
9196          level loop-nest is vectorized and mask_store belongs to the inner
9197          one.  */
9198       e = split_block (bb, last);
9199       bb_loop = bb->loop_father;
9200       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9201       join_bb = e->dest;
9202       store_bb = create_empty_bb (bb);
9203       add_bb_to_loop (store_bb, bb_loop);
9204       e->flags = EDGE_TRUE_VALUE;
9205       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9206       /* Put STORE_BB to likely part.  */
9207       efalse->probability = profile_probability::unlikely ();
9208       store_bb->count = efalse->count ();
9209       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9210       if (dom_info_available_p (CDI_DOMINATORS))
9211         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9212       if (dump_enabled_p ())
9213         dump_printf_loc (MSG_NOTE, vect_location,
9214                          "Create new block %d to sink mask stores.",
9215                          store_bb->index);
9216       /* Create vector comparison with boolean result.  */
9217       vectype = TREE_TYPE (mask);
9218       zero = build_zero_cst (vectype);
9219       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9220       gsi = gsi_last_bb (bb);
9221       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9222       /* Create new PHI node for vdef of the last masked store:
9223          .MEM_2 = VDEF <.MEM_1>
9224          will be converted to
9225          .MEM.3 = VDEF <.MEM_1>
9226          and new PHI node will be created in join bb
9227          .MEM_2 = PHI <.MEM_1, .MEM_3>
9228       */
9229       vdef = gimple_vdef (last);
9230       new_vdef = make_ssa_name (gimple_vop (cfun), last);
9231       gimple_set_vdef (last, new_vdef);
9232       phi = create_phi_node (vdef, join_bb);
9233       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9234
9235       /* Put all masked stores with the same mask to STORE_BB if possible.  */
9236       while (true)
9237         {
9238           gimple_stmt_iterator gsi_from;
9239           gimple *stmt1 = NULL;
9240
9241           /* Move masked store to STORE_BB.  */
9242           last_store = last;
9243           gsi = gsi_for_stmt (last);
9244           gsi_from = gsi;
9245           /* Shift GSI to the previous stmt for further traversal.  */
9246           gsi_prev (&gsi);
9247           gsi_to = gsi_start_bb (store_bb);
9248           gsi_move_before (&gsi_from, &gsi_to);
9249           /* Setup GSI_TO to the non-empty block start.  */
9250           gsi_to = gsi_start_bb (store_bb);
9251           if (dump_enabled_p ())
9252             {
9253               dump_printf_loc (MSG_NOTE, vect_location,
9254                                "Move stmt to created bb\n");
9255               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9256             }
9257           /* Move all stored value producers if possible.  */
9258           while (!gsi_end_p (gsi))
9259             {
9260               tree lhs;
9261               imm_use_iterator imm_iter;
9262               use_operand_p use_p;
9263               bool res;
9264
9265               /* Skip debug statements.  */
9266               if (is_gimple_debug (gsi_stmt (gsi)))
9267                 {
9268                   gsi_prev (&gsi);
9269                   continue;
9270                 }
9271               stmt1 = gsi_stmt (gsi);
9272               /* Do not consider statements writing to memory or having
9273                  volatile operand.  */
9274               if (gimple_vdef (stmt1)
9275                   || gimple_has_volatile_ops (stmt1))
9276                 break;
9277               gsi_from = gsi;
9278               gsi_prev (&gsi);
9279               lhs = gimple_get_lhs (stmt1);
9280               if (!lhs)
9281                 break;
9282
9283               /* LHS of vectorized stmt must be SSA_NAME.  */
9284               if (TREE_CODE (lhs) != SSA_NAME)
9285                 break;
9286
9287               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9288                 {
9289                   /* Remove dead scalar statement.  */
9290                   if (has_zero_uses (lhs))
9291                     {
9292                       gsi_remove (&gsi_from, true);
9293                       continue;
9294                     }
9295                 }
9296
9297               /* Check that LHS does not have uses outside of STORE_BB.  */
9298               res = true;
9299               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9300                 {
9301                   gimple *use_stmt;
9302                   use_stmt = USE_STMT (use_p);
9303                   if (is_gimple_debug (use_stmt))
9304                     continue;
9305                   if (gimple_bb (use_stmt) != store_bb)
9306                     {
9307                       res = false;
9308                       break;
9309                     }
9310                 }
9311               if (!res)
9312                 break;
9313
9314               if (gimple_vuse (stmt1)
9315                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
9316                 break;
9317
9318               /* Can move STMT1 to STORE_BB.  */
9319               if (dump_enabled_p ())
9320                 {
9321                   dump_printf_loc (MSG_NOTE, vect_location,
9322                                    "Move stmt to created bb\n");
9323                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9324                 }
9325               gsi_move_before (&gsi_from, &gsi_to);
9326               /* Shift GSI_TO for further insertion.  */
9327               gsi_prev (&gsi_to);
9328             }
9329           /* Put other masked stores with the same mask to STORE_BB.  */
9330           if (worklist.is_empty ()
9331               || gimple_call_arg (worklist.last (), 2) != mask
9332               || worklist.last () != stmt1)
9333             break;
9334           last = worklist.pop ();
9335         }
9336       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9337     }
9338 }