gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2018 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53 #include "internal-fn.h"
  54 #include "tree-vector-builder.h"
  55 #include "vec-perm-indices.h"
  56 #include "tree-eh.h"
  57
  58 /* Loop Vectorization Pass.
  59
  60    This pass tries to vectorize loops.
  61
  62    For example, the vectorizer transforms the following simple loop:
  63
  64         short a[N]; short b[N]; short c[N]; int i;
  65
  66         for (i=0; i<N; i++){
  67           a[i] = b[i] + c[i];
  68         }
  69
  70    as if it was manually vectorized by rewriting the source code into:
  71
  72         typedef int __attribute__((mode(V8HI))) v8hi;
  73         short a[N];  short b[N]; short c[N];   int i;
  74         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  75         v8hi va, vb, vc;
  76
  77         for (i=0; i<N/8; i++){
  78           vb = pb[i];
  79           vc = pc[i];
  80           va = vb + vc;
  81           pa[i] = va;
  82         }
  83
  84         The main entry to this pass is vectorize_loops(), in which
  85    the vectorizer applies a set of analyses on a given set of loops,
  86    followed by the actual vectorization transformation for the loops that
  87    had successfully passed the analysis phase.
  88         Throughout this pass we make a distinction between two types of
  89    data: scalars (which are represented by SSA_NAMES), and memory references
  90    ("data-refs").  These two types of data require different handling both
  91    during analysis and transformation. The types of data-refs that the
  92    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  93    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  94    accesses are required to have a simple (consecutive) access pattern.
  95
  96    Analysis phase:
  97    ===============
  98         The driver for the analysis phase is vect_analyze_loop().
  99    It applies a set of analyses, some of which rely on the scalar evolution
 100    analyzer (scev) developed by Sebastian Pop.
 101
 102         During the analysis phase the vectorizer records some information
 103    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 104    loop, as well as general information about the loop as a whole, which is
 105    recorded in a "loop_vec_info" struct attached to each loop.
 106
 107    Transformation phase:
 108    =====================
 109         The loop transformation phase scans all the stmts in the loop, and
 110    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 111    the loop that needs to be vectorized.  It inserts the vector code sequence
 112    just before the scalar stmt S, and records a pointer to the vector code
 113    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 114    attached to S).  This pointer will be used for the vectorization of following
 115    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 116    otherwise, we rely on dead code elimination for removing it.
 117
 118         For example, say stmt S1 was vectorized into stmt VS1:
 119
 120    VS1: vb = px[i];
 121    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 122    S2:  a = b;
 123
 124    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 125    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 126    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 127    resulting sequence would be:
 128
 129    VS1: vb = px[i];
 130    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 131    VS2: va = vb;
 132    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 133
 134         Operands that are not SSA_NAMEs, are data-refs that appear in
 135    load/store operations (like 'x[i]' in S1), and are handled differently.
 136
 137    Target modeling:
 138    =================
 139         Currently the only target specific information that is used is the
 140    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 141    Targets that can support different sizes of vectors, for now will need
 142    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 143    flexibility will be added in the future.
 144
 145         Since we only vectorize operations which vector form can be
 146    expressed using existing tree codes, to verify that an operation is
 147    supported, the vectorizer checks the relevant optab at the relevant
 148    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 149    the value found is CODE_FOR_nothing, then there's no target support, and
 150    we can't vectorize the stmt.
 151
 152    For additional information on this project see:
 153    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 154 */
 155
 156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 157
 158 /* Function vect_determine_vectorization_factor
 159
 160    Determine the vectorization factor (VF).  VF is the number of data elements
 161    that are operated upon in parallel in a single iteration of the vectorized
 162    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 163    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 164    elements can fit in a single vector register.
 165
 166    We currently support vectorization of loops in which all types operated upon
 167    are of the same size.  Therefore this function currently sets VF according to
 168    the size of the types operated upon, and fails if there are multiple sizes
 169    in the loop.
 170
 171    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 172    original loop:
 173         for (i=0; i<N; i++){
 174           a[i] = b[i] + c[i];
 175         }
 176
 177    vectorized loop:
 178         for (i=0; i<N; i+=VF){
 179           a[i:VF] = b[i:VF] + c[i:VF];
 180         }
 181 */
 182
 183 static bool
 184 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 185 {
 186   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 187   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 188   unsigned nbbs = loop->num_nodes;
 189   poly_uint64 vectorization_factor = 1;
 190   tree scalar_type = NULL_TREE;
 191   gphi *phi;
 192   tree vectype;
 193   stmt_vec_info stmt_info;
 194   unsigned i;
 195   HOST_WIDE_INT dummy;
 196   gimple *stmt, *pattern_stmt = NULL;
 197   gimple_seq pattern_def_seq = NULL;
 198   gimple_stmt_iterator pattern_def_si = gsi_none ();
 199   bool analyze_pattern_stmt = false;
 200   bool bool_result;
 201   auto_vec<stmt_vec_info> mask_producers;
 202
 203   if (dump_enabled_p ())
 204     dump_printf_loc (MSG_NOTE, vect_location,
 205                      "=== vect_determine_vectorization_factor ===\n");
 206
 207   for (i = 0; i < nbbs; i++)
 208     {
 209       basic_block bb = bbs[i];
 210
 211       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 212            gsi_next (&si))
 213         {
 214           phi = si.phi ();
 215           stmt_info = vinfo_for_stmt (phi);
 216           if (dump_enabled_p ())
 217             {
 218               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 219               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 220             }
 221
 222           gcc_assert (stmt_info);
 223
 224           if (STMT_VINFO_RELEVANT_P (stmt_info)
 225               || STMT_VINFO_LIVE_P (stmt_info))
 226             {
 227               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 228               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 229
 230               if (dump_enabled_p ())
 231                 {
 232                   dump_printf_loc (MSG_NOTE, vect_location,
 233                                    "get vectype for scalar type:  ");
 234                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 235                   dump_printf (MSG_NOTE, "\n");
 236                 }
 237
 238               vectype = get_vectype_for_scalar_type (scalar_type);
 239               if (!vectype)
 240                 {
 241                   if (dump_enabled_p ())
 242                     {
 243                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 244                                        "not vectorized: unsupported "
 245                                        "data-type ");
 246                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 247                                          scalar_type);
 248                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 249                     }
 250                   return false;
 251                 }
 252               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 253
 254               if (dump_enabled_p ())
 255                 {
 256                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 257                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 258                   dump_printf (MSG_NOTE, "\n");
 259                 }
 260
 261               if (dump_enabled_p ())
 262                 {
 263                   dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 264                   dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
 265                   dump_printf (MSG_NOTE, "\n");
 266                 }
 267
 268               vect_update_max_nunits (&vectorization_factor, vectype);
 269             }
 270         }
 271
 272       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 273            !gsi_end_p (si) || analyze_pattern_stmt;)
 274         {
 275           tree vf_vectype;
 276
 277           if (analyze_pattern_stmt)
 278             stmt = pattern_stmt;
 279           else
 280             stmt = gsi_stmt (si);
 281
 282           stmt_info = vinfo_for_stmt (stmt);
 283
 284           if (dump_enabled_p ())
 285             {
 286               dump_printf_loc (MSG_NOTE, vect_location,
 287                                "==> examining statement: ");
 288               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 289             }
 290
 291           gcc_assert (stmt_info);
 292
 293           /* Skip stmts which do not need to be vectorized.  */
 294           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 295                && !STMT_VINFO_LIVE_P (stmt_info))
 296               || gimple_clobber_p (stmt))
 297             {
 298               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 299                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 300                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 301                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 302                 {
 303                   stmt = pattern_stmt;
 304                   stmt_info = vinfo_for_stmt (pattern_stmt);
 305                   if (dump_enabled_p ())
 306                     {
 307                       dump_printf_loc (MSG_NOTE, vect_location,
 308                                        "==> examining pattern statement: ");
 309                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 310                     }
 311                 }
 312               else
 313                 {
 314                   if (dump_enabled_p ())
 315                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 316                   gsi_next (&si);
 317                   continue;
 318                 }
 319             }
 320           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 321                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 322                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 323                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 324             analyze_pattern_stmt = true;
 325
 326           /* If a pattern statement has def stmts, analyze them too.  */
 327           if (is_pattern_stmt_p (stmt_info))
 328             {
 329               if (pattern_def_seq == NULL)
 330                 {
 331                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 332                   pattern_def_si = gsi_start (pattern_def_seq);
 333                 }
 334               else if (!gsi_end_p (pattern_def_si))
 335                 gsi_next (&pattern_def_si);
 336               if (pattern_def_seq != NULL)
 337                 {
 338                   gimple *pattern_def_stmt = NULL;
 339                   stmt_vec_info pattern_def_stmt_info = NULL;
 340
 341                   while (!gsi_end_p (pattern_def_si))
 342                     {
 343                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 344                       pattern_def_stmt_info
 345                         = vinfo_for_stmt (pattern_def_stmt);
 346                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 347                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 348                         break;
 349                       gsi_next (&pattern_def_si);
 350                     }
 351
 352                   if (!gsi_end_p (pattern_def_si))
 353                     {
 354                       if (dump_enabled_p ())
 355                         {
 356                           dump_printf_loc (MSG_NOTE, vect_location,
 357                                            "==> examining pattern def stmt: ");
 358                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 359                                             pattern_def_stmt, 0);
 360                         }
 361
 362                       stmt = pattern_def_stmt;
 363                       stmt_info = pattern_def_stmt_info;
 364                     }
 365                   else
 366                     {
 367                       pattern_def_si = gsi_none ();
 368                       analyze_pattern_stmt = false;
 369                     }
 370                 }
 371               else
 372                 analyze_pattern_stmt = false;
 373             }
 374
 375           if (gimple_get_lhs (stmt) == NULL_TREE
 376               /* MASK_STORE has no lhs, but is ok.  */
 377               && (!is_gimple_call (stmt)
 378                   || !gimple_call_internal_p (stmt)
 379                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 380             {
 381               if (is_gimple_call (stmt))
 382                 {
 383                   /* Ignore calls with no lhs.  These must be calls to
 384                      #pragma omp simd functions, and what vectorization factor
 385                      it really needs can't be determined until
 386                      vectorizable_simd_clone_call.  */
 387                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 388                     {
 389                       pattern_def_seq = NULL;
 390                       gsi_next (&si);
 391                     }
 392                   continue;
 393                 }
 394               if (dump_enabled_p ())
 395                 {
 396                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 397                                    "not vectorized: irregular stmt.");
 398                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 399                                     0);
 400                 }
 401               return false;
 402             }
 403
 404           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 405             {
 406               if (dump_enabled_p ())
 407                 {
 408                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 409                                    "not vectorized: vector stmt in loop:");
 410                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 411                 }
 412               return false;
 413             }
 414
 415           bool_result = false;
 416
 417           if (STMT_VINFO_VECTYPE (stmt_info))
 418             {
 419               /* The only case when a vectype had been already set is for stmts
 420                  that contain a dataref, or for "pattern-stmts" (stmts
 421                  generated by the vectorizer to represent/replace a certain
 422                  idiom).  */
 423               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 424                           || is_pattern_stmt_p (stmt_info)
 425                           || !gsi_end_p (pattern_def_si));
 426               vectype = STMT_VINFO_VECTYPE (stmt_info);
 427             }
 428           else
 429             {
 430               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 431               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 432                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 433               else
 434                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 435
 436               /* Bool ops don't participate in vectorization factor
 437                  computation.  For comparison use compared types to
 438                  compute a factor.  */
 439               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
 440                   && is_gimple_assign (stmt)
 441                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 442                 {
 443                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 444                       || STMT_VINFO_LIVE_P (stmt_info))
 445                     mask_producers.safe_push (stmt_info);
 446                   bool_result = true;
 447
 448                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 449                       == tcc_comparison
 450                       && !VECT_SCALAR_BOOLEAN_TYPE_P
 451                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 452                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 453                   else
 454                     {
 455                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 456                         {
 457                           pattern_def_seq = NULL;
 458                           gsi_next (&si);
 459                         }
 460                       continue;
 461                     }
 462                 }
 463
 464               if (dump_enabled_p ())
 465                 {
 466                   dump_printf_loc (MSG_NOTE, vect_location,
 467                                    "get vectype for scalar type:  ");
 468                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 469                   dump_printf (MSG_NOTE, "\n");
 470                 }
 471               vectype = get_vectype_for_scalar_type (scalar_type);
 472               if (!vectype)
 473                 {
 474                   if (dump_enabled_p ())
 475                     {
 476                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 477                                        "not vectorized: unsupported "
 478                                        "data-type ");
 479                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 480                                          scalar_type);
 481                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 482                     }
 483                   return false;
 484                 }
 485
 486               if (!bool_result)
 487                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 488
 489               if (dump_enabled_p ())
 490                 {
 491                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 492                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 493                   dump_printf (MSG_NOTE, "\n");
 494                 }
 495             }
 496
 497           /* Don't try to compute VF out scalar types if we stmt
 498              produces boolean vector.  Use result vectype instead.  */
 499           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 500             vf_vectype = vectype;
 501           else
 502             {
 503               /* The vectorization factor is according to the smallest
 504                  scalar type (or the largest vector size, but we only
 505                  support one vector size per loop).  */
 506               if (!bool_result)
 507                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 508                                                              &dummy);
 509               if (dump_enabled_p ())
 510                 {
 511                   dump_printf_loc (MSG_NOTE, vect_location,
 512                                    "get vectype for scalar type:  ");
 513                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 514                   dump_printf (MSG_NOTE, "\n");
 515                 }
 516               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 517             }
 518           if (!vf_vectype)
 519             {
 520               if (dump_enabled_p ())
 521                 {
 522                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 523                                    "not vectorized: unsupported data-type ");
 524                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 525                                      scalar_type);
 526                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 527                 }
 528               return false;
 529             }
 530
 531           if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
 532                         GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 533             {
 534               if (dump_enabled_p ())
 535                 {
 536                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 537                                    "not vectorized: different sized vector "
 538                                    "types in statement, ");
 539                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 540                                      vectype);
 541                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 542                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 543                                      vf_vectype);
 544                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 545                 }
 546               return false;
 547             }
 548
 549           if (dump_enabled_p ())
 550             {
 551               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 552               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 553               dump_printf (MSG_NOTE, "\n");
 554             }
 555
 556           if (dump_enabled_p ())
 557             {
 558               dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
 559               dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
 560               dump_printf (MSG_NOTE, "\n");
 561             }
 562
 563           vect_update_max_nunits (&vectorization_factor, vf_vectype);
 564
 565           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 566             {
 567               pattern_def_seq = NULL;
 568               gsi_next (&si);
 569             }
 570         }
 571     }
 572
 573   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 574   if (dump_enabled_p ())
 575     {
 576       dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
 577       dump_dec (MSG_NOTE, vectorization_factor);
 578       dump_printf (MSG_NOTE, "\n");
 579     }
 580
 581   if (known_le (vectorization_factor, 1U))
 582     {
 583       if (dump_enabled_p ())
 584         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 585                          "not vectorized: unsupported data-type\n");
 586       return false;
 587     }
 588   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 589
 590   for (i = 0; i < mask_producers.length (); i++)
 591     {
 592       tree mask_type = NULL;
 593
 594       stmt = STMT_VINFO_STMT (mask_producers[i]);
 595
 596       if (is_gimple_assign (stmt)
 597           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 598           && !VECT_SCALAR_BOOLEAN_TYPE_P
 599                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 600         {
 601           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 602           mask_type = get_mask_type_for_scalar_type (scalar_type);
 603
 604           if (!mask_type)
 605             {
 606               if (dump_enabled_p ())
 607                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 608                                  "not vectorized: unsupported mask\n");
 609               return false;
 610             }
 611         }
 612       else
 613         {
 614           tree rhs;
 615           ssa_op_iter iter;
 616           gimple *def_stmt;
 617           enum vect_def_type dt;
 618
 619           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 620             {
 621               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 622                                        &def_stmt, &dt, &vectype))
 623                 {
 624                   if (dump_enabled_p ())
 625                     {
 626                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 627                                        "not vectorized: can't compute mask type "
 628                                        "for statement, ");
 629                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 630                                         0);
 631                     }
 632                   return false;
 633                 }
 634
 635               /* No vectype probably means external definition.
 636                  Allow it in case there is another operand which
 637                  allows to determine mask type.  */
 638               if (!vectype)
 639                 continue;
 640
 641               if (!mask_type)
 642                 mask_type = vectype;
 643               else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
 644                                  TYPE_VECTOR_SUBPARTS (vectype)))
 645                 {
 646                   if (dump_enabled_p ())
 647                     {
 648                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 649                                        "not vectorized: different sized masks "
 650                                        "types in statement, ");
 651                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 652                                          mask_type);
 653                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 654                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 655                                          vectype);
 656                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 657                     }
 658                   return false;
 659                 }
 660               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 661                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 662                 {
 663                   if (dump_enabled_p ())
 664                     {
 665                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 666                                        "not vectorized: mixed mask and "
 667                                        "nonmask vector types in statement, ");
 668                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 669                                          mask_type);
 670                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 671                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 672                                          vectype);
 673                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 674                     }
 675                   return false;
 676                 }
 677             }
 678
 679           /* We may compare boolean value loaded as vector of integers.
 680              Fix mask_type in such case.  */
 681           if (mask_type
 682               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 683               && gimple_code (stmt) == GIMPLE_ASSIGN
 684               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 685             mask_type = build_same_sized_truth_vector_type (mask_type);
 686         }
 687
 688       /* No mask_type should mean loop invariant predicate.
 689          This is probably a subject for optimization in
 690          if-conversion.  */
 691       if (!mask_type)
 692         {
 693           if (dump_enabled_p ())
 694             {
 695               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 696                                "not vectorized: can't compute mask type "
 697                                "for statement, ");
 698               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 699                                 0);
 700             }
 701           return false;
 702         }
 703
 704       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 705     }
 706
 707   return true;
 708 }
 709
 710
 711 /* Function vect_is_simple_iv_evolution.
 712
 713    FORNOW: A simple evolution of an induction variables in the loop is
 714    considered a polynomial evolution.  */
 715
 716 static bool
 717 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 718                              tree * step)
 719 {
 720   tree init_expr;
 721   tree step_expr;
 722   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 723   basic_block bb;
 724
 725   /* When there is no evolution in this loop, the evolution function
 726      is not "simple".  */
 727   if (evolution_part == NULL_TREE)
 728     return false;
 729
 730   /* When the evolution is a polynomial of degree >= 2
 731      the evolution function is not "simple".  */
 732   if (tree_is_chrec (evolution_part))
 733     return false;
 734
 735   step_expr = evolution_part;
 736   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 737
 738   if (dump_enabled_p ())
 739     {
 740       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 741       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 742       dump_printf (MSG_NOTE, ",  init: ");
 743       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 744       dump_printf (MSG_NOTE, "\n");
 745     }
 746
 747   *init = init_expr;
 748   *step = step_expr;
 749
 750   if (TREE_CODE (step_expr) != INTEGER_CST
 751       && (TREE_CODE (step_expr) != SSA_NAME
 752           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 753               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 754           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 755               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 756                   || !flag_associative_math)))
 757       && (TREE_CODE (step_expr) != REAL_CST
 758           || !flag_associative_math))
 759     {
 760       if (dump_enabled_p ())
 761         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 762                          "step unknown.\n");
 763       return false;
 764     }
 765
 766   return true;
 767 }
 768
 769 /* Function vect_analyze_scalar_cycles_1.
 770
 771    Examine the cross iteration def-use cycles of scalar variables
 772    in LOOP.  LOOP_VINFO represents the loop that is now being
 773    considered for vectorization (can be LOOP, or an outer-loop
 774    enclosing LOOP).  */
 775
 776 static void
 777 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 778 {
 779   basic_block bb = loop->header;
 780   tree init, step;
 781   auto_vec<gimple *, 64> worklist;
 782   gphi_iterator gsi;
 783   bool double_reduc;
 784
 785   if (dump_enabled_p ())
 786     dump_printf_loc (MSG_NOTE, vect_location,
 787                      "=== vect_analyze_scalar_cycles ===\n");
 788
 789   /* First - identify all inductions.  Reduction detection assumes that all the
 790      inductions have been identified, therefore, this order must not be
 791      changed.  */
 792   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 793     {
 794       gphi *phi = gsi.phi ();
 795       tree access_fn = NULL;
 796       tree def = PHI_RESULT (phi);
 797       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 798
 799       if (dump_enabled_p ())
 800         {
 801           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 802           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 803         }
 804
 805       /* Skip virtual phi's.  The data dependences that are associated with
 806          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 807       if (virtual_operand_p (def))
 808         continue;
 809
 810       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 811
 812       /* Analyze the evolution function.  */
 813       access_fn = analyze_scalar_evolution (loop, def);
 814       if (access_fn)
 815         {
 816           STRIP_NOPS (access_fn);
 817           if (dump_enabled_p ())
 818             {
 819               dump_printf_loc (MSG_NOTE, vect_location,
 820                                "Access function of PHI: ");
 821               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 822               dump_printf (MSG_NOTE, "\n");
 823             }
 824           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 825             = initial_condition_in_loop_num (access_fn, loop->num);
 826           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 827             = evolution_part_in_loop_num (access_fn, loop->num);
 828         }
 829
 830       if (!access_fn
 831           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 832           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 833               && TREE_CODE (step) != INTEGER_CST))
 834         {
 835           worklist.safe_push (phi);
 836           continue;
 837         }
 838
 839       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 840                   != NULL_TREE);
 841       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 842
 843       if (dump_enabled_p ())
 844         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 845       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 846     }
 847
 848
 849   /* Second - identify all reductions and nested cycles.  */
 850   while (worklist.length () > 0)
 851     {
 852       gimple *phi = worklist.pop ();
 853       tree def = PHI_RESULT (phi);
 854       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 855       gimple *reduc_stmt;
 856
 857       if (dump_enabled_p ())
 858         {
 859           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 860           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 861         }
 862
 863       gcc_assert (!virtual_operand_p (def)
 864                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 865
 866       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 867                                                 &double_reduc, false);
 868       if (reduc_stmt)
 869         {
 870           if (double_reduc)
 871             {
 872               if (dump_enabled_p ())
 873                 dump_printf_loc (MSG_NOTE, vect_location,
 874                                  "Detected double reduction.\n");
 875
 876               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 877               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 878                                                     vect_double_reduction_def;
 879             }
 880           else
 881             {
 882               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 883                 {
 884                   if (dump_enabled_p ())
 885                     dump_printf_loc (MSG_NOTE, vect_location,
 886                                      "Detected vectorizable nested cycle.\n");
 887
 888                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 889                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 890                                                              vect_nested_cycle;
 891                 }
 892               else
 893                 {
 894                   if (dump_enabled_p ())
 895                     dump_printf_loc (MSG_NOTE, vect_location,
 896                                      "Detected reduction.\n");
 897
 898                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 899                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 900                                                            vect_reduction_def;
 901                   /* Store the reduction cycles for possible vectorization in
 902                      loop-aware SLP if it was not detected as reduction
 903                      chain.  */
 904                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 905                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 906                 }
 907             }
 908         }
 909       else
 910         if (dump_enabled_p ())
 911           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 912                            "Unknown def-use cycle pattern.\n");
 913     }
 914 }
 915
 916
 917 /* Function vect_analyze_scalar_cycles.
 918
 919    Examine the cross iteration def-use cycles of scalar variables, by
 920    analyzing the loop-header PHIs of scalar variables.  Classify each
 921    cycle as one of the following: invariant, induction, reduction, unknown.
 922    We do that for the loop represented by LOOP_VINFO, and also to its
 923    inner-loop, if exists.
 924    Examples for scalar cycles:
 925
 926    Example1: reduction:
 927
 928               loop1:
 929               for (i=0; i<N; i++)
 930                  sum += a[i];
 931
 932    Example2: induction:
 933
 934               loop2:
 935               for (i=0; i<N; i++)
 936                  a[i] = i;  */
 937
 938 static void
 939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 940 {
 941   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 942
 943   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 944
 945   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 946      Reductions in such inner-loop therefore have different properties than
 947      the reductions in the nest that gets vectorized:
 948      1. When vectorized, they are executed in the same order as in the original
 949         scalar loop, so we can't change the order of computation when
 950         vectorizing them.
 951      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 952         current checks are too strict.  */
 953
 954   if (loop->inner)
 955     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 956 }
 957
 958 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 959
 960 static void
 961 vect_fixup_reduc_chain (gimple *stmt)
 962 {
 963   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 964   gimple *stmtp;
 965   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 966               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 967   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 968   do
 969     {
 970       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 971       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 972       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 973       if (stmt)
 974         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 975           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 976     }
 977   while (stmt);
 978   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 979 }
 980
 981 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 982
 983 static void
 984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 985 {
 986   gimple *first;
 987   unsigned i;
 988
 989   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 990     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 991       {
 992         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 993         while (next)
 994           {
 995             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 996               break;
 997             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 998           }
 999         /* If not all stmt in the chain are patterns try to handle
1000            the chain without patterns.  */
1001         if (! next)
1002           {
1003             vect_fixup_reduc_chain (first);
1004             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1005               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1006           }
1007       }
1008 }
1009
1010 /* Function vect_get_loop_niters.
1011
1012    Determine how many iterations the loop is executed and place it
1013    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1014    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1015    niter information holds in ASSUMPTIONS.
1016
1017    Return the loop exit condition.  */
1018
1019
1020 static gcond *
1021 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1022                       tree *number_of_iterations, tree *number_of_iterationsm1)
1023 {
1024   edge exit = single_exit (loop);
1025   struct tree_niter_desc niter_desc;
1026   tree niter_assumptions, niter, may_be_zero;
1027   gcond *cond = get_loop_exit_condition (loop);
1028
1029   *assumptions = boolean_true_node;
1030   *number_of_iterationsm1 = chrec_dont_know;
1031   *number_of_iterations = chrec_dont_know;
1032   if (dump_enabled_p ())
1033     dump_printf_loc (MSG_NOTE, vect_location,
1034                      "=== get_loop_niters ===\n");
1035
1036   if (!exit)
1037     return cond;
1038
1039   niter = chrec_dont_know;
1040   may_be_zero = NULL_TREE;
1041   niter_assumptions = boolean_true_node;
1042   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1043       || chrec_contains_undetermined (niter_desc.niter))
1044     return cond;
1045
1046   niter_assumptions = niter_desc.assumptions;
1047   may_be_zero = niter_desc.may_be_zero;
1048   niter = niter_desc.niter;
1049
1050   if (may_be_zero && integer_zerop (may_be_zero))
1051     may_be_zero = NULL_TREE;
1052
1053   if (may_be_zero)
1054     {
1055       if (COMPARISON_CLASS_P (may_be_zero))
1056         {
1057           /* Try to combine may_be_zero with assumptions, this can simplify
1058              computation of niter expression.  */
1059           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1060             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1061                                              niter_assumptions,
1062                                              fold_build1 (TRUTH_NOT_EXPR,
1063                                                           boolean_type_node,
1064                                                           may_be_zero));
1065           else
1066             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1067                                  build_int_cst (TREE_TYPE (niter), 0),
1068                                  rewrite_to_non_trapping_overflow (niter));
1069
1070           may_be_zero = NULL_TREE;
1071         }
1072       else if (integer_nonzerop (may_be_zero))
1073         {
1074           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1075           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1076           return cond;
1077         }
1078       else
1079         return cond;
1080     }
1081
1082   *assumptions = niter_assumptions;
1083   *number_of_iterationsm1 = niter;
1084
1085   /* We want the number of loop header executions which is the number
1086      of latch executions plus one.
1087      ???  For UINT_MAX latch executions this number overflows to zero
1088      for loops like do { n++; } while (n != 0);  */
1089   if (niter && !chrec_contains_undetermined (niter))
1090     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1091                           build_int_cst (TREE_TYPE (niter), 1));
1092   *number_of_iterations = niter;
1093
1094   return cond;
1095 }
1096
1097 /* Function bb_in_loop_p
1098
1099    Used as predicate for dfs order traversal of the loop bbs.  */
1100
1101 static bool
1102 bb_in_loop_p (const_basic_block bb, const void *data)
1103 {
1104   const struct loop *const loop = (const struct loop *)data;
1105   if (flow_bb_inside_loop_p (loop, bb))
1106     return true;
1107   return false;
1108 }
1109
1110
1111 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1112    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1113
1114 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1115   : vec_info (vec_info::loop, init_cost (loop_in)),
1116     loop (loop_in),
1117     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1118     num_itersm1 (NULL_TREE),
1119     num_iters (NULL_TREE),
1120     num_iters_unchanged (NULL_TREE),
1121     num_iters_assumptions (NULL_TREE),
1122     th (0),
1123     versioning_threshold (0),
1124     vectorization_factor (0),
1125     max_vectorization_factor (0),
1126     mask_skip_niters (NULL_TREE),
1127     mask_compare_type (NULL_TREE),
1128     unaligned_dr (NULL),
1129     peeling_for_alignment (0),
1130     ptr_mask (0),
1131     slp_unrolling_factor (1),
1132     single_scalar_iteration_cost (0),
1133     vectorizable (false),
1134     can_fully_mask_p (true),
1135     fully_masked_p (false),
1136     peeling_for_gaps (false),
1137     peeling_for_niter (false),
1138     operands_swapped (false),
1139     no_data_dependencies (false),
1140     has_mask_store (false),
1141     scalar_loop (NULL),
1142     orig_loop_info (NULL)
1143 {
1144   /* Create/Update stmt_info for all stmts in the loop.  */
1145   basic_block *body = get_loop_body (loop);
1146   for (unsigned int i = 0; i < loop->num_nodes; i++)
1147     {
1148       basic_block bb = body[i];
1149       gimple_stmt_iterator si;
1150
1151       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1152         {
1153           gimple *phi = gsi_stmt (si);
1154           gimple_set_uid (phi, 0);
1155           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1156         }
1157
1158       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1159         {
1160           gimple *stmt = gsi_stmt (si);
1161           gimple_set_uid (stmt, 0);
1162           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1163         }
1164     }
1165   free (body);
1166
1167   /* CHECKME: We want to visit all BBs before their successors (except for
1168      latch blocks, for which this assertion wouldn't hold).  In the simple
1169      case of the loop forms we allow, a dfs order of the BBs would the same
1170      as reversed postorder traversal, so we are safe.  */
1171
1172   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1173                                           bbs, loop->num_nodes, loop);
1174   gcc_assert (nbbs == loop->num_nodes);
1175 }
1176
1177 /* Free all levels of MASKS.  */
1178
1179 void
1180 release_vec_loop_masks (vec_loop_masks *masks)
1181 {
1182   rgroup_masks *rgm;
1183   unsigned int i;
1184   FOR_EACH_VEC_ELT (*masks, i, rgm)
1185     rgm->masks.release ();
1186   masks->release ();
1187 }
1188
1189 /* Free all memory used by the _loop_vec_info, as well as all the
1190    stmt_vec_info structs of all the stmts in the loop.  */
1191
1192 _loop_vec_info::~_loop_vec_info ()
1193 {
1194   int nbbs;
1195   gimple_stmt_iterator si;
1196   int j;
1197
1198   nbbs = loop->num_nodes;
1199   for (j = 0; j < nbbs; j++)
1200     {
1201       basic_block bb = bbs[j];
1202       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1203         free_stmt_vec_info (gsi_stmt (si));
1204
1205       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1206         {
1207           gimple *stmt = gsi_stmt (si);
1208
1209           /* We may have broken canonical form by moving a constant
1210              into RHS1 of a commutative op.  Fix such occurrences.  */
1211           if (operands_swapped && is_gimple_assign (stmt))
1212             {
1213               enum tree_code code = gimple_assign_rhs_code (stmt);
1214
1215               if ((code == PLUS_EXPR
1216                    || code == POINTER_PLUS_EXPR
1217                    || code == MULT_EXPR)
1218                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1219                 swap_ssa_operands (stmt,
1220                                    gimple_assign_rhs1_ptr (stmt),
1221                                    gimple_assign_rhs2_ptr (stmt));
1222               else if (code == COND_EXPR
1223                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1224                 {
1225                   tree cond_expr = gimple_assign_rhs1 (stmt);
1226                   enum tree_code cond_code = TREE_CODE (cond_expr);
1227
1228                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1229                     {
1230                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1231                                                                   0));
1232                       cond_code = invert_tree_comparison (cond_code,
1233                                                           honor_nans);
1234                       if (cond_code != ERROR_MARK)
1235                         {
1236                           TREE_SET_CODE (cond_expr, cond_code);
1237                           swap_ssa_operands (stmt,
1238                                              gimple_assign_rhs2_ptr (stmt),
1239                                              gimple_assign_rhs3_ptr (stmt));
1240                         }
1241                     }
1242                 }
1243             }
1244
1245           /* Free stmt_vec_info.  */
1246           free_stmt_vec_info (stmt);
1247           gsi_next (&si);
1248         }
1249     }
1250
1251   free (bbs);
1252
1253   release_vec_loop_masks (&masks);
1254
1255   loop->aux = NULL;
1256 }
1257
1258 /* Return true if we can use CMP_TYPE as the comparison type to produce
1259    all masks required to mask LOOP_VINFO.  */
1260
1261 static bool
1262 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1263 {
1264   rgroup_masks *rgm;
1265   unsigned int i;
1266   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1267     if (rgm->mask_type != NULL_TREE
1268         && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1269                                             cmp_type, rgm->mask_type,
1270                                             OPTIMIZE_FOR_SPEED))
1271       return false;
1272   return true;
1273 }
1274
1275 /* Calculate the maximum number of scalars per iteration for every
1276    rgroup in LOOP_VINFO.  */
1277
1278 static unsigned int
1279 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1280 {
1281   unsigned int res = 1;
1282   unsigned int i;
1283   rgroup_masks *rgm;
1284   FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1285     res = MAX (res, rgm->max_nscalars_per_iter);
1286   return res;
1287 }
1288
1289 /* Each statement in LOOP_VINFO can be masked where necessary.  Check
1290    whether we can actually generate the masks required.  Return true if so,
1291    storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE.  */
1292
1293 static bool
1294 vect_verify_full_masking (loop_vec_info loop_vinfo)
1295 {
1296   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1297   unsigned int min_ni_width;
1298
1299   /* Use a normal loop if there are no statements that need masking.
1300      This only happens in rare degenerate cases: it means that the loop
1301      has no loads, no stores, and no live-out values.  */
1302   if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1303     return false;
1304
1305   /* Get the maximum number of iterations that is representable
1306      in the counter type.  */
1307   tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1308   widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1309
1310   /* Get a more refined estimate for the number of iterations.  */
1311   widest_int max_back_edges;
1312   if (max_loop_iterations (loop, &max_back_edges))
1313     max_ni = wi::smin (max_ni, max_back_edges + 1);
1314
1315   /* Account for rgroup masks, in which each bit is replicated N times.  */
1316   max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1317
1318   /* Work out how many bits we need to represent the limit.  */
1319   min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1320
1321   /* Find a scalar mode for which WHILE_ULT is supported.  */
1322   opt_scalar_int_mode cmp_mode_iter;
1323   tree cmp_type = NULL_TREE;
1324   FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1325     {
1326       unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1327       if (cmp_bits >= min_ni_width
1328           && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1329         {
1330           tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1331           if (this_type
1332               && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1333             {
1334               /* Although we could stop as soon as we find a valid mode,
1335                  it's often better to continue until we hit Pmode, since the
1336                  operands to the WHILE are more likely to be reusable in
1337                  address calculations.  */
1338               cmp_type = this_type;
1339               if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1340                 break;
1341             }
1342         }
1343     }
1344
1345   if (!cmp_type)
1346     return false;
1347
1348   LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1349   return true;
1350 }
1351
1352 /* Calculate the cost of one scalar iteration of the loop.  */
1353 static void
1354 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1355 {
1356   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1357   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1358   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1359   int innerloop_iters, i;
1360
1361   /* Count statements in scalar loop.  Using this as scalar cost for a single
1362      iteration for now.
1363
1364      TODO: Add outer loop support.
1365
1366      TODO: Consider assigning different costs to different scalar
1367      statements.  */
1368
1369   /* FORNOW.  */
1370   innerloop_iters = 1;
1371   if (loop->inner)
1372     innerloop_iters = 50; /* FIXME */
1373
1374   for (i = 0; i < nbbs; i++)
1375     {
1376       gimple_stmt_iterator si;
1377       basic_block bb = bbs[i];
1378
1379       if (bb->loop_father == loop->inner)
1380         factor = innerloop_iters;
1381       else
1382         factor = 1;
1383
1384       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1385         {
1386           gimple *stmt = gsi_stmt (si);
1387           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1388
1389           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1390             continue;
1391
1392           /* Skip stmts that are not vectorized inside the loop.  */
1393           if (stmt_info
1394               && !STMT_VINFO_RELEVANT_P (stmt_info)
1395               && (!STMT_VINFO_LIVE_P (stmt_info)
1396                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1397               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1398             continue;
1399
1400           vect_cost_for_stmt kind;
1401           if (STMT_VINFO_DATA_REF (stmt_info))
1402             {
1403               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1404                kind = scalar_load;
1405              else
1406                kind = scalar_store;
1407             }
1408           else
1409             kind = scalar_stmt;
1410
1411           scalar_single_iter_cost
1412             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1413                                  factor, kind, stmt_info, 0, vect_prologue);
1414         }
1415     }
1416   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1417     = scalar_single_iter_cost;
1418 }
1419
1420
1421 /* Function vect_analyze_loop_form_1.
1422
1423    Verify that certain CFG restrictions hold, including:
1424    - the loop has a pre-header
1425    - the loop has a single entry and exit
1426    - the loop exit condition is simple enough
1427    - the number of iterations can be analyzed, i.e, a countable loop.  The
1428      niter could be analyzed under some assumptions.  */
1429
1430 bool
1431 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1432                           tree *assumptions, tree *number_of_iterationsm1,
1433                           tree *number_of_iterations, gcond **inner_loop_cond)
1434 {
1435   if (dump_enabled_p ())
1436     dump_printf_loc (MSG_NOTE, vect_location,
1437                      "=== vect_analyze_loop_form ===\n");
1438
1439   /* Different restrictions apply when we are considering an inner-most loop,
1440      vs. an outer (nested) loop.
1441      (FORNOW. May want to relax some of these restrictions in the future).  */
1442
1443   if (!loop->inner)
1444     {
1445       /* Inner-most loop.  We currently require that the number of BBs is
1446          exactly 2 (the header and latch).  Vectorizable inner-most loops
1447          look like this:
1448
1449                         (pre-header)
1450                            |
1451                           header <--------+
1452                            | |            |
1453                            | +--> latch --+
1454                            |
1455                         (exit-bb)  */
1456
1457       if (loop->num_nodes != 2)
1458         {
1459           if (dump_enabled_p ())
1460             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1461                              "not vectorized: control flow in loop.\n");
1462           return false;
1463         }
1464
1465       if (empty_block_p (loop->header))
1466         {
1467           if (dump_enabled_p ())
1468             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1469                              "not vectorized: empty loop.\n");
1470           return false;
1471         }
1472     }
1473   else
1474     {
1475       struct loop *innerloop = loop->inner;
1476       edge entryedge;
1477
1478       /* Nested loop. We currently require that the loop is doubly-nested,
1479          contains a single inner loop, and the number of BBs is exactly 5.
1480          Vectorizable outer-loops look like this:
1481
1482                         (pre-header)
1483                            |
1484                           header <---+
1485                            |         |
1486                           inner-loop |
1487                            |         |
1488                           tail ------+
1489                            |
1490                         (exit-bb)
1491
1492          The inner-loop has the properties expected of inner-most loops
1493          as described above.  */
1494
1495       if ((loop->inner)->inner || (loop->inner)->next)
1496         {
1497           if (dump_enabled_p ())
1498             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499                              "not vectorized: multiple nested loops.\n");
1500           return false;
1501         }
1502
1503       if (loop->num_nodes != 5)
1504         {
1505           if (dump_enabled_p ())
1506             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507                              "not vectorized: control flow in loop.\n");
1508           return false;
1509         }
1510
1511       entryedge = loop_preheader_edge (innerloop);
1512       if (entryedge->src != loop->header
1513           || !single_exit (innerloop)
1514           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1515         {
1516           if (dump_enabled_p ())
1517             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1518                              "not vectorized: unsupported outerloop form.\n");
1519           return false;
1520         }
1521
1522       /* Analyze the inner-loop.  */
1523       tree inner_niterm1, inner_niter, inner_assumptions;
1524       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1525                                       &inner_assumptions, &inner_niterm1,
1526                                       &inner_niter, NULL)
1527           /* Don't support analyzing niter under assumptions for inner
1528              loop.  */
1529           || !integer_onep (inner_assumptions))
1530         {
1531           if (dump_enabled_p ())
1532             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533                              "not vectorized: Bad inner loop.\n");
1534           return false;
1535         }
1536
1537       if (!expr_invariant_in_loop_p (loop, inner_niter))
1538         {
1539           if (dump_enabled_p ())
1540             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1541                              "not vectorized: inner-loop count not"
1542                              " invariant.\n");
1543           return false;
1544         }
1545
1546       if (dump_enabled_p ())
1547         dump_printf_loc (MSG_NOTE, vect_location,
1548                          "Considering outer-loop vectorization.\n");
1549     }
1550
1551   if (!single_exit (loop)
1552       || EDGE_COUNT (loop->header->preds) != 2)
1553     {
1554       if (dump_enabled_p ())
1555         {
1556           if (!single_exit (loop))
1557             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1558                              "not vectorized: multiple exits.\n");
1559           else if (EDGE_COUNT (loop->header->preds) != 2)
1560             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1561                              "not vectorized: too many incoming edges.\n");
1562         }
1563       return false;
1564     }
1565
1566   /* We assume that the loop exit condition is at the end of the loop. i.e,
1567      that the loop is represented as a do-while (with a proper if-guard
1568      before the loop if needed), where the loop header contains all the
1569      executable statements, and the latch is empty.  */
1570   if (!empty_block_p (loop->latch)
1571       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1572     {
1573       if (dump_enabled_p ())
1574         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1575                          "not vectorized: latch block not empty.\n");
1576       return false;
1577     }
1578
1579   /* Make sure the exit is not abnormal.  */
1580   edge e = single_exit (loop);
1581   if (e->flags & EDGE_ABNORMAL)
1582     {
1583       if (dump_enabled_p ())
1584         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1585                          "not vectorized: abnormal loop exit edge.\n");
1586       return false;
1587     }
1588
1589   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1590                                      number_of_iterationsm1);
1591   if (!*loop_cond)
1592     {
1593       if (dump_enabled_p ())
1594         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1595                          "not vectorized: complicated exit condition.\n");
1596       return false;
1597     }
1598
1599   if (integer_zerop (*assumptions)
1600       || !*number_of_iterations
1601       || chrec_contains_undetermined (*number_of_iterations))
1602     {
1603       if (dump_enabled_p ())
1604         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605                          "not vectorized: number of iterations cannot be "
1606                          "computed.\n");
1607       return false;
1608     }
1609
1610   if (integer_zerop (*number_of_iterations))
1611     {
1612       if (dump_enabled_p ())
1613         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1614                          "not vectorized: number of iterations = 0.\n");
1615       return false;
1616     }
1617
1618   return true;
1619 }
1620
1621 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1622
1623 loop_vec_info
1624 vect_analyze_loop_form (struct loop *loop)
1625 {
1626   tree assumptions, number_of_iterations, number_of_iterationsm1;
1627   gcond *loop_cond, *inner_loop_cond = NULL;
1628
1629   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1630                                   &assumptions, &number_of_iterationsm1,
1631                                   &number_of_iterations, &inner_loop_cond))
1632     return NULL;
1633
1634   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1635   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1636   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1637   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1638   if (!integer_onep (assumptions))
1639     {
1640       /* We consider to vectorize this loop by versioning it under
1641          some assumptions.  In order to do this, we need to clear
1642          existing information computed by scev and niter analyzer.  */
1643       scev_reset_htab ();
1644       free_numbers_of_iterations_estimates (loop);
1645       /* Also set flag for this loop so that following scev and niter
1646          analysis are done under the assumptions.  */
1647       loop_constraint_set (loop, LOOP_C_FINITE);
1648       /* Also record the assumptions for versioning.  */
1649       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1650     }
1651
1652   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1653     {
1654       if (dump_enabled_p ())
1655         {
1656           dump_printf_loc (MSG_NOTE, vect_location,
1657                            "Symbolic number of iterations is ");
1658           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1659           dump_printf (MSG_NOTE, "\n");
1660         }
1661     }
1662
1663   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1664   if (inner_loop_cond)
1665     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1666       = loop_exit_ctrl_vec_info_type;
1667
1668   gcc_assert (!loop->aux);
1669   loop->aux = loop_vinfo;
1670   return loop_vinfo;
1671 }
1672
1673
1674
1675 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1676    statements update the vectorization factor.  */
1677
1678 static void
1679 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1680 {
1681   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1682   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1683   int nbbs = loop->num_nodes;
1684   poly_uint64 vectorization_factor;
1685   int i;
1686
1687   if (dump_enabled_p ())
1688     dump_printf_loc (MSG_NOTE, vect_location,
1689                      "=== vect_update_vf_for_slp ===\n");
1690
1691   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1692   gcc_assert (known_ne (vectorization_factor, 0U));
1693
1694   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1695      vectorization factor of the loop is the unrolling factor required by
1696      the SLP instances.  If that unrolling factor is 1, we say, that we
1697      perform pure SLP on loop - cross iteration parallelism is not
1698      exploited.  */
1699   bool only_slp_in_loop = true;
1700   for (i = 0; i < nbbs; i++)
1701     {
1702       basic_block bb = bbs[i];
1703       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1704            gsi_next (&si))
1705         {
1706           gimple *stmt = gsi_stmt (si);
1707           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1708           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1709               && STMT_VINFO_RELATED_STMT (stmt_info))
1710             {
1711               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1712               stmt_info = vinfo_for_stmt (stmt);
1713             }
1714           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1715                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1716               && !PURE_SLP_STMT (stmt_info))
1717             /* STMT needs both SLP and loop-based vectorization.  */
1718             only_slp_in_loop = false;
1719         }
1720     }
1721
1722   if (only_slp_in_loop)
1723     {
1724       dump_printf_loc (MSG_NOTE, vect_location,
1725                        "Loop contains only SLP stmts\n");
1726       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1727     }
1728   else
1729     {
1730       dump_printf_loc (MSG_NOTE, vect_location,
1731                        "Loop contains SLP and non-SLP stmts\n");
1732       /* Both the vectorization factor and unroll factor have the form
1733          current_vector_size * X for some rational X, so they must have
1734          a common multiple.  */
1735       vectorization_factor
1736         = force_common_multiple (vectorization_factor,
1737                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1738     }
1739
1740   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1741   if (dump_enabled_p ())
1742     {
1743       dump_printf_loc (MSG_NOTE, vect_location,
1744                        "Updating vectorization factor to ");
1745       dump_dec (MSG_NOTE, vectorization_factor);
1746       dump_printf (MSG_NOTE, ".\n");
1747     }
1748 }
1749
1750 /* Return true if STMT_INFO describes a double reduction phi and if
1751    the other phi in the reduction is also relevant for vectorization.
1752    This rejects cases such as:
1753
1754       outer1:
1755         x_1 = PHI <x_3(outer2), ...>;
1756         ...
1757
1758       inner:
1759         x_2 = ...;
1760         ...
1761
1762       outer2:
1763         x_3 = PHI <x_2(inner)>;
1764
1765    if nothing in x_2 or elsewhere makes x_1 relevant.  */
1766
1767 static bool
1768 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1769 {
1770   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1771     return false;
1772
1773   gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1774   return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1775 }
1776
1777 /* Function vect_analyze_loop_operations.
1778
1779    Scan the loop stmts and make sure they are all vectorizable.  */
1780
1781 static bool
1782 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1783 {
1784   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1785   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1786   int nbbs = loop->num_nodes;
1787   int i;
1788   stmt_vec_info stmt_info;
1789   bool need_to_vectorize = false;
1790   bool ok;
1791
1792   if (dump_enabled_p ())
1793     dump_printf_loc (MSG_NOTE, vect_location,
1794                      "=== vect_analyze_loop_operations ===\n");
1795
1796   for (i = 0; i < nbbs; i++)
1797     {
1798       basic_block bb = bbs[i];
1799
1800       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1801            gsi_next (&si))
1802         {
1803           gphi *phi = si.phi ();
1804           ok = true;
1805
1806           stmt_info = vinfo_for_stmt (phi);
1807           if (dump_enabled_p ())
1808             {
1809               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1810               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1811             }
1812           if (virtual_operand_p (gimple_phi_result (phi)))
1813             continue;
1814
1815           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1816              (i.e., a phi in the tail of the outer-loop).  */
1817           if (! is_loop_header_bb_p (bb))
1818             {
1819               /* FORNOW: we currently don't support the case that these phis
1820                  are not used in the outerloop (unless it is double reduction,
1821                  i.e., this phi is vect_reduction_def), cause this case
1822                  requires to actually do something here.  */
1823               if (STMT_VINFO_LIVE_P (stmt_info)
1824                   && !vect_active_double_reduction_p (stmt_info))
1825                 {
1826                   if (dump_enabled_p ())
1827                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1828                                      "Unsupported loop-closed phi in "
1829                                      "outer-loop.\n");
1830                   return false;
1831                 }
1832
1833               /* If PHI is used in the outer loop, we check that its operand
1834                  is defined in the inner loop.  */
1835               if (STMT_VINFO_RELEVANT_P (stmt_info))
1836                 {
1837                   tree phi_op;
1838                   gimple *op_def_stmt;
1839
1840                   if (gimple_phi_num_args (phi) != 1)
1841                     return false;
1842
1843                   phi_op = PHI_ARG_DEF (phi, 0);
1844                   if (TREE_CODE (phi_op) != SSA_NAME)
1845                     return false;
1846
1847                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1848                   if (gimple_nop_p (op_def_stmt)
1849                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1850                       || !vinfo_for_stmt (op_def_stmt))
1851                     return false;
1852
1853                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1854                         != vect_used_in_outer
1855                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1856                            != vect_used_in_outer_by_reduction)
1857                     return false;
1858                 }
1859
1860               continue;
1861             }
1862
1863           gcc_assert (stmt_info);
1864
1865           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1866                || STMT_VINFO_LIVE_P (stmt_info))
1867               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1868             {
1869               /* A scalar-dependence cycle that we don't support.  */
1870               if (dump_enabled_p ())
1871                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1872                                  "not vectorized: scalar dependence cycle.\n");
1873               return false;
1874             }
1875
1876           if (STMT_VINFO_RELEVANT_P (stmt_info))
1877             {
1878               need_to_vectorize = true;
1879               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1880                   && ! PURE_SLP_STMT (stmt_info))
1881                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1882               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1883                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1884                        && ! PURE_SLP_STMT (stmt_info))
1885                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1886             }
1887
1888           /* SLP PHIs are tested by vect_slp_analyze_node_operations.  */
1889           if (ok
1890               && STMT_VINFO_LIVE_P (stmt_info)
1891               && !PURE_SLP_STMT (stmt_info))
1892             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1893
1894           if (!ok)
1895             {
1896               if (dump_enabled_p ())
1897                 {
1898                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1899                                    "not vectorized: relevant phi not "
1900                                    "supported: ");
1901                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1902                 }
1903               return false;
1904             }
1905         }
1906
1907       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1908            gsi_next (&si))
1909         {
1910           gimple *stmt = gsi_stmt (si);
1911           if (!gimple_clobber_p (stmt)
1912               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1913             return false;
1914         }
1915     } /* bbs */
1916
1917   /* All operations in the loop are either irrelevant (deal with loop
1918      control, or dead), or only used outside the loop and can be moved
1919      out of the loop (e.g. invariants, inductions).  The loop can be
1920      optimized away by scalar optimizations.  We're better off not
1921      touching this loop.  */
1922   if (!need_to_vectorize)
1923     {
1924       if (dump_enabled_p ())
1925         dump_printf_loc (MSG_NOTE, vect_location,
1926                          "All the computation can be taken out of the loop.\n");
1927       if (dump_enabled_p ())
1928         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929                          "not vectorized: redundant loop. no profit to "
1930                          "vectorize.\n");
1931       return false;
1932     }
1933
1934   return true;
1935 }
1936
1937 /* Analyze the cost of the loop described by LOOP_VINFO.  Decide if it
1938    is worthwhile to vectorize.  Return 1 if definitely yes, 0 if
1939    definitely no, or -1 if it's worth retrying.  */
1940
1941 static int
1942 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1943 {
1944   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1945   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1946
1947   /* Only fully-masked loops can have iteration counts less than the
1948      vectorization factor.  */
1949   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1950     {
1951       HOST_WIDE_INT max_niter;
1952
1953       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1954         max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1955       else
1956         max_niter = max_stmt_executions_int (loop);
1957
1958       if (max_niter != -1
1959           && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1960         {
1961           if (dump_enabled_p ())
1962             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1963                              "not vectorized: iteration count smaller than "
1964                              "vectorization factor.\n");
1965           return 0;
1966         }
1967     }
1968
1969   int min_profitable_iters, min_profitable_estimate;
1970   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1971                                       &min_profitable_estimate);
1972
1973   if (min_profitable_iters < 0)
1974     {
1975       if (dump_enabled_p ())
1976         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1977                          "not vectorized: vectorization not profitable.\n");
1978       if (dump_enabled_p ())
1979         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1980                          "not vectorized: vector version will never be "
1981                          "profitable.\n");
1982       return -1;
1983     }
1984
1985   int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1986                                * assumed_vf);
1987
1988   /* Use the cost model only if it is more conservative than user specified
1989      threshold.  */
1990   unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
1991                                     min_profitable_iters);
1992
1993   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1994
1995   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1996       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
1997     {
1998       if (dump_enabled_p ())
1999         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2000                          "not vectorized: vectorization not profitable.\n");
2001       if (dump_enabled_p ())
2002         dump_printf_loc (MSG_NOTE, vect_location,
2003                          "not vectorized: iteration count smaller than user "
2004                          "specified loop bound parameter or minimum profitable "
2005                          "iterations (whichever is more conservative).\n");
2006       return 0;
2007     }
2008
2009   HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
2010   if (estimated_niter == -1)
2011     estimated_niter = likely_max_stmt_executions_int (loop);
2012   if (estimated_niter != -1
2013       && ((unsigned HOST_WIDE_INT) estimated_niter
2014           < MAX (th, (unsigned) min_profitable_estimate)))
2015     {
2016       if (dump_enabled_p ())
2017         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018                          "not vectorized: estimated iteration count too "
2019                          "small.\n");
2020       if (dump_enabled_p ())
2021         dump_printf_loc (MSG_NOTE, vect_location,
2022                          "not vectorized: estimated iteration count smaller "
2023                          "than specified loop bound parameter or minimum "
2024                          "profitable iterations (whichever is more "
2025                          "conservative).\n");
2026       return -1;
2027     }
2028
2029   return 1;
2030 }
2031
2032
2033 /* Function vect_analyze_loop_2.
2034
2035    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2036    for it.  The different analyses will record information in the
2037    loop_vec_info struct.  */
2038 static bool
2039 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2040 {
2041   bool ok;
2042   int res;
2043   unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2044   poly_uint64 min_vf = 2;
2045   unsigned int n_stmts = 0;
2046
2047   /* The first group of checks is independent of the vector size.  */
2048   fatal = true;
2049
2050   /* Find all data references in the loop (which correspond to vdefs/vuses)
2051      and analyze their evolution in the loop.  */
2052
2053   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2054
2055   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2056   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2057     {
2058       if (dump_enabled_p ())
2059         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2060                          "not vectorized: loop nest containing two "
2061                          "or more consecutive inner loops cannot be "
2062                          "vectorized\n");
2063       return false;
2064     }
2065
2066   for (unsigned i = 0; i < loop->num_nodes; i++)
2067     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2068          !gsi_end_p (gsi); gsi_next (&gsi))
2069       {
2070         gimple *stmt = gsi_stmt (gsi);
2071         if (is_gimple_debug (stmt))
2072           continue;
2073         ++n_stmts;
2074         if (!find_data_references_in_stmt (loop, stmt,
2075                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
2076           {
2077             if (is_gimple_call (stmt) && loop->safelen)
2078               {
2079                 tree fndecl = gimple_call_fndecl (stmt), op;
2080                 if (fndecl != NULL_TREE)
2081                   {
2082                     cgraph_node *node = cgraph_node::get (fndecl);
2083                     if (node != NULL && node->simd_clones != NULL)
2084                       {
2085                         unsigned int j, n = gimple_call_num_args (stmt);
2086                         for (j = 0; j < n; j++)
2087                           {
2088                             op = gimple_call_arg (stmt, j);
2089                             if (DECL_P (op)
2090                                 || (REFERENCE_CLASS_P (op)
2091                                     && get_base_address (op)))
2092                               break;
2093                           }
2094                         op = gimple_call_lhs (stmt);
2095                         /* Ignore #pragma omp declare simd functions
2096                            if they don't have data references in the
2097                            call stmt itself.  */
2098                         if (j == n
2099                             && !(op
2100                                  && (DECL_P (op)
2101                                      || (REFERENCE_CLASS_P (op)
2102                                          && get_base_address (op)))))
2103                           continue;
2104                       }
2105                   }
2106               }
2107             if (dump_enabled_p ())
2108               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2109                                "not vectorized: loop contains function "
2110                                "calls or data references that cannot "
2111                                "be analyzed\n");
2112             return false;
2113           }
2114       }
2115
2116   /* Analyze the data references and also adjust the minimal
2117      vectorization factor according to the loads and stores.  */
2118
2119   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2120   if (!ok)
2121     {
2122       if (dump_enabled_p ())
2123         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2124                          "bad data references.\n");
2125       return false;
2126     }
2127
2128   /* Classify all cross-iteration scalar data-flow cycles.
2129      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
2130   vect_analyze_scalar_cycles (loop_vinfo);
2131
2132   vect_pattern_recog (loop_vinfo);
2133
2134   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2135
2136   /* Analyze the access patterns of the data-refs in the loop (consecutive,
2137      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
2138
2139   ok = vect_analyze_data_ref_accesses (loop_vinfo);
2140   if (!ok)
2141     {
2142       if (dump_enabled_p ())
2143         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2144                          "bad data access.\n");
2145       return false;
2146     }
2147
2148   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
2149
2150   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2151   if (!ok)
2152     {
2153       if (dump_enabled_p ())
2154         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2155                          "unexpected pattern.\n");
2156       return false;
2157     }
2158
2159   /* While the rest of the analysis below depends on it in some way.  */
2160   fatal = false;
2161
2162   /* Analyze data dependences between the data-refs in the loop
2163      and adjust the maximum vectorization factor according to
2164      the dependences.
2165      FORNOW: fail at the first data dependence that we encounter.  */
2166
2167   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2168   if (!ok
2169       || (max_vf != MAX_VECTORIZATION_FACTOR
2170           && maybe_lt (max_vf, min_vf)))
2171     {
2172       if (dump_enabled_p ())
2173             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2174                              "bad data dependence.\n");
2175       return false;
2176     }
2177   LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2178
2179   ok = vect_determine_vectorization_factor (loop_vinfo);
2180   if (!ok)
2181     {
2182       if (dump_enabled_p ())
2183         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2184                          "can't determine vectorization factor.\n");
2185       return false;
2186     }
2187   if (max_vf != MAX_VECTORIZATION_FACTOR
2188       && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2189     {
2190       if (dump_enabled_p ())
2191         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2192                          "bad data dependence.\n");
2193       return false;
2194     }
2195
2196   /* Compute the scalar iteration cost.  */
2197   vect_compute_single_scalar_iteration_cost (loop_vinfo);
2198
2199   poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2200   unsigned th;
2201
2202   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2203   ok = vect_analyze_slp (loop_vinfo, n_stmts);
2204   if (!ok)
2205     return false;
2206
2207   /* If there are any SLP instances mark them as pure_slp.  */
2208   bool slp = vect_make_slp_decision (loop_vinfo);
2209   if (slp)
2210     {
2211       /* Find stmts that need to be both vectorized and SLPed.  */
2212       vect_detect_hybrid_slp (loop_vinfo);
2213
2214       /* Update the vectorization factor based on the SLP decision.  */
2215       vect_update_vf_for_slp (loop_vinfo);
2216     }
2217
2218   bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2219
2220   /* We don't expect to have to roll back to anything other than an empty
2221      set of rgroups.  */
2222   gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2223
2224   /* This is the point where we can re-start analysis with SLP forced off.  */
2225 start_over:
2226
2227   /* Now the vectorization factor is final.  */
2228   poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2229   gcc_assert (known_ne (vectorization_factor, 0U));
2230
2231   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2232     {
2233       dump_printf_loc (MSG_NOTE, vect_location,
2234                        "vectorization_factor = ");
2235       dump_dec (MSG_NOTE, vectorization_factor);
2236       dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2237                    LOOP_VINFO_INT_NITERS (loop_vinfo));
2238     }
2239
2240   HOST_WIDE_INT max_niter
2241     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2242
2243   /* Analyze the alignment of the data-refs in the loop.
2244      Fail if a data reference is found that cannot be vectorized.  */
2245
2246   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2247   if (!ok)
2248     {
2249       if (dump_enabled_p ())
2250         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2251                          "bad data alignment.\n");
2252       return false;
2253     }
2254
2255   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2256      It is important to call pruning after vect_analyze_data_ref_accesses,
2257      since we use grouping information gathered by interleaving analysis.  */
2258   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2259   if (!ok)
2260     return false;
2261
2262   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2263      vectorization.  */
2264   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2265     {
2266     /* This pass will decide on using loop versioning and/or loop peeling in
2267        order to enhance the alignment of data references in the loop.  */
2268     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2269     if (!ok)
2270       {
2271         if (dump_enabled_p ())
2272           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2273                            "bad data alignment.\n");
2274         return false;
2275       }
2276     }
2277
2278   if (slp)
2279     {
2280       /* Analyze operations in the SLP instances.  Note this may
2281          remove unsupported SLP instances which makes the above
2282          SLP kind detection invalid.  */
2283       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2284       vect_slp_analyze_operations (loop_vinfo);
2285       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2286         goto again;
2287     }
2288
2289   /* Scan all the remaining operations in the loop that are not subject
2290      to SLP and make sure they are vectorizable.  */
2291   ok = vect_analyze_loop_operations (loop_vinfo);
2292   if (!ok)
2293     {
2294       if (dump_enabled_p ())
2295         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2296                          "bad operation or unsupported loop bound.\n");
2297       return false;
2298     }
2299
2300   /* Decide whether to use a fully-masked loop for this vectorization
2301      factor.  */
2302   LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2303     = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2304        && vect_verify_full_masking (loop_vinfo));
2305   if (dump_enabled_p ())
2306     {
2307       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2308         dump_printf_loc (MSG_NOTE, vect_location,
2309                          "using a fully-masked loop.\n");
2310       else
2311         dump_printf_loc (MSG_NOTE, vect_location,
2312                          "not using a fully-masked loop.\n");
2313     }
2314
2315   /* If epilog loop is required because of data accesses with gaps,
2316      one additional iteration needs to be peeled.  Check if there is
2317      enough iterations for vectorization.  */
2318   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2319       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2320       && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2321     {
2322       poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2323       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2324
2325       if (known_lt (wi::to_widest (scalar_niters), vf))
2326         {
2327           if (dump_enabled_p ())
2328             dump_printf_loc (MSG_NOTE, vect_location,
2329                              "loop has no enough iterations to support"
2330                              " peeling for gaps.\n");
2331           return false;
2332         }
2333     }
2334
2335   /* Check the costings of the loop make vectorizing worthwhile.  */
2336   res = vect_analyze_loop_costing (loop_vinfo);
2337   if (res < 0)
2338     goto again;
2339   if (!res)
2340     {
2341       if (dump_enabled_p ())
2342         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2343                          "Loop costings not worthwhile.\n");
2344       return false;
2345     }
2346
2347   /* Decide whether we need to create an epilogue loop to handle
2348      remaining scalar iterations.  */
2349   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2350
2351   unsigned HOST_WIDE_INT const_vf;
2352   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2353     /* The main loop handles all iterations.  */
2354     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2355   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2356            && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2357     {
2358       if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2359                        - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2360                        LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2361         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2362     }
2363   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2364            || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2365            || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2366                 < (unsigned) exact_log2 (const_vf))
2367                /* In case of versioning, check if the maximum number of
2368                   iterations is greater than th.  If they are identical,
2369                   the epilogue is unnecessary.  */
2370                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2371                    || ((unsigned HOST_WIDE_INT) max_niter
2372                        > (th / const_vf) * const_vf))))
2373     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2374
2375   /* If an epilogue loop is required make sure we can create one.  */
2376   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2377       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2378     {
2379       if (dump_enabled_p ())
2380         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2381       if (!vect_can_advance_ivs_p (loop_vinfo)
2382           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2383                                            single_exit (LOOP_VINFO_LOOP
2384                                                          (loop_vinfo))))
2385         {
2386           if (dump_enabled_p ())
2387             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2388                              "not vectorized: can't create required "
2389                              "epilog loop\n");
2390           goto again;
2391         }
2392     }
2393
2394   /* During peeling, we need to check if number of loop iterations is
2395      enough for both peeled prolog loop and vector loop.  This check
2396      can be merged along with threshold check of loop versioning, so
2397      increase threshold for this case if necessary.  */
2398   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2399     {
2400       poly_uint64 niters_th = 0;
2401
2402       if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2403         {
2404           /* Niters for peeled prolog loop.  */
2405           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2406             {
2407               struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2408               tree vectype
2409                 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2410               niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2411             }
2412           else
2413             niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2414         }
2415
2416       /* Niters for at least one iteration of vectorized loop.  */
2417       if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2418         niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2419       /* One additional iteration because of peeling for gap.  */
2420       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2421         niters_th += 1;
2422       LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2423     }
2424
2425   gcc_assert (known_eq (vectorization_factor,
2426                         LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2427
2428   /* Ok to vectorize!  */
2429   return true;
2430
2431 again:
2432   /* Try again with SLP forced off but if we didn't do any SLP there is
2433      no point in re-trying.  */
2434   if (!slp)
2435     return false;
2436
2437   /* If there are reduction chains re-trying will fail anyway.  */
2438   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2439     return false;
2440
2441   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2442      via interleaving or lane instructions.  */
2443   slp_instance instance;
2444   slp_tree node;
2445   unsigned i, j;
2446   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2447     {
2448       stmt_vec_info vinfo;
2449       vinfo = vinfo_for_stmt
2450           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2451       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2452         continue;
2453       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2454       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2455       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2456       if (! vect_store_lanes_supported (vectype, size, false)
2457           && ! vect_grouped_store_supported (vectype, size))
2458         return false;
2459       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2460         {
2461           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2462           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2463           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2464           size = STMT_VINFO_GROUP_SIZE (vinfo);
2465           vectype = STMT_VINFO_VECTYPE (vinfo);
2466           if (! vect_load_lanes_supported (vectype, size, false)
2467               && ! vect_grouped_load_supported (vectype, single_element_p,
2468                                                 size))
2469             return false;
2470         }
2471     }
2472
2473   if (dump_enabled_p ())
2474     dump_printf_loc (MSG_NOTE, vect_location,
2475                      "re-trying with SLP disabled\n");
2476
2477   /* Roll back state appropriately.  No SLP this time.  */
2478   slp = false;
2479   /* Restore vectorization factor as it were without SLP.  */
2480   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2481   /* Free the SLP instances.  */
2482   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2483     vect_free_slp_instance (instance);
2484   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2485   /* Reset SLP type to loop_vect on all stmts.  */
2486   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2487     {
2488       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2489       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2490            !gsi_end_p (si); gsi_next (&si))
2491         {
2492           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2493           STMT_SLP_TYPE (stmt_info) = loop_vect;
2494         }
2495       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2496            !gsi_end_p (si); gsi_next (&si))
2497         {
2498           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2499           STMT_SLP_TYPE (stmt_info) = loop_vect;
2500           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2501             {
2502               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2503               STMT_SLP_TYPE (stmt_info) = loop_vect;
2504               for (gimple_stmt_iterator pi
2505                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2506                    !gsi_end_p (pi); gsi_next (&pi))
2507                 {
2508                   gimple *pstmt = gsi_stmt (pi);
2509                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2510                 }
2511             }
2512         }
2513     }
2514   /* Free optimized alias test DDRS.  */
2515   LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2516   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2517   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2518   /* Reset target cost data.  */
2519   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2520   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2521     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2522   /* Reset accumulated rgroup information.  */
2523   release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2524   /* Reset assorted flags.  */
2525   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2526   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2527   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2528   LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2529   LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2530
2531   goto start_over;
2532 }
2533
2534 /* Function vect_analyze_loop.
2535
2536    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2537    for it.  The different analyses will record information in the
2538    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2539    be vectorized.  */
2540 loop_vec_info
2541 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2542 {
2543   loop_vec_info loop_vinfo;
2544   auto_vector_sizes vector_sizes;
2545
2546   /* Autodetect first vector size we try.  */
2547   current_vector_size = 0;
2548   targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2549   unsigned int next_size = 0;
2550
2551   if (dump_enabled_p ())
2552     dump_printf_loc (MSG_NOTE, vect_location,
2553                      "===== analyze_loop_nest =====\n");
2554
2555   if (loop_outer (loop)
2556       && loop_vec_info_for_loop (loop_outer (loop))
2557       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2558     {
2559       if (dump_enabled_p ())
2560         dump_printf_loc (MSG_NOTE, vect_location,
2561                          "outer-loop already vectorized.\n");
2562       return NULL;
2563     }
2564
2565   poly_uint64 autodetected_vector_size = 0;
2566   while (1)
2567     {
2568       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2569       loop_vinfo = vect_analyze_loop_form (loop);
2570       if (!loop_vinfo)
2571         {
2572           if (dump_enabled_p ())
2573             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2574                              "bad loop form.\n");
2575           return NULL;
2576         }
2577
2578       bool fatal = false;
2579
2580       if (orig_loop_vinfo)
2581         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2582
2583       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2584         {
2585           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2586
2587           return loop_vinfo;
2588         }
2589
2590       delete loop_vinfo;
2591
2592       if (next_size == 0)
2593         autodetected_vector_size = current_vector_size;
2594
2595       if (next_size < vector_sizes.length ()
2596           && known_eq (vector_sizes[next_size], autodetected_vector_size))
2597         next_size += 1;
2598
2599       if (fatal
2600           || next_size == vector_sizes.length ()
2601           || known_eq (current_vector_size, 0U))
2602         return NULL;
2603
2604       /* Try the next biggest vector size.  */
2605       current_vector_size = vector_sizes[next_size++];
2606       if (dump_enabled_p ())
2607         {
2608           dump_printf_loc (MSG_NOTE, vect_location,
2609                            "***** Re-trying analysis with "
2610                            "vector size ");
2611           dump_dec (MSG_NOTE, current_vector_size);
2612           dump_printf (MSG_NOTE, "\n");
2613         }
2614     }
2615 }
2616
2617 /* Return true if there is an in-order reduction function for CODE, storing
2618    it in *REDUC_FN if so.  */
2619
2620 static bool
2621 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2622 {
2623   switch (code)
2624     {
2625     case PLUS_EXPR:
2626       *reduc_fn = IFN_FOLD_LEFT_PLUS;
2627       return true;
2628
2629     default:
2630       return false;
2631     }
2632 }
2633
2634 /* Function reduction_fn_for_scalar_code
2635
2636    Input:
2637    CODE - tree_code of a reduction operations.
2638
2639    Output:
2640    REDUC_FN - the corresponding internal function to be used to reduce the
2641       vector of partial results into a single scalar result, or IFN_LAST
2642       if the operation is a supported reduction operation, but does not have
2643       such an internal function.
2644
2645    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2646
2647 static bool
2648 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2649 {
2650   switch (code)
2651     {
2652       case MAX_EXPR:
2653         *reduc_fn = IFN_REDUC_MAX;
2654         return true;
2655
2656       case MIN_EXPR:
2657         *reduc_fn = IFN_REDUC_MIN;
2658         return true;
2659
2660       case PLUS_EXPR:
2661         *reduc_fn = IFN_REDUC_PLUS;
2662         return true;
2663
2664       case BIT_AND_EXPR:
2665         *reduc_fn = IFN_REDUC_AND;
2666         return true;
2667
2668       case BIT_IOR_EXPR:
2669         *reduc_fn = IFN_REDUC_IOR;
2670         return true;
2671
2672       case BIT_XOR_EXPR:
2673         *reduc_fn = IFN_REDUC_XOR;
2674         return true;
2675
2676       case MULT_EXPR:
2677       case MINUS_EXPR:
2678         *reduc_fn = IFN_LAST;
2679         return true;
2680
2681       default:
2682        return false;
2683     }
2684 }
2685
2686 /* If there is a neutral value X such that SLP reduction NODE would not
2687    be affected by the introduction of additional X elements, return that X,
2688    otherwise return null.  CODE is the code of the reduction.  REDUC_CHAIN
2689    is true if the SLP statements perform a single reduction, false if each
2690    statement performs an independent reduction.  */
2691
2692 static tree
2693 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2694                               bool reduc_chain)
2695 {
2696   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2697   gimple *stmt = stmts[0];
2698   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2699   tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2700   tree scalar_type = TREE_TYPE (vector_type);
2701   struct loop *loop = gimple_bb (stmt)->loop_father;
2702   gcc_assert (loop);
2703
2704   switch (code)
2705     {
2706     case WIDEN_SUM_EXPR:
2707     case DOT_PROD_EXPR:
2708     case SAD_EXPR:
2709     case PLUS_EXPR:
2710     case MINUS_EXPR:
2711     case BIT_IOR_EXPR:
2712     case BIT_XOR_EXPR:
2713       return build_zero_cst (scalar_type);
2714
2715     case MULT_EXPR:
2716       return build_one_cst (scalar_type);
2717
2718     case BIT_AND_EXPR:
2719       return build_all_ones_cst (scalar_type);
2720
2721     case MAX_EXPR:
2722     case MIN_EXPR:
2723       /* For MIN/MAX the initial values are neutral.  A reduction chain
2724          has only a single initial value, so that value is neutral for
2725          all statements.  */
2726       if (reduc_chain)
2727         return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2728       return NULL_TREE;
2729
2730     default:
2731       return NULL_TREE;
2732     }
2733 }
2734
2735 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2736    STMT is printed with a message MSG. */
2737
2738 static void
2739 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2740 {
2741   dump_printf_loc (msg_type, vect_location, "%s", msg);
2742   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2743 }
2744
2745
2746 /* Detect SLP reduction of the form:
2747
2748    #a1 = phi <a5, a0>
2749    a2 = operation (a1)
2750    a3 = operation (a2)
2751    a4 = operation (a3)
2752    a5 = operation (a4)
2753
2754    #a = phi <a5>
2755
2756    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2757    FIRST_STMT is the first reduction stmt in the chain
2758    (a2 = operation (a1)).
2759
2760    Return TRUE if a reduction chain was detected.  */
2761
2762 static bool
2763 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2764                        gimple *first_stmt)
2765 {
2766   struct loop *loop = (gimple_bb (phi))->loop_father;
2767   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2768   enum tree_code code;
2769   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2770   stmt_vec_info use_stmt_info, current_stmt_info;
2771   tree lhs;
2772   imm_use_iterator imm_iter;
2773   use_operand_p use_p;
2774   int nloop_uses, size = 0, n_out_of_loop_uses;
2775   bool found = false;
2776
2777   if (loop != vect_loop)
2778     return false;
2779
2780   lhs = PHI_RESULT (phi);
2781   code = gimple_assign_rhs_code (first_stmt);
2782   while (1)
2783     {
2784       nloop_uses = 0;
2785       n_out_of_loop_uses = 0;
2786       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2787         {
2788           gimple *use_stmt = USE_STMT (use_p);
2789           if (is_gimple_debug (use_stmt))
2790             continue;
2791
2792           /* Check if we got back to the reduction phi.  */
2793           if (use_stmt == phi)
2794             {
2795               loop_use_stmt = use_stmt;
2796               found = true;
2797               break;
2798             }
2799
2800           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2801             {
2802               loop_use_stmt = use_stmt;
2803               nloop_uses++;
2804             }
2805            else
2806              n_out_of_loop_uses++;
2807
2808            /* There are can be either a single use in the loop or two uses in
2809               phi nodes.  */
2810            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2811              return false;
2812         }
2813
2814       if (found)
2815         break;
2816
2817       /* We reached a statement with no loop uses.  */
2818       if (nloop_uses == 0)
2819         return false;
2820
2821       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2822       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2823         return false;
2824
2825       if (!is_gimple_assign (loop_use_stmt)
2826           || code != gimple_assign_rhs_code (loop_use_stmt)
2827           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2828         return false;
2829
2830       /* Insert USE_STMT into reduction chain.  */
2831       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2832       if (current_stmt)
2833         {
2834           current_stmt_info = vinfo_for_stmt (current_stmt);
2835           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2836           GROUP_FIRST_ELEMENT (use_stmt_info)
2837             = GROUP_FIRST_ELEMENT (current_stmt_info);
2838         }
2839       else
2840         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2841
2842       lhs = gimple_assign_lhs (loop_use_stmt);
2843       current_stmt = loop_use_stmt;
2844       size++;
2845    }
2846
2847   if (!found || loop_use_stmt != phi || size < 2)
2848     return false;
2849
2850   /* Swap the operands, if needed, to make the reduction operand be the second
2851      operand.  */
2852   lhs = PHI_RESULT (phi);
2853   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2854   while (next_stmt)
2855     {
2856       if (gimple_assign_rhs2 (next_stmt) == lhs)
2857         {
2858           tree op = gimple_assign_rhs1 (next_stmt);
2859           gimple *def_stmt = NULL;
2860
2861           if (TREE_CODE (op) == SSA_NAME)
2862             def_stmt = SSA_NAME_DEF_STMT (op);
2863
2864           /* Check that the other def is either defined in the loop
2865              ("vect_internal_def"), or it's an induction (defined by a
2866              loop-header phi-node).  */
2867           if (def_stmt
2868               && gimple_bb (def_stmt)
2869               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2870               && (is_gimple_assign (def_stmt)
2871                   || is_gimple_call (def_stmt)
2872                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2873                            == vect_induction_def
2874                   || (gimple_code (def_stmt) == GIMPLE_PHI
2875                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2876                                   == vect_internal_def
2877                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2878             {
2879               lhs = gimple_assign_lhs (next_stmt);
2880               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2881               continue;
2882             }
2883
2884           return false;
2885         }
2886       else
2887         {
2888           tree op = gimple_assign_rhs2 (next_stmt);
2889           gimple *def_stmt = NULL;
2890
2891           if (TREE_CODE (op) == SSA_NAME)
2892             def_stmt = SSA_NAME_DEF_STMT (op);
2893
2894           /* Check that the other def is either defined in the loop
2895             ("vect_internal_def"), or it's an induction (defined by a
2896             loop-header phi-node).  */
2897           if (def_stmt
2898               && gimple_bb (def_stmt)
2899               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2900               && (is_gimple_assign (def_stmt)
2901                   || is_gimple_call (def_stmt)
2902                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2903                               == vect_induction_def
2904                   || (gimple_code (def_stmt) == GIMPLE_PHI
2905                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2906                                   == vect_internal_def
2907                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2908             {
2909               if (dump_enabled_p ())
2910                 {
2911                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2912                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2913                 }
2914
2915               swap_ssa_operands (next_stmt,
2916                                  gimple_assign_rhs1_ptr (next_stmt),
2917                                  gimple_assign_rhs2_ptr (next_stmt));
2918               update_stmt (next_stmt);
2919
2920               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2921                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2922             }
2923           else
2924             return false;
2925         }
2926
2927       lhs = gimple_assign_lhs (next_stmt);
2928       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2929     }
2930
2931   /* Save the chain for further analysis in SLP detection.  */
2932   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2933   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2934   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2935
2936   return true;
2937 }
2938
2939 /* Return true if we need an in-order reduction for operation CODE
2940    on type TYPE.  NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2941    overflow must wrap.  */
2942
2943 static bool
2944 needs_fold_left_reduction_p (tree type, tree_code code,
2945                              bool need_wrapping_integral_overflow)
2946 {
2947   /* CHECKME: check for !flag_finite_math_only too?  */
2948   if (SCALAR_FLOAT_TYPE_P (type))
2949     switch (code)
2950       {
2951       case MIN_EXPR:
2952       case MAX_EXPR:
2953         return false;
2954
2955       default:
2956         return !flag_associative_math;
2957       }
2958
2959   if (INTEGRAL_TYPE_P (type))
2960     {
2961       if (!operation_no_trapping_overflow (type, code))
2962         return true;
2963       if (need_wrapping_integral_overflow
2964           && !TYPE_OVERFLOW_WRAPS (type)
2965           && operation_can_overflow (code))
2966         return true;
2967       return false;
2968     }
2969
2970   if (SAT_FIXED_POINT_TYPE_P (type))
2971     return true;
2972
2973   return false;
2974 }
2975
2976 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
2977    reduction operation CODE has a handled computation expression.  */
2978
2979 bool
2980 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
2981                       enum tree_code code)
2982 {
2983   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
2984   auto_bitmap visited;
2985   tree lookfor = PHI_RESULT (phi);
2986   ssa_op_iter curri;
2987   use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
2988   while (USE_FROM_PTR (curr) != loop_arg)
2989     curr = op_iter_next_use (&curri);
2990   curri.i = curri.numops;
2991   do
2992     {
2993       path.safe_push (std::make_pair (curri, curr));
2994       tree use = USE_FROM_PTR (curr);
2995       if (use == lookfor)
2996         break;
2997       gimple *def = SSA_NAME_DEF_STMT (use);
2998       if (gimple_nop_p (def)
2999           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3000         {
3001 pop:
3002           do
3003             {
3004               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3005               curri = x.first;
3006               curr = x.second;
3007               do
3008                 curr = op_iter_next_use (&curri);
3009               /* Skip already visited or non-SSA operands (from iterating
3010                  over PHI args).  */
3011               while (curr != NULL_USE_OPERAND_P
3012                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3013                          || ! bitmap_set_bit (visited,
3014                                               SSA_NAME_VERSION
3015                                                 (USE_FROM_PTR (curr)))));
3016             }
3017           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3018           if (curr == NULL_USE_OPERAND_P)
3019             break;
3020         }
3021       else
3022         {
3023           if (gimple_code (def) == GIMPLE_PHI)
3024             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3025           else
3026             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3027           while (curr != NULL_USE_OPERAND_P
3028                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3029                      || ! bitmap_set_bit (visited,
3030                                           SSA_NAME_VERSION
3031                                             (USE_FROM_PTR (curr)))))
3032             curr = op_iter_next_use (&curri);
3033           if (curr == NULL_USE_OPERAND_P)
3034             goto pop;
3035         }
3036     }
3037   while (1);
3038   if (dump_file && (dump_flags & TDF_DETAILS))
3039     {
3040       dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3041       unsigned i;
3042       std::pair<ssa_op_iter, use_operand_p> *x;
3043       FOR_EACH_VEC_ELT (path, i, x)
3044         {
3045           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3046           dump_printf (MSG_NOTE, " ");
3047         }
3048       dump_printf (MSG_NOTE, "\n");
3049     }
3050
3051   /* Check whether the reduction path detected is valid.  */
3052   bool fail = path.length () == 0;
3053   bool neg = false;
3054   for (unsigned i = 1; i < path.length (); ++i)
3055     {
3056       gimple *use_stmt = USE_STMT (path[i].second);
3057       tree op = USE_FROM_PTR (path[i].second);
3058       if (! has_single_use (op)
3059           || ! is_gimple_assign (use_stmt))
3060         {
3061           fail = true;
3062           break;
3063         }
3064       if (gimple_assign_rhs_code (use_stmt) != code)
3065         {
3066           if (code == PLUS_EXPR
3067               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3068             {
3069               /* Track whether we negate the reduction value each iteration.  */
3070               if (gimple_assign_rhs2 (use_stmt) == op)
3071                 neg = ! neg;
3072             }
3073           else
3074             {
3075               fail = true;
3076               break;
3077             }
3078         }
3079     }
3080   return ! fail && ! neg;
3081 }
3082
3083
3084 /* Function vect_is_simple_reduction
3085
3086    (1) Detect a cross-iteration def-use cycle that represents a simple
3087    reduction computation.  We look for the following pattern:
3088
3089    loop_header:
3090      a1 = phi < a0, a2 >
3091      a3 = ...
3092      a2 = operation (a3, a1)
3093
3094    or
3095
3096    a3 = ...
3097    loop_header:
3098      a1 = phi < a0, a2 >
3099      a2 = operation (a3, a1)
3100
3101    such that:
3102    1. operation is commutative and associative and it is safe to
3103       change the order of the computation
3104    2. no uses for a2 in the loop (a2 is used out of the loop)
3105    3. no uses of a1 in the loop besides the reduction operation
3106    4. no uses of a1 outside the loop.
3107
3108    Conditions 1,4 are tested here.
3109    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3110
3111    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3112    nested cycles.
3113
3114    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3115    reductions:
3116
3117      a1 = phi < a0, a2 >
3118      inner loop (def of a3)
3119      a2 = phi < a3 >
3120
3121    (4) Detect condition expressions, ie:
3122      for (int i = 0; i < N; i++)
3123        if (a[i] < val)
3124         ret_val = a[i];
3125
3126 */
3127
3128 static gimple *
3129 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3130                           bool *double_reduc,
3131                           bool need_wrapping_integral_overflow,
3132                           enum vect_reduction_type *v_reduc_type)
3133 {
3134   struct loop *loop = (gimple_bb (phi))->loop_father;
3135   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3136   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3137   enum tree_code orig_code, code;
3138   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3139   tree type;
3140   int nloop_uses;
3141   tree name;
3142   imm_use_iterator imm_iter;
3143   use_operand_p use_p;
3144   bool phi_def;
3145
3146   *double_reduc = false;
3147   *v_reduc_type = TREE_CODE_REDUCTION;
3148
3149   tree phi_name = PHI_RESULT (phi);
3150   /* ???  If there are no uses of the PHI result the inner loop reduction
3151      won't be detected as possibly double-reduction by vectorizable_reduction
3152      because that tries to walk the PHI arg from the preheader edge which
3153      can be constant.  See PR60382.  */
3154   if (has_zero_uses (phi_name))
3155     return NULL;
3156   nloop_uses = 0;
3157   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3158     {
3159       gimple *use_stmt = USE_STMT (use_p);
3160       if (is_gimple_debug (use_stmt))
3161         continue;
3162
3163       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3164         {
3165           if (dump_enabled_p ())
3166             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3167                              "intermediate value used outside loop.\n");
3168
3169           return NULL;
3170         }
3171
3172       nloop_uses++;
3173       if (nloop_uses > 1)
3174         {
3175           if (dump_enabled_p ())
3176             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3177                              "reduction value used in loop.\n");
3178           return NULL;
3179         }
3180
3181       phi_use_stmt = use_stmt;
3182     }
3183
3184   edge latch_e = loop_latch_edge (loop);
3185   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3186   if (TREE_CODE (loop_arg) != SSA_NAME)
3187     {
3188       if (dump_enabled_p ())
3189         {
3190           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3191                            "reduction: not ssa_name: ");
3192           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3193           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3194         }
3195       return NULL;
3196     }
3197
3198   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3199   if (is_gimple_assign (def_stmt))
3200     {
3201       name = gimple_assign_lhs (def_stmt);
3202       phi_def = false;
3203     }
3204   else if (gimple_code (def_stmt) == GIMPLE_PHI)
3205     {
3206       name = PHI_RESULT (def_stmt);
3207       phi_def = true;
3208     }
3209   else
3210     {
3211       if (dump_enabled_p ())
3212         {
3213           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3214                            "reduction: unhandled reduction operation: ");
3215           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3216         }
3217       return NULL;
3218     }
3219
3220   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3221     return NULL;
3222
3223   nloop_uses = 0;
3224   auto_vec<gphi *, 3> lcphis;
3225   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3226     {
3227       gimple *use_stmt = USE_STMT (use_p);
3228       if (is_gimple_debug (use_stmt))
3229         continue;
3230       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3231         nloop_uses++;
3232       else
3233         /* We can have more than one loop-closed PHI.  */
3234         lcphis.safe_push (as_a <gphi *> (use_stmt));
3235       if (nloop_uses > 1)
3236         {
3237           if (dump_enabled_p ())
3238             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3239                              "reduction used in loop.\n");
3240           return NULL;
3241         }
3242     }
3243
3244   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3245      defined in the inner loop.  */
3246   if (phi_def)
3247     {
3248       op1 = PHI_ARG_DEF (def_stmt, 0);
3249
3250       if (gimple_phi_num_args (def_stmt) != 1
3251           || TREE_CODE (op1) != SSA_NAME)
3252         {
3253           if (dump_enabled_p ())
3254             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3255                              "unsupported phi node definition.\n");
3256
3257           return NULL;
3258         }
3259
3260       def1 = SSA_NAME_DEF_STMT (op1);
3261       if (gimple_bb (def1)
3262           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3263           && loop->inner
3264           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3265           && is_gimple_assign (def1)
3266           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3267         {
3268           if (dump_enabled_p ())
3269             report_vect_op (MSG_NOTE, def_stmt,
3270                             "detected double reduction: ");
3271
3272           *double_reduc = true;
3273           return def_stmt;
3274         }
3275
3276       return NULL;
3277     }
3278
3279   /* If we are vectorizing an inner reduction we are executing that
3280      in the original order only in case we are not dealing with a
3281      double reduction.  */
3282   bool check_reduction = true;
3283   if (flow_loop_nested_p (vect_loop, loop))
3284     {
3285       gphi *lcphi;
3286       unsigned i;
3287       check_reduction = false;
3288       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3289         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3290           {
3291             gimple *use_stmt = USE_STMT (use_p);
3292             if (is_gimple_debug (use_stmt))
3293               continue;
3294             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3295               check_reduction = true;
3296           }
3297     }
3298
3299   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3300   code = orig_code = gimple_assign_rhs_code (def_stmt);
3301
3302   /* We can handle "res -= x[i]", which is non-associative by
3303      simply rewriting this into "res += -x[i]".  Avoid changing
3304      gimple instruction for the first simple tests and only do this
3305      if we're allowed to change code at all.  */
3306   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3307     code = PLUS_EXPR;
3308
3309   if (code == COND_EXPR)
3310     {
3311       if (! nested_in_vect_loop)
3312         *v_reduc_type = COND_REDUCTION;
3313
3314       op3 = gimple_assign_rhs1 (def_stmt);
3315       if (COMPARISON_CLASS_P (op3))
3316         {
3317           op4 = TREE_OPERAND (op3, 1);
3318           op3 = TREE_OPERAND (op3, 0);
3319         }
3320       if (op3 == phi_name || op4 == phi_name)
3321         {
3322           if (dump_enabled_p ())
3323             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3324                             "reduction: condition depends on previous"
3325                             " iteration: ");
3326           return NULL;
3327         }
3328
3329       op1 = gimple_assign_rhs2 (def_stmt);
3330       op2 = gimple_assign_rhs3 (def_stmt);
3331     }
3332   else if (!commutative_tree_code (code) || !associative_tree_code (code))
3333     {
3334       if (dump_enabled_p ())
3335         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3336                         "reduction: not commutative/associative: ");
3337       return NULL;
3338     }
3339   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3340     {
3341       op1 = gimple_assign_rhs1 (def_stmt);
3342       op2 = gimple_assign_rhs2 (def_stmt);
3343     }
3344   else
3345     {
3346       if (dump_enabled_p ())
3347         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3348                         "reduction: not handled operation: ");
3349       return NULL;
3350     }
3351
3352   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3353     {
3354       if (dump_enabled_p ())
3355         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3356                         "reduction: both uses not ssa_names: ");
3357
3358       return NULL;
3359     }
3360
3361   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3362   if ((TREE_CODE (op1) == SSA_NAME
3363        && !types_compatible_p (type,TREE_TYPE (op1)))
3364       || (TREE_CODE (op2) == SSA_NAME
3365           && !types_compatible_p (type, TREE_TYPE (op2)))
3366       || (op3 && TREE_CODE (op3) == SSA_NAME
3367           && !types_compatible_p (type, TREE_TYPE (op3)))
3368       || (op4 && TREE_CODE (op4) == SSA_NAME
3369           && !types_compatible_p (type, TREE_TYPE (op4))))
3370     {
3371       if (dump_enabled_p ())
3372         {
3373           dump_printf_loc (MSG_NOTE, vect_location,
3374                            "reduction: multiple types: operation type: ");
3375           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3376           dump_printf (MSG_NOTE, ", operands types: ");
3377           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3378                              TREE_TYPE (op1));
3379           dump_printf (MSG_NOTE, ",");
3380           dump_generic_expr (MSG_NOTE, TDF_SLIM,
3381                              TREE_TYPE (op2));
3382           if (op3)
3383             {
3384               dump_printf (MSG_NOTE, ",");
3385               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3386                                  TREE_TYPE (op3));
3387             }
3388
3389           if (op4)
3390             {
3391               dump_printf (MSG_NOTE, ",");
3392               dump_generic_expr (MSG_NOTE, TDF_SLIM,
3393                                  TREE_TYPE (op4));
3394             }
3395           dump_printf (MSG_NOTE, "\n");
3396         }
3397
3398       return NULL;
3399     }
3400
3401   /* Check whether it's ok to change the order of the computation.
3402      Generally, when vectorizing a reduction we change the order of the
3403      computation.  This may change the behavior of the program in some
3404      cases, so we need to check that this is ok.  One exception is when
3405      vectorizing an outer-loop: the inner-loop is executed sequentially,
3406      and therefore vectorizing reductions in the inner-loop during
3407      outer-loop vectorization is safe.  */
3408   if (check_reduction
3409       && *v_reduc_type == TREE_CODE_REDUCTION
3410       && needs_fold_left_reduction_p (type, code,
3411                                       need_wrapping_integral_overflow))
3412     *v_reduc_type = FOLD_LEFT_REDUCTION;
3413
3414   /* Reduction is safe. We're dealing with one of the following:
3415      1) integer arithmetic and no trapv
3416      2) floating point arithmetic, and special flags permit this optimization
3417      3) nested cycle (i.e., outer loop vectorization).  */
3418   if (TREE_CODE (op1) == SSA_NAME)
3419     def1 = SSA_NAME_DEF_STMT (op1);
3420
3421   if (TREE_CODE (op2) == SSA_NAME)
3422     def2 = SSA_NAME_DEF_STMT (op2);
3423
3424   if (code != COND_EXPR
3425       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3426     {
3427       if (dump_enabled_p ())
3428         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3429       return NULL;
3430     }
3431
3432   /* Check that one def is the reduction def, defined by PHI,
3433      the other def is either defined in the loop ("vect_internal_def"),
3434      or it's an induction (defined by a loop-header phi-node).  */
3435
3436   if (def2 && def2 == phi
3437       && (code == COND_EXPR
3438           || !def1 || gimple_nop_p (def1)
3439           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3440           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3441               && (is_gimple_assign (def1)
3442                   || is_gimple_call (def1)
3443                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3444                       == vect_induction_def
3445                   || (gimple_code (def1) == GIMPLE_PHI
3446                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3447                           == vect_internal_def
3448                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3449     {
3450       if (dump_enabled_p ())
3451         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3452       return def_stmt;
3453     }
3454
3455   if (def1 && def1 == phi
3456       && (code == COND_EXPR
3457           || !def2 || gimple_nop_p (def2)
3458           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3459           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3460               && (is_gimple_assign (def2)
3461                   || is_gimple_call (def2)
3462                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3463                        == vect_induction_def
3464                   || (gimple_code (def2) == GIMPLE_PHI
3465                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3466                            == vect_internal_def
3467                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3468     {
3469       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3470         {
3471           /* Check if we can swap operands (just for simplicity - so that
3472              the rest of the code can assume that the reduction variable
3473              is always the last (second) argument).  */
3474           if (code == COND_EXPR)
3475             {
3476               /* Swap cond_expr by inverting the condition.  */
3477               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3478               enum tree_code invert_code = ERROR_MARK;
3479               enum tree_code cond_code = TREE_CODE (cond_expr);
3480
3481               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3482                 {
3483                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3484                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3485                 }
3486               if (invert_code != ERROR_MARK)
3487                 {
3488                   TREE_SET_CODE (cond_expr, invert_code);
3489                   swap_ssa_operands (def_stmt,
3490                                      gimple_assign_rhs2_ptr (def_stmt),
3491                                      gimple_assign_rhs3_ptr (def_stmt));
3492                 }
3493               else
3494                 {
3495                   if (dump_enabled_p ())
3496                     report_vect_op (MSG_NOTE, def_stmt,
3497                                     "detected reduction: cannot swap operands "
3498                                     "for cond_expr");
3499                   return NULL;
3500                 }
3501             }
3502           else
3503             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3504                                gimple_assign_rhs2_ptr (def_stmt));
3505
3506           if (dump_enabled_p ())
3507             report_vect_op (MSG_NOTE, def_stmt,
3508                             "detected reduction: need to swap operands: ");
3509
3510           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3511             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3512         }
3513       else
3514         {
3515           if (dump_enabled_p ())
3516             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3517         }
3518
3519       return def_stmt;
3520     }
3521
3522   /* Try to find SLP reduction chain.  */
3523   if (! nested_in_vect_loop
3524       && code != COND_EXPR
3525       && orig_code != MINUS_EXPR
3526       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3527     {
3528       if (dump_enabled_p ())
3529         report_vect_op (MSG_NOTE, def_stmt,
3530                         "reduction: detected reduction chain: ");
3531
3532       return def_stmt;
3533     }
3534
3535   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3536   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3537   while (first)
3538     {
3539       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3540       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3541       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3542       first = next;
3543     }
3544
3545   /* Look for the expression computing loop_arg from loop PHI result.  */
3546   if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3547                             code))
3548     return def_stmt;
3549
3550   if (dump_enabled_p ())
3551     {
3552       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3553                       "reduction: unknown pattern: ");
3554     }
3555
3556   return NULL;
3557 }
3558
3559 /* Wrapper around vect_is_simple_reduction, which will modify code
3560    in-place if it enables detection of more reductions.  Arguments
3561    as there.  */
3562
3563 gimple *
3564 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3565                              bool *double_reduc,
3566                              bool need_wrapping_integral_overflow)
3567 {
3568   enum vect_reduction_type v_reduc_type;
3569   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3570                                           need_wrapping_integral_overflow,
3571                                           &v_reduc_type);
3572   if (def)
3573     {
3574       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3575       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3576       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3577       reduc_def_info = vinfo_for_stmt (def);
3578       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3579       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3580     }
3581   return def;
3582 }
3583
3584 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3585 int
3586 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3587                              int *peel_iters_epilogue,
3588                              stmt_vector_for_cost *scalar_cost_vec,
3589                              stmt_vector_for_cost *prologue_cost_vec,
3590                              stmt_vector_for_cost *epilogue_cost_vec)
3591 {
3592   int retval = 0;
3593   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3594
3595   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3596     {
3597       *peel_iters_epilogue = assumed_vf / 2;
3598       if (dump_enabled_p ())
3599         dump_printf_loc (MSG_NOTE, vect_location,
3600                          "cost model: epilogue peel iters set to vf/2 "
3601                          "because loop iterations are unknown .\n");
3602
3603       /* If peeled iterations are known but number of scalar loop
3604          iterations are unknown, count a taken branch per peeled loop.  */
3605       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3606                                  NULL, 0, vect_prologue);
3607       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3608                                  NULL, 0, vect_epilogue);
3609     }
3610   else
3611     {
3612       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3613       peel_iters_prologue = niters < peel_iters_prologue ?
3614                             niters : peel_iters_prologue;
3615       *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
3616       /* If we need to peel for gaps, but no peeling is required, we have to
3617          peel VF iterations.  */
3618       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3619         *peel_iters_epilogue = assumed_vf;
3620     }
3621
3622   stmt_info_for_cost *si;
3623   int j;
3624   if (peel_iters_prologue)
3625     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3626         {
3627           stmt_vec_info stmt_info
3628             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3629           retval += record_stmt_cost (prologue_cost_vec,
3630                                       si->count * peel_iters_prologue,
3631                                       si->kind, stmt_info, si->misalign,
3632                                       vect_prologue);
3633         }
3634   if (*peel_iters_epilogue)
3635     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3636         {
3637           stmt_vec_info stmt_info
3638             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3639           retval += record_stmt_cost (epilogue_cost_vec,
3640                                       si->count * *peel_iters_epilogue,
3641                                       si->kind, stmt_info, si->misalign,
3642                                       vect_epilogue);
3643         }
3644
3645   return retval;
3646 }
3647
3648 /* Function vect_estimate_min_profitable_iters
3649
3650    Return the number of iterations required for the vector version of the
3651    loop to be profitable relative to the cost of the scalar version of the
3652    loop.
3653
3654    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3655    of iterations for vectorization.  -1 value means loop vectorization
3656    is not profitable.  This returned value may be used for dynamic
3657    profitability check.
3658
3659    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3660    for static check against estimated number of iterations.  */
3661
3662 static void
3663 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3664                                     int *ret_min_profitable_niters,
3665                                     int *ret_min_profitable_estimate)
3666 {
3667   int min_profitable_iters;
3668   int min_profitable_estimate;
3669   int peel_iters_prologue;
3670   int peel_iters_epilogue;
3671   unsigned vec_inside_cost = 0;
3672   int vec_outside_cost = 0;
3673   unsigned vec_prologue_cost = 0;
3674   unsigned vec_epilogue_cost = 0;
3675   int scalar_single_iter_cost = 0;
3676   int scalar_outside_cost = 0;
3677   int assumed_vf = vect_vf_for_cost (loop_vinfo);
3678   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3679   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3680
3681   /* Cost model disabled.  */
3682   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3683     {
3684       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3685       *ret_min_profitable_niters = 0;
3686       *ret_min_profitable_estimate = 0;
3687       return;
3688     }
3689
3690   /* Requires loop versioning tests to handle misalignment.  */
3691   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3692     {
3693       /*  FIXME: Make cost depend on complexity of individual check.  */
3694       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3695       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3696                             vect_prologue);
3697       dump_printf (MSG_NOTE,
3698                    "cost model: Adding cost of checks for loop "
3699                    "versioning to treat misalignment.\n");
3700     }
3701
3702   /* Requires loop versioning with alias checks.  */
3703   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3704     {
3705       /*  FIXME: Make cost depend on complexity of individual check.  */
3706       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3707       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3708                             vect_prologue);
3709       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3710       if (len)
3711         /* Count LEN - 1 ANDs and LEN comparisons.  */
3712         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3713                               NULL, 0, vect_prologue);
3714       len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3715       if (len)
3716         {
3717           /* Count LEN - 1 ANDs and LEN comparisons.  */
3718           unsigned int nstmts = len * 2 - 1;
3719           /* +1 for each bias that needs adding.  */
3720           for (unsigned int i = 0; i < len; ++i)
3721             if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3722               nstmts += 1;
3723           (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3724                                 NULL, 0, vect_prologue);
3725         }
3726       dump_printf (MSG_NOTE,
3727                    "cost model: Adding cost of checks for loop "
3728                    "versioning aliasing.\n");
3729     }
3730
3731   /* Requires loop versioning with niter checks.  */
3732   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3733     {
3734       /*  FIXME: Make cost depend on complexity of individual check.  */
3735       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3736                             vect_prologue);
3737       dump_printf (MSG_NOTE,
3738                    "cost model: Adding cost of checks for loop "
3739                    "versioning niters.\n");
3740     }
3741
3742   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3743     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3744                           vect_prologue);
3745
3746   /* Count statements in scalar loop.  Using this as scalar cost for a single
3747      iteration for now.
3748
3749      TODO: Add outer loop support.
3750
3751      TODO: Consider assigning different costs to different scalar
3752      statements.  */
3753
3754   scalar_single_iter_cost
3755     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3756
3757   /* Add additional cost for the peeled instructions in prologue and epilogue
3758      loop.  (For fully-masked loops there will be no peeling.)
3759
3760      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3761      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3762
3763      TODO: Build an expression that represents peel_iters for prologue and
3764      epilogue to be used in a run-time test.  */
3765
3766   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3767     {
3768       peel_iters_prologue = 0;
3769       peel_iters_epilogue = 0;
3770
3771       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3772         {
3773           /* We need to peel exactly one iteration.  */
3774           peel_iters_epilogue += 1;
3775           stmt_info_for_cost *si;
3776           int j;
3777           FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3778                             j, si)
3779             {
3780               struct _stmt_vec_info *stmt_info
3781                 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3782               (void) add_stmt_cost (target_cost_data, si->count,
3783                                     si->kind, stmt_info, si->misalign,
3784                                     vect_epilogue);
3785             }
3786         }
3787     }
3788   else if (npeel < 0)
3789     {
3790       peel_iters_prologue = assumed_vf / 2;
3791       dump_printf (MSG_NOTE, "cost model: "
3792                    "prologue peel iters set to vf/2.\n");
3793
3794       /* If peeling for alignment is unknown, loop bound of main loop becomes
3795          unknown.  */
3796       peel_iters_epilogue = assumed_vf / 2;
3797       dump_printf (MSG_NOTE, "cost model: "
3798                    "epilogue peel iters set to vf/2 because "
3799                    "peeling for alignment is unknown.\n");
3800
3801       /* If peeled iterations are unknown, count a taken branch and a not taken
3802          branch per peeled loop. Even if scalar loop iterations are known,
3803          vector iterations are not known since peeled prologue iterations are
3804          not known. Hence guards remain the same.  */
3805       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3806                             NULL, 0, vect_prologue);
3807       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3808                             NULL, 0, vect_prologue);
3809       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3810                             NULL, 0, vect_epilogue);
3811       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3812                             NULL, 0, vect_epilogue);
3813       stmt_info_for_cost *si;
3814       int j;
3815       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3816         {
3817           struct _stmt_vec_info *stmt_info
3818             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3819           (void) add_stmt_cost (target_cost_data,
3820                                 si->count * peel_iters_prologue,
3821                                 si->kind, stmt_info, si->misalign,
3822                                 vect_prologue);
3823           (void) add_stmt_cost (target_cost_data,
3824                                 si->count * peel_iters_epilogue,
3825                                 si->kind, stmt_info, si->misalign,
3826                                 vect_epilogue);
3827         }
3828     }
3829   else
3830     {
3831       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3832       stmt_info_for_cost *si;
3833       int j;
3834       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3835
3836       prologue_cost_vec.create (2);
3837       epilogue_cost_vec.create (2);
3838       peel_iters_prologue = npeel;
3839
3840       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3841                                           &peel_iters_epilogue,
3842                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3843                                             (loop_vinfo),
3844                                           &prologue_cost_vec,
3845                                           &epilogue_cost_vec);
3846
3847       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3848         {
3849           struct _stmt_vec_info *stmt_info
3850             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3851           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3852                                 si->misalign, vect_prologue);
3853         }
3854
3855       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3856         {
3857           struct _stmt_vec_info *stmt_info
3858             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3859           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3860                                 si->misalign, vect_epilogue);
3861         }
3862
3863       prologue_cost_vec.release ();
3864       epilogue_cost_vec.release ();
3865     }
3866
3867   /* FORNOW: The scalar outside cost is incremented in one of the
3868      following ways:
3869
3870      1. The vectorizer checks for alignment and aliasing and generates
3871      a condition that allows dynamic vectorization.  A cost model
3872      check is ANDED with the versioning condition.  Hence scalar code
3873      path now has the added cost of the versioning check.
3874
3875        if (cost > th & versioning_check)
3876          jmp to vector code
3877
3878      Hence run-time scalar is incremented by not-taken branch cost.
3879
3880      2. The vectorizer then checks if a prologue is required.  If the
3881      cost model check was not done before during versioning, it has to
3882      be done before the prologue check.
3883
3884        if (cost <= th)
3885          prologue = scalar_iters
3886        if (prologue == 0)
3887          jmp to vector code
3888        else
3889          execute prologue
3890        if (prologue == num_iters)
3891          go to exit
3892
3893      Hence the run-time scalar cost is incremented by a taken branch,
3894      plus a not-taken branch, plus a taken branch cost.
3895
3896      3. The vectorizer then checks if an epilogue is required.  If the
3897      cost model check was not done before during prologue check, it
3898      has to be done with the epilogue check.
3899
3900        if (prologue == 0)
3901          jmp to vector code
3902        else
3903          execute prologue
3904        if (prologue == num_iters)
3905          go to exit
3906        vector code:
3907          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3908            jmp to epilogue
3909
3910      Hence the run-time scalar cost should be incremented by 2 taken
3911      branches.
3912
3913      TODO: The back end may reorder the BBS's differently and reverse
3914      conditions/branch directions.  Change the estimates below to
3915      something more reasonable.  */
3916
3917   /* If the number of iterations is known and we do not do versioning, we can
3918      decide whether to vectorize at compile time.  Hence the scalar version
3919      do not carry cost model guard costs.  */
3920   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3921       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3922     {
3923       /* Cost model check occurs at versioning.  */
3924       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3925         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3926       else
3927         {
3928           /* Cost model check occurs at prologue generation.  */
3929           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3930             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3931               + vect_get_stmt_cost (cond_branch_not_taken);
3932           /* Cost model check occurs at epilogue generation.  */
3933           else
3934             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3935         }
3936     }
3937
3938   /* Complete the target-specific cost calculations.  */
3939   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3940                &vec_inside_cost, &vec_epilogue_cost);
3941
3942   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3943
3944   if (dump_enabled_p ())
3945     {
3946       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3947       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3948                    vec_inside_cost);
3949       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3950                    vec_prologue_cost);
3951       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3952                    vec_epilogue_cost);
3953       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3954                    scalar_single_iter_cost);
3955       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3956                    scalar_outside_cost);
3957       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3958                    vec_outside_cost);
3959       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3960                    peel_iters_prologue);
3961       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3962                    peel_iters_epilogue);
3963     }
3964
3965   /* Calculate number of iterations required to make the vector version
3966      profitable, relative to the loop bodies only.  The following condition
3967      must hold true:
3968      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3969      where
3970      SIC = scalar iteration cost, VIC = vector iteration cost,
3971      VOC = vector outside cost, VF = vectorization factor,
3972      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3973      SOC = scalar outside cost for run time cost model check.  */
3974
3975   if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
3976     {
3977       min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
3978                               * assumed_vf
3979                               - vec_inside_cost * peel_iters_prologue
3980                               - vec_inside_cost * peel_iters_epilogue);
3981       if (min_profitable_iters <= 0)
3982         min_profitable_iters = 0;
3983       else
3984         {
3985           min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
3986                                    - vec_inside_cost);
3987
3988           if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
3989               <= (((int) vec_inside_cost * min_profitable_iters)
3990                   + (((int) vec_outside_cost - scalar_outside_cost)
3991                      * assumed_vf)))
3992             min_profitable_iters++;
3993         }
3994     }
3995   /* vector version will never be profitable.  */
3996   else
3997     {
3998       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3999         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
4000                     "did not happen for a simd loop");
4001
4002       if (dump_enabled_p ())
4003         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4004                          "cost model: the vector iteration cost = %d "
4005                          "divided by the scalar iteration cost = %d "
4006                          "is greater or equal to the vectorization factor = %d"
4007                          ".\n",
4008                          vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4009       *ret_min_profitable_niters = -1;
4010       *ret_min_profitable_estimate = -1;
4011       return;
4012     }
4013
4014   dump_printf (MSG_NOTE,
4015                "  Calculated minimum iters for profitability: %d\n",
4016                min_profitable_iters);
4017
4018   if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4019       && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4020     /* We want the vectorized loop to execute at least once.  */
4021     min_profitable_iters = assumed_vf + peel_iters_prologue;
4022
4023   if (dump_enabled_p ())
4024     dump_printf_loc (MSG_NOTE, vect_location,
4025                      "  Runtime profitability threshold = %d\n",
4026                      min_profitable_iters);
4027
4028   *ret_min_profitable_niters = min_profitable_iters;
4029
4030   /* Calculate number of iterations required to make the vector version
4031      profitable, relative to the loop bodies only.
4032
4033      Non-vectorized variant is SIC * niters and it must win over vector
4034      variant on the expected loop trip count.  The following condition must hold true:
4035      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
4036
4037   if (vec_outside_cost <= 0)
4038     min_profitable_estimate = 0;
4039   else
4040     {
4041       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4042                                  * assumed_vf
4043                                  - vec_inside_cost * peel_iters_prologue
4044                                  - vec_inside_cost * peel_iters_epilogue)
4045                                  / ((scalar_single_iter_cost * assumed_vf)
4046                                    - vec_inside_cost);
4047     }
4048   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4049   if (dump_enabled_p ())
4050     dump_printf_loc (MSG_NOTE, vect_location,
4051                      "  Static estimate profitability threshold = %d\n",
4052                      min_profitable_estimate);
4053
4054   *ret_min_profitable_estimate = min_profitable_estimate;
4055 }
4056
4057 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4058    vector elements (not bits) for a vector with NELT elements.  */
4059 static void
4060 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4061                               vec_perm_builder *sel)
4062 {
4063   /* The encoding is a single stepped pattern.  Any wrap-around is handled
4064      by vec_perm_indices.  */
4065   sel->new_vector (nelt, 1, 3);
4066   for (unsigned int i = 0; i < 3; i++)
4067     sel->quick_push (i + offset);
4068 }
4069
4070 /* Checks whether the target supports whole-vector shifts for vectors of mode
4071    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
4072    it supports vec_perm_const with masks for all necessary shift amounts.  */
4073 static bool
4074 have_whole_vector_shift (machine_mode mode)
4075 {
4076   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4077     return true;
4078
4079   /* Variable-length vectors should be handled via the optab.  */
4080   unsigned int nelt;
4081   if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4082     return false;
4083
4084   vec_perm_builder sel;
4085   vec_perm_indices indices;
4086   for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4087     {
4088       calc_vec_perm_mask_for_shift (i, nelt, &sel);
4089       indices.new_vector (sel, 2, nelt);
4090       if (!can_vec_perm_const_p (mode, indices, false))
4091         return false;
4092     }
4093   return true;
4094 }
4095
4096 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4097    functions. Design better to avoid maintenance issues.  */
4098
4099 /* Function vect_model_reduction_cost.
4100
4101    Models cost for a reduction operation, including the vector ops
4102    generated within the strip-mine loop, the initial definition before
4103    the loop, and the epilogue code that must be generated.  */
4104
4105 static void
4106 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4107                            int ncopies)
4108 {
4109   int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4110   enum tree_code code;
4111   optab optab;
4112   tree vectype;
4113   gimple *orig_stmt;
4114   machine_mode mode;
4115   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4116   struct loop *loop = NULL;
4117   void *target_cost_data;
4118
4119   if (loop_vinfo)
4120     {
4121       loop = LOOP_VINFO_LOOP (loop_vinfo);
4122       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4123     }
4124   else
4125     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4126
4127   /* Condition reductions generate two reductions in the loop.  */
4128   vect_reduction_type reduction_type
4129     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4130   if (reduction_type == COND_REDUCTION)
4131     ncopies *= 2;
4132
4133   vectype = STMT_VINFO_VECTYPE (stmt_info);
4134   mode = TYPE_MODE (vectype);
4135   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4136
4137   if (!orig_stmt)
4138     orig_stmt = STMT_VINFO_STMT (stmt_info);
4139
4140   code = gimple_assign_rhs_code (orig_stmt);
4141
4142   if (reduction_type == EXTRACT_LAST_REDUCTION
4143       || reduction_type == FOLD_LEFT_REDUCTION)
4144     {
4145       /* No extra instructions needed in the prologue.  */
4146       prologue_cost = 0;
4147
4148       if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4149         /* Count one reduction-like operation per vector.  */
4150         inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4151                                      stmt_info, 0, vect_body);
4152       else
4153         {
4154           /* Use NELEMENTS extracts and NELEMENTS scalar ops.  */
4155           unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4156           inside_cost = add_stmt_cost (target_cost_data,  nelements,
4157                                        vec_to_scalar, stmt_info, 0,
4158                                        vect_body);
4159           inside_cost += add_stmt_cost (target_cost_data,  nelements,
4160                                         scalar_stmt, stmt_info, 0,
4161                                         vect_body);
4162         }
4163     }
4164   else
4165     {
4166       /* Add in cost for initial definition.
4167          For cond reduction we have four vectors: initial index, step,
4168          initial result of the data reduction, initial value of the index
4169          reduction.  */
4170       int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4171       prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4172                                       scalar_to_vec, stmt_info, 0,
4173                                       vect_prologue);
4174
4175       /* Cost of reduction op inside loop.  */
4176       inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4177                                    stmt_info, 0, vect_body);
4178     }
4179
4180   /* Determine cost of epilogue code.
4181
4182      We have a reduction operator that will reduce the vector in one statement.
4183      Also requires scalar extract.  */
4184
4185   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4186     {
4187       if (reduc_fn != IFN_LAST)
4188         {
4189           if (reduction_type == COND_REDUCTION)
4190             {
4191               /* An EQ stmt and an COND_EXPR stmt.  */
4192               epilogue_cost += add_stmt_cost (target_cost_data, 2,
4193                                               vector_stmt, stmt_info, 0,
4194                                               vect_epilogue);
4195               /* Reduction of the max index and a reduction of the found
4196                  values.  */
4197               epilogue_cost += add_stmt_cost (target_cost_data, 2,
4198                                               vec_to_scalar, stmt_info, 0,
4199                                               vect_epilogue);
4200               /* A broadcast of the max value.  */
4201               epilogue_cost += add_stmt_cost (target_cost_data, 1,
4202                                               scalar_to_vec, stmt_info, 0,
4203                                               vect_epilogue);
4204             }
4205           else
4206             {
4207               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4208                                               stmt_info, 0, vect_epilogue);
4209               epilogue_cost += add_stmt_cost (target_cost_data, 1,
4210                                               vec_to_scalar, stmt_info, 0,
4211                                               vect_epilogue);
4212             }
4213         }
4214       else if (reduction_type == COND_REDUCTION)
4215         {
4216           unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4217           /* Extraction of scalar elements.  */
4218           epilogue_cost += add_stmt_cost (target_cost_data,
4219                                           2 * estimated_nunits,
4220                                           vec_to_scalar, stmt_info, 0,
4221                                           vect_epilogue);
4222           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
4223           epilogue_cost += add_stmt_cost (target_cost_data,
4224                                           2 * estimated_nunits - 3,
4225                                           scalar_stmt, stmt_info, 0,
4226                                           vect_epilogue);
4227         }
4228       else if (reduction_type == EXTRACT_LAST_REDUCTION
4229                || reduction_type == FOLD_LEFT_REDUCTION)
4230         /* No extra instructions need in the epilogue.  */
4231         ;
4232       else
4233         {
4234           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4235           tree bitsize =
4236             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4237           int element_bitsize = tree_to_uhwi (bitsize);
4238           int nelements = vec_size_in_bits / element_bitsize;
4239
4240           if (code == COND_EXPR)
4241             code = MAX_EXPR;
4242
4243           optab = optab_for_tree_code (code, vectype, optab_default);
4244
4245           /* We have a whole vector shift available.  */
4246           if (optab != unknown_optab
4247               && VECTOR_MODE_P (mode)
4248               && optab_handler (optab, mode) != CODE_FOR_nothing
4249               && have_whole_vector_shift (mode))
4250             {
4251               /* Final reduction via vector shifts and the reduction operator.
4252                  Also requires scalar extract.  */
4253               epilogue_cost += add_stmt_cost (target_cost_data,
4254                                               exact_log2 (nelements) * 2,
4255                                               vector_stmt, stmt_info, 0,
4256                                               vect_epilogue);
4257               epilogue_cost += add_stmt_cost (target_cost_data, 1,
4258                                               vec_to_scalar, stmt_info, 0,
4259                                               vect_epilogue);
4260             }
4261           else
4262             /* Use extracts and reduction op for final reduction.  For N
4263                elements, we have N extracts and N-1 reduction ops.  */
4264             epilogue_cost += add_stmt_cost (target_cost_data,
4265                                             nelements + nelements - 1,
4266                                             vector_stmt, stmt_info, 0,
4267                                             vect_epilogue);
4268         }
4269     }
4270
4271   if (dump_enabled_p ())
4272     dump_printf (MSG_NOTE,
4273                  "vect_model_reduction_cost: inside_cost = %d, "
4274                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4275                  prologue_cost, epilogue_cost);
4276 }
4277
4278
4279 /* Function vect_model_induction_cost.
4280
4281    Models cost for induction operations.  */
4282
4283 static void
4284 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4285 {
4286   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4287   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4288   unsigned inside_cost, prologue_cost;
4289
4290   if (PURE_SLP_STMT (stmt_info))
4291     return;
4292
4293   /* loop cost for vec_loop.  */
4294   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4295                                stmt_info, 0, vect_body);
4296
4297   /* prologue cost for vec_init and vec_step.  */
4298   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4299                                  stmt_info, 0, vect_prologue);
4300
4301   if (dump_enabled_p ())
4302     dump_printf_loc (MSG_NOTE, vect_location,
4303                      "vect_model_induction_cost: inside_cost = %d, "
4304                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
4305 }
4306
4307
4308
4309 /* Function get_initial_def_for_reduction
4310
4311    Input:
4312    STMT - a stmt that performs a reduction operation in the loop.
4313    INIT_VAL - the initial value of the reduction variable
4314
4315    Output:
4316    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4317         of the reduction (used for adjusting the epilog - see below).
4318    Return a vector variable, initialized according to the operation that STMT
4319         performs. This vector will be used as the initial value of the
4320         vector of partial results.
4321
4322    Option1 (adjust in epilog): Initialize the vector as follows:
4323      add/bit or/xor:    [0,0,...,0,0]
4324      mult/bit and:      [1,1,...,1,1]
4325      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4326    and when necessary (e.g. add/mult case) let the caller know
4327    that it needs to adjust the result by init_val.
4328
4329    Option2: Initialize the vector as follows:
4330      add/bit or/xor:    [init_val,0,0,...,0]
4331      mult/bit and:      [init_val,1,1,...,1]
4332      min/max/cond_expr: [init_val,init_val,...,init_val]
4333    and no adjustments are needed.
4334
4335    For example, for the following code:
4336
4337    s = init_val;
4338    for (i=0;i<n;i++)
4339      s = s + a[i];
4340
4341    STMT is 's = s + a[i]', and the reduction variable is 's'.
4342    For a vector of 4 units, we want to return either [0,0,0,init_val],
4343    or [0,0,0,0] and let the caller know that it needs to adjust
4344    the result at the end by 'init_val'.
4345
4346    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4347    initialization vector is simpler (same element in all entries), if
4348    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4349
4350    A cost model should help decide between these two schemes.  */
4351
4352 tree
4353 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4354                                tree *adjustment_def)
4355 {
4356   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4357   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4358   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4359   tree scalar_type = TREE_TYPE (init_val);
4360   tree vectype = get_vectype_for_scalar_type (scalar_type);
4361   enum tree_code code = gimple_assign_rhs_code (stmt);
4362   tree def_for_init;
4363   tree init_def;
4364   bool nested_in_vect_loop = false;
4365   REAL_VALUE_TYPE real_init_val = dconst0;
4366   int int_init_val = 0;
4367   gimple *def_stmt = NULL;
4368   gimple_seq stmts = NULL;
4369
4370   gcc_assert (vectype);
4371
4372   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4373               || SCALAR_FLOAT_TYPE_P (scalar_type));
4374
4375   if (nested_in_vect_loop_p (loop, stmt))
4376     nested_in_vect_loop = true;
4377   else
4378     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4379
4380   /* In case of double reduction we only create a vector variable to be put
4381      in the reduction phi node.  The actual statement creation is done in
4382      vect_create_epilog_for_reduction.  */
4383   if (adjustment_def && nested_in_vect_loop
4384       && TREE_CODE (init_val) == SSA_NAME
4385       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4386       && gimple_code (def_stmt) == GIMPLE_PHI
4387       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4388       && vinfo_for_stmt (def_stmt)
4389       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4390           == vect_double_reduction_def)
4391     {
4392       *adjustment_def = NULL;
4393       return vect_create_destination_var (init_val, vectype);
4394     }
4395
4396   vect_reduction_type reduction_type
4397     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4398
4399   /* In case of a nested reduction do not use an adjustment def as
4400      that case is not supported by the epilogue generation correctly
4401      if ncopies is not one.  */
4402   if (adjustment_def && nested_in_vect_loop)
4403     {
4404       *adjustment_def = NULL;
4405       return vect_get_vec_def_for_operand (init_val, stmt);
4406     }
4407
4408   switch (code)
4409     {
4410     case WIDEN_SUM_EXPR:
4411     case DOT_PROD_EXPR:
4412     case SAD_EXPR:
4413     case PLUS_EXPR:
4414     case MINUS_EXPR:
4415     case BIT_IOR_EXPR:
4416     case BIT_XOR_EXPR:
4417     case MULT_EXPR:
4418     case BIT_AND_EXPR:
4419       {
4420         /* ADJUSTMENT_DEF is NULL when called from
4421            vect_create_epilog_for_reduction to vectorize double reduction.  */
4422         if (adjustment_def)
4423           *adjustment_def = init_val;
4424
4425         if (code == MULT_EXPR)
4426           {
4427             real_init_val = dconst1;
4428             int_init_val = 1;
4429           }
4430
4431         if (code == BIT_AND_EXPR)
4432           int_init_val = -1;
4433
4434         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4435           def_for_init = build_real (scalar_type, real_init_val);
4436         else
4437           def_for_init = build_int_cst (scalar_type, int_init_val);
4438
4439         if (adjustment_def)
4440           /* Option1: the first element is '0' or '1' as well.  */
4441           init_def = gimple_build_vector_from_val (&stmts, vectype,
4442                                                    def_for_init);
4443         else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4444           {
4445             /* Option2 (variable length): the first element is INIT_VAL.  */
4446             init_def = build_vector_from_val (vectype, def_for_init);
4447             gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4448                                                       2, init_def, init_val);
4449             init_def = make_ssa_name (vectype);
4450             gimple_call_set_lhs (call, init_def);
4451             gimple_seq_add_stmt (&stmts, call);
4452           }
4453         else
4454           {
4455             /* Option2: the first element is INIT_VAL.  */
4456             tree_vector_builder elts (vectype, 1, 2);
4457             elts.quick_push (init_val);
4458             elts.quick_push (def_for_init);
4459             init_def = gimple_build_vector (&stmts, &elts);
4460           }
4461       }
4462       break;
4463
4464     case MIN_EXPR:
4465     case MAX_EXPR:
4466     case COND_EXPR:
4467       {
4468         if (adjustment_def)
4469           {
4470             *adjustment_def = NULL_TREE;
4471             if (reduction_type != COND_REDUCTION
4472                 && reduction_type != EXTRACT_LAST_REDUCTION)
4473               {
4474                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4475                 break;
4476               }
4477           }
4478         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4479         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4480       }
4481       break;
4482
4483     default:
4484       gcc_unreachable ();
4485     }
4486
4487   if (stmts)
4488     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4489   return init_def;
4490 }
4491
4492 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4493    NUMBER_OF_VECTORS is the number of vector defs to create.
4494    If NEUTRAL_OP is nonnull, introducing extra elements of that
4495    value will not change the result.  */
4496
4497 static void
4498 get_initial_defs_for_reduction (slp_tree slp_node,
4499                                 vec<tree> *vec_oprnds,
4500                                 unsigned int number_of_vectors,
4501                                 bool reduc_chain, tree neutral_op)
4502 {
4503   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4504   gimple *stmt = stmts[0];
4505   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4506   unsigned HOST_WIDE_INT nunits;
4507   unsigned j, number_of_places_left_in_vector;
4508   tree vector_type;
4509   tree vop;
4510   int group_size = stmts.length ();
4511   unsigned int vec_num, i;
4512   unsigned number_of_copies = 1;
4513   vec<tree> voprnds;
4514   voprnds.create (number_of_vectors);
4515   struct loop *loop;
4516   auto_vec<tree, 16> permute_results;
4517
4518   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4519
4520   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4521
4522   loop = (gimple_bb (stmt))->loop_father;
4523   gcc_assert (loop);
4524   edge pe = loop_preheader_edge (loop);
4525
4526   gcc_assert (!reduc_chain || neutral_op);
4527
4528   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4529      created vectors. It is greater than 1 if unrolling is performed.
4530
4531      For example, we have two scalar operands, s1 and s2 (e.g., group of
4532      strided accesses of size two), while NUNITS is four (i.e., four scalars
4533      of this type can be packed in a vector).  The output vector will contain
4534      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4535      will be 2).
4536
4537      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4538      containing the operands.
4539
4540      For example, NUNITS is four as before, and the group size is 8
4541      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4542      {s5, s6, s7, s8}.  */
4543
4544   if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4545     nunits = group_size;
4546
4547   number_of_copies = nunits * number_of_vectors / group_size;
4548
4549   number_of_places_left_in_vector = nunits;
4550   bool constant_p = true;
4551   tree_vector_builder elts (vector_type, nunits, 1);
4552   elts.quick_grow (nunits);
4553   for (j = 0; j < number_of_copies; j++)
4554     {
4555       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4556         {
4557           tree op;
4558           /* Get the def before the loop.  In reduction chain we have only
4559              one initial value.  */
4560           if ((j != (number_of_copies - 1)
4561                || (reduc_chain && i != 0))
4562               && neutral_op)
4563             op = neutral_op;
4564           else
4565             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4566
4567           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4568           number_of_places_left_in_vector--;
4569           elts[number_of_places_left_in_vector] = op;
4570           if (!CONSTANT_CLASS_P (op))
4571             constant_p = false;
4572
4573           if (number_of_places_left_in_vector == 0)
4574             {
4575               gimple_seq ctor_seq = NULL;
4576               tree init;
4577               if (constant_p && !neutral_op
4578                   ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4579                   : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4580                 /* Build the vector directly from ELTS.  */
4581                 init = gimple_build_vector (&ctor_seq, &elts);
4582               else if (neutral_op)
4583                 {
4584                   /* Build a vector of the neutral value and shift the
4585                      other elements into place.  */
4586                   init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4587                                                        neutral_op);
4588                   int k = nunits;
4589                   while (k > 0 && elts[k - 1] == neutral_op)
4590                     k -= 1;
4591                   while (k > 0)
4592                     {
4593                       k -= 1;
4594                       gcall *call = gimple_build_call_internal
4595                         (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4596                       init = make_ssa_name (vector_type);
4597                       gimple_call_set_lhs (call, init);
4598                       gimple_seq_add_stmt (&ctor_seq, call);
4599                     }
4600                 }
4601               else
4602                 {
4603                   /* First time round, duplicate ELTS to fill the
4604                      required number of vectors, then cherry pick the
4605                      appropriate result for each iteration.  */
4606                   if (vec_oprnds->is_empty ())
4607                     duplicate_and_interleave (&ctor_seq, vector_type, elts,
4608                                               number_of_vectors,
4609                                               permute_results);
4610                   init = permute_results[number_of_vectors - j - 1];
4611                 }
4612               if (ctor_seq != NULL)
4613                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4614               voprnds.quick_push (init);
4615
4616               number_of_places_left_in_vector = nunits;
4617               elts.new_vector (vector_type, nunits, 1);
4618               elts.quick_grow (nunits);
4619               constant_p = true;
4620             }
4621         }
4622     }
4623
4624   /* Since the vectors are created in the reverse order, we should invert
4625      them.  */
4626   vec_num = voprnds.length ();
4627   for (j = vec_num; j != 0; j--)
4628     {
4629       vop = voprnds[j - 1];
4630       vec_oprnds->quick_push (vop);
4631     }
4632
4633   voprnds.release ();
4634
4635   /* In case that VF is greater than the unrolling factor needed for the SLP
4636      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4637      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4638      to replicate the vectors.  */
4639   tree neutral_vec = NULL;
4640   while (number_of_vectors > vec_oprnds->length ())
4641     {
4642       if (neutral_op)
4643         {
4644           if (!neutral_vec)
4645             {
4646               gimple_seq ctor_seq = NULL;
4647               neutral_vec = gimple_build_vector_from_val
4648                 (&ctor_seq, vector_type, neutral_op);
4649               if (ctor_seq != NULL)
4650                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4651             }
4652           vec_oprnds->quick_push (neutral_vec);
4653         }
4654       else
4655         {
4656           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4657             vec_oprnds->quick_push (vop);
4658         }
4659     }
4660 }
4661
4662
4663 /* Function vect_create_epilog_for_reduction
4664
4665    Create code at the loop-epilog to finalize the result of a reduction
4666    computation.
4667
4668    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4669      reduction statements.
4670    STMT is the scalar reduction stmt that is being vectorized.
4671    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4672      number of elements that we can fit in a vectype (nunits).  In this case
4673      we have to generate more than one vector stmt - i.e - we need to "unroll"
4674      the vector stmt by a factor VF/nunits.  For more details see documentation
4675      in vectorizable_operation.
4676    REDUC_FN is the internal function for the epilog reduction.
4677    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4678      computation.
4679    REDUC_INDEX is the index of the operand in the right hand side of the
4680      statement that is defined by REDUCTION_PHI.
4681    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4682    SLP_NODE is an SLP node containing a group of reduction statements. The
4683      first one in this group is STMT.
4684    INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4685      when the COND_EXPR is never true in the loop.  For MAX_EXPR, it needs to
4686      be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4687      any value of the IV in the loop.
4688    INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4689    NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4690      null if this is not an SLP reduction
4691
4692    This function:
4693    1. Creates the reduction def-use cycles: sets the arguments for
4694       REDUCTION_PHIS:
4695       The loop-entry argument is the vectorized initial-value of the reduction.
4696       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4697       sums.
4698    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4699       by calling the function specified by REDUC_FN if available, or by
4700       other means (whole-vector shifts or a scalar loop).
4701       The function also creates a new phi node at the loop exit to preserve
4702       loop-closed form, as illustrated below.
4703
4704      The flow at the entry to this function:
4705
4706         loop:
4707           vec_def = phi <null, null>            # REDUCTION_PHI
4708           VECT_DEF = vector_stmt                # vectorized form of STMT
4709           s_loop = scalar_stmt                  # (scalar) STMT
4710         loop_exit:
4711           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4712           use <s_out0>
4713           use <s_out0>
4714
4715      The above is transformed by this function into:
4716
4717         loop:
4718           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4719           VECT_DEF = vector_stmt                # vectorized form of STMT
4720           s_loop = scalar_stmt                  # (scalar) STMT
4721         loop_exit:
4722           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4723           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4724           v_out2 = reduce <v_out1>
4725           s_out3 = extract_field <v_out2, 0>
4726           s_out4 = adjust_result <s_out3>
4727           use <s_out4>
4728           use <s_out4>
4729 */
4730
4731 static void
4732 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4733                                   gimple *reduc_def_stmt,
4734                                   int ncopies, internal_fn reduc_fn,
4735                                   vec<gimple *> reduction_phis,
4736                                   bool double_reduc,
4737                                   slp_tree slp_node,
4738                                   slp_instance slp_node_instance,
4739                                   tree induc_val, enum tree_code induc_code,
4740                                   tree neutral_op)
4741 {
4742   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4743   stmt_vec_info prev_phi_info;
4744   tree vectype;
4745   machine_mode mode;
4746   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4747   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4748   basic_block exit_bb;
4749   tree scalar_dest;
4750   tree scalar_type;
4751   gimple *new_phi = NULL, *phi;
4752   gimple_stmt_iterator exit_gsi;
4753   tree vec_dest;
4754   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4755   gimple *epilog_stmt = NULL;
4756   enum tree_code code = gimple_assign_rhs_code (stmt);
4757   gimple *exit_phi;
4758   tree bitsize;
4759   tree adjustment_def = NULL;
4760   tree vec_initial_def = NULL;
4761   tree expr, def, initial_def = NULL;
4762   tree orig_name, scalar_result;
4763   imm_use_iterator imm_iter, phi_imm_iter;
4764   use_operand_p use_p, phi_use_p;
4765   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4766   bool nested_in_vect_loop = false;
4767   auto_vec<gimple *> new_phis;
4768   auto_vec<gimple *> inner_phis;
4769   enum vect_def_type dt = vect_unknown_def_type;
4770   int j, i;
4771   auto_vec<tree> scalar_results;
4772   unsigned int group_size = 1, k, ratio;
4773   auto_vec<tree> vec_initial_defs;
4774   auto_vec<gimple *> phis;
4775   bool slp_reduc = false;
4776   bool direct_slp_reduc;
4777   tree new_phi_result;
4778   gimple *inner_phi = NULL;
4779   tree induction_index = NULL_TREE;
4780
4781   if (slp_node)
4782     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4783
4784   if (nested_in_vect_loop_p (loop, stmt))
4785     {
4786       outer_loop = loop;
4787       loop = loop->inner;
4788       nested_in_vect_loop = true;
4789       gcc_assert (!slp_node);
4790     }
4791
4792   vectype = STMT_VINFO_VECTYPE (stmt_info);
4793   gcc_assert (vectype);
4794   mode = TYPE_MODE (vectype);
4795
4796   /* 1. Create the reduction def-use cycle:
4797      Set the arguments of REDUCTION_PHIS, i.e., transform
4798
4799         loop:
4800           vec_def = phi <null, null>            # REDUCTION_PHI
4801           VECT_DEF = vector_stmt                # vectorized form of STMT
4802           ...
4803
4804      into:
4805
4806         loop:
4807           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4808           VECT_DEF = vector_stmt                # vectorized form of STMT
4809           ...
4810
4811      (in case of SLP, do it for all the phis). */
4812
4813   /* Get the loop-entry arguments.  */
4814   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4815   if (slp_node)
4816     {
4817       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4818       vec_initial_defs.reserve (vec_num);
4819       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4820                                       &vec_initial_defs, vec_num,
4821                                       GROUP_FIRST_ELEMENT (stmt_info),
4822                                       neutral_op);
4823     }
4824   else
4825     {
4826       /* Get at the scalar def before the loop, that defines the initial value
4827          of the reduction variable.  */
4828       gimple *def_stmt;
4829       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4830                                            loop_preheader_edge (loop));
4831       /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4832          and we can't use zero for induc_val, use initial_def.  Similarly
4833          for REDUC_MIN and initial_def larger than the base.  */
4834       if (TREE_CODE (initial_def) == INTEGER_CST
4835           && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4836               == INTEGER_INDUC_COND_REDUCTION)
4837           && !integer_zerop (induc_val)
4838           && ((induc_code == MAX_EXPR
4839                && tree_int_cst_lt (initial_def, induc_val))
4840               || (induc_code == MIN_EXPR
4841                   && tree_int_cst_lt (induc_val, initial_def))))
4842         induc_val = initial_def;
4843       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4844       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4845                                                        &adjustment_def);
4846       vec_initial_defs.create (1);
4847       vec_initial_defs.quick_push (vec_initial_def);
4848     }
4849
4850   /* Set phi nodes arguments.  */
4851   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4852     {
4853       tree vec_init_def = vec_initial_defs[i];
4854       tree def = vect_defs[i];
4855       for (j = 0; j < ncopies; j++)
4856         {
4857           if (j != 0)
4858             {
4859               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4860               if (nested_in_vect_loop)
4861                 vec_init_def
4862                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4863                                                     vec_init_def);
4864             }
4865
4866           /* Set the loop-entry arg of the reduction-phi.  */
4867
4868           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4869               == INTEGER_INDUC_COND_REDUCTION)
4870             {
4871               /* Initialise the reduction phi to zero.  This prevents initial
4872                  values of non-zero interferring with the reduction op.  */
4873               gcc_assert (ncopies == 1);
4874               gcc_assert (i == 0);
4875
4876               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4877               tree induc_val_vec
4878                 = build_vector_from_val (vec_init_def_type, induc_val);
4879
4880               add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4881                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4882             }
4883           else
4884             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4885                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4886
4887           /* Set the loop-latch arg for the reduction-phi.  */
4888           if (j > 0)
4889             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4890
4891           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4892                        UNKNOWN_LOCATION);
4893
4894           if (dump_enabled_p ())
4895             {
4896               dump_printf_loc (MSG_NOTE, vect_location,
4897                                "transform reduction: created def-use cycle: ");
4898               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4899               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4900             }
4901         }
4902     }
4903
4904   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4905      which is updated with the current index of the loop for every match of
4906      the original loop's cond_expr (VEC_STMT).  This results in a vector
4907      containing the last time the condition passed for that vector lane.
4908      The first match will be a 1 to allow 0 to be used for non-matching
4909      indexes.  If there are no matches at all then the vector will be all
4910      zeroes.  */
4911   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4912     {
4913       tree indx_before_incr, indx_after_incr;
4914       poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4915
4916       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4917       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4918
4919       int scalar_precision
4920         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4921       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4922       tree cr_index_vector_type = build_vector_type
4923         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4924
4925       /* First we create a simple vector induction variable which starts
4926          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4927          vector size (STEP).  */
4928
4929       /* Create a {1,2,3,...} vector.  */
4930       tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4931
4932       /* Create a vector of the step value.  */
4933       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4934       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4935
4936       /* Create an induction variable.  */
4937       gimple_stmt_iterator incr_gsi;
4938       bool insert_after;
4939       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4940       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4941                  insert_after, &indx_before_incr, &indx_after_incr);
4942
4943       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4944          filled with zeros (VEC_ZERO).  */
4945
4946       /* Create a vector of 0s.  */
4947       tree zero = build_zero_cst (cr_index_scalar_type);
4948       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4949
4950       /* Create a vector phi node.  */
4951       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4952       new_phi = create_phi_node (new_phi_tree, loop->header);
4953       set_vinfo_for_stmt (new_phi,
4954                           new_stmt_vec_info (new_phi, loop_vinfo));
4955       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4956                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4957
4958       /* Now take the condition from the loops original cond_expr
4959          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4960          every match uses values from the induction variable
4961          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4962          (NEW_PHI_TREE).
4963          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4964          the new cond_expr (INDEX_COND_EXPR).  */
4965
4966       /* Duplicate the condition from vec_stmt.  */
4967       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4968
4969       /* Create a conditional, where the condition is taken from vec_stmt
4970          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4971          else is the phi (NEW_PHI_TREE).  */
4972       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4973                                      ccompare, indx_before_incr,
4974                                      new_phi_tree);
4975       induction_index = make_ssa_name (cr_index_vector_type);
4976       gimple *index_condition = gimple_build_assign (induction_index,
4977                                                      index_cond_expr);
4978       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4979       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4980                                                         loop_vinfo);
4981       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4982       set_vinfo_for_stmt (index_condition, index_vec_info);
4983
4984       /* Update the phi with the vec cond.  */
4985       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4986                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4987     }
4988
4989   /* 2. Create epilog code.
4990         The reduction epilog code operates across the elements of the vector
4991         of partial results computed by the vectorized loop.
4992         The reduction epilog code consists of:
4993
4994         step 1: compute the scalar result in a vector (v_out2)
4995         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4996         step 3: adjust the scalar result (s_out3) if needed.
4997
4998         Step 1 can be accomplished using one the following three schemes:
4999           (scheme 1) using reduc_fn, if available.
5000           (scheme 2) using whole-vector shifts, if available.
5001           (scheme 3) using a scalar loop. In this case steps 1+2 above are
5002                      combined.
5003
5004           The overall epilog code looks like this:
5005
5006           s_out0 = phi <s_loop>         # original EXIT_PHI
5007           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
5008           v_out2 = reduce <v_out1>              # step 1
5009           s_out3 = extract_field <v_out2, 0>    # step 2
5010           s_out4 = adjust_result <s_out3>       # step 3
5011
5012           (step 3 is optional, and steps 1 and 2 may be combined).
5013           Lastly, the uses of s_out0 are replaced by s_out4.  */
5014
5015
5016   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5017          v_out1 = phi <VECT_DEF>
5018          Store them in NEW_PHIS.  */
5019
5020   exit_bb = single_exit (loop)->dest;
5021   prev_phi_info = NULL;
5022   new_phis.create (vect_defs.length ());
5023   FOR_EACH_VEC_ELT (vect_defs, i, def)
5024     {
5025       for (j = 0; j < ncopies; j++)
5026         {
5027           tree new_def = copy_ssa_name (def);
5028           phi = create_phi_node (new_def, exit_bb);
5029           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
5030           if (j == 0)
5031             new_phis.quick_push (phi);
5032           else
5033             {
5034               def = vect_get_vec_def_for_stmt_copy (dt, def);
5035               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5036             }
5037
5038           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5039           prev_phi_info = vinfo_for_stmt (phi);
5040         }
5041     }
5042
5043   /* The epilogue is created for the outer-loop, i.e., for the loop being
5044      vectorized.  Create exit phis for the outer loop.  */
5045   if (double_reduc)
5046     {
5047       loop = outer_loop;
5048       exit_bb = single_exit (loop)->dest;
5049       inner_phis.create (vect_defs.length ());
5050       FOR_EACH_VEC_ELT (new_phis, i, phi)
5051         {
5052           tree new_result = copy_ssa_name (PHI_RESULT (phi));
5053           gphi *outer_phi = create_phi_node (new_result, exit_bb);
5054           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5055                            PHI_RESULT (phi));
5056           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5057                                                             loop_vinfo));
5058           inner_phis.quick_push (phi);
5059           new_phis[i] = outer_phi;
5060           prev_phi_info = vinfo_for_stmt (outer_phi);
5061           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5062             {
5063               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5064               new_result = copy_ssa_name (PHI_RESULT (phi));
5065               outer_phi = create_phi_node (new_result, exit_bb);
5066               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5067                                PHI_RESULT (phi));
5068               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5069                                                                 loop_vinfo));
5070               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5071               prev_phi_info = vinfo_for_stmt (outer_phi);
5072             }
5073         }
5074     }
5075
5076   exit_gsi = gsi_after_labels (exit_bb);
5077
5078   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5079          (i.e. when reduc_fn is not available) and in the final adjustment
5080          code (if needed).  Also get the original scalar reduction variable as
5081          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
5082          represents a reduction pattern), the tree-code and scalar-def are
5083          taken from the original stmt that the pattern-stmt (STMT) replaces.
5084          Otherwise (it is a regular reduction) - the tree-code and scalar-def
5085          are taken from STMT.  */
5086
5087   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5088   if (!orig_stmt)
5089     {
5090       /* Regular reduction  */
5091       orig_stmt = stmt;
5092     }
5093   else
5094     {
5095       /* Reduction pattern  */
5096       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5097       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5098       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5099     }
5100
5101   code = gimple_assign_rhs_code (orig_stmt);
5102   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5103      partial results are added and not subtracted.  */
5104   if (code == MINUS_EXPR)
5105     code = PLUS_EXPR;
5106
5107   scalar_dest = gimple_assign_lhs (orig_stmt);
5108   scalar_type = TREE_TYPE (scalar_dest);
5109   scalar_results.create (group_size);
5110   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5111   bitsize = TYPE_SIZE (scalar_type);
5112
5113   /* In case this is a reduction in an inner-loop while vectorizing an outer
5114      loop - we don't need to extract a single scalar result at the end of the
5115      inner-loop (unless it is double reduction, i.e., the use of reduction is
5116      outside the outer-loop).  The final vector of partial results will be used
5117      in the vectorized outer-loop, or reduced to a scalar result at the end of
5118      the outer-loop.  */
5119   if (nested_in_vect_loop && !double_reduc)
5120     goto vect_finalize_reduction;
5121
5122   /* SLP reduction without reduction chain, e.g.,
5123      # a1 = phi <a2, a0>
5124      # b1 = phi <b2, b0>
5125      a2 = operation (a1)
5126      b2 = operation (b1)  */
5127   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5128
5129   /* True if we should implement SLP_REDUC using native reduction operations
5130      instead of scalar operations.  */
5131   direct_slp_reduc = (reduc_fn != IFN_LAST
5132                       && slp_reduc
5133                       && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5134
5135   /* In case of reduction chain, e.g.,
5136      # a1 = phi <a3, a0>
5137      a2 = operation (a1)
5138      a3 = operation (a2),
5139
5140      we may end up with more than one vector result.  Here we reduce them to
5141      one vector.  */
5142   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5143     {
5144       tree first_vect = PHI_RESULT (new_phis[0]);
5145       gassign *new_vec_stmt = NULL;
5146       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5147       for (k = 1; k < new_phis.length (); k++)
5148         {
5149           gimple *next_phi = new_phis[k];
5150           tree second_vect = PHI_RESULT (next_phi);
5151           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5152           new_vec_stmt = gimple_build_assign (tem, code,
5153                                               first_vect, second_vect);
5154           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5155           first_vect = tem;
5156         }
5157
5158       new_phi_result = first_vect;
5159       if (new_vec_stmt)
5160         {
5161           new_phis.truncate (0);
5162           new_phis.safe_push (new_vec_stmt);
5163         }
5164     }
5165   /* Likewise if we couldn't use a single defuse cycle.  */
5166   else if (ncopies > 1)
5167     {
5168       gcc_assert (new_phis.length () == 1);
5169       tree first_vect = PHI_RESULT (new_phis[0]);
5170       gassign *new_vec_stmt = NULL;
5171       vec_dest = vect_create_destination_var (scalar_dest, vectype);
5172       gimple *next_phi = new_phis[0];
5173       for (int k = 1; k < ncopies; ++k)
5174         {
5175           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5176           tree second_vect = PHI_RESULT (next_phi);
5177           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5178           new_vec_stmt = gimple_build_assign (tem, code,
5179                                               first_vect, second_vect);
5180           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5181           first_vect = tem;
5182         }
5183       new_phi_result = first_vect;
5184       new_phis.truncate (0);
5185       new_phis.safe_push (new_vec_stmt);
5186     }
5187   else
5188     new_phi_result = PHI_RESULT (new_phis[0]);
5189
5190   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5191       && reduc_fn != IFN_LAST)
5192     {
5193       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5194          various data values where the condition matched and another vector
5195          (INDUCTION_INDEX) containing all the indexes of those matches.  We
5196          need to extract the last matching index (which will be the index with
5197          highest value) and use this to index into the data vector.
5198          For the case where there were no matches, the data vector will contain
5199          all default values and the index vector will be all zeros.  */
5200
5201       /* Get various versions of the type of the vector of indexes.  */
5202       tree index_vec_type = TREE_TYPE (induction_index);
5203       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5204       tree index_scalar_type = TREE_TYPE (index_vec_type);
5205       tree index_vec_cmp_type = build_same_sized_truth_vector_type
5206         (index_vec_type);
5207
5208       /* Get an unsigned integer version of the type of the data vector.  */
5209       int scalar_precision
5210         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5211       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5212       tree vectype_unsigned = build_vector_type
5213         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5214
5215       /* First we need to create a vector (ZERO_VEC) of zeros and another
5216          vector (MAX_INDEX_VEC) filled with the last matching index, which we
5217          can create using a MAX reduction and then expanding.
5218          In the case where the loop never made any matches, the max index will
5219          be zero.  */
5220
5221       /* Vector of {0, 0, 0,...}.  */
5222       tree zero_vec = make_ssa_name (vectype);
5223       tree zero_vec_rhs = build_zero_cst (vectype);
5224       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5225       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5226
5227       /* Find maximum value from the vector of found indexes.  */
5228       tree max_index = make_ssa_name (index_scalar_type);
5229       gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5230                                                           1, induction_index);
5231       gimple_call_set_lhs (max_index_stmt, max_index);
5232       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5233
5234       /* Vector of {max_index, max_index, max_index,...}.  */
5235       tree max_index_vec = make_ssa_name (index_vec_type);
5236       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5237                                                       max_index);
5238       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5239                                                         max_index_vec_rhs);
5240       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5241
5242       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5243          with the vector (INDUCTION_INDEX) of found indexes, choosing values
5244          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5245          otherwise.  Only one value should match, resulting in a vector
5246          (VEC_COND) with one data value and the rest zeros.
5247          In the case where the loop never made any matches, every index will
5248          match, resulting in a vector with all data values (which will all be
5249          the default value).  */
5250
5251       /* Compare the max index vector to the vector of found indexes to find
5252          the position of the max value.  */
5253       tree vec_compare = make_ssa_name (index_vec_cmp_type);
5254       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5255                                                       induction_index,
5256                                                       max_index_vec);
5257       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5258
5259       /* Use the compare to choose either values from the data vector or
5260          zero.  */
5261       tree vec_cond = make_ssa_name (vectype);
5262       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5263                                                    vec_compare, new_phi_result,
5264                                                    zero_vec);
5265       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5266
5267       /* Finally we need to extract the data value from the vector (VEC_COND)
5268          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
5269          reduction, but because this doesn't exist, we can use a MAX reduction
5270          instead.  The data value might be signed or a float so we need to cast
5271          it first.
5272          In the case where the loop never made any matches, the data values are
5273          all identical, and so will reduce down correctly.  */
5274
5275       /* Make the matched data values unsigned.  */
5276       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5277       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5278                                        vec_cond);
5279       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5280                                                         VIEW_CONVERT_EXPR,
5281                                                         vec_cond_cast_rhs);
5282       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5283
5284       /* Reduce down to a scalar value.  */
5285       tree data_reduc = make_ssa_name (scalar_type_unsigned);
5286       gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5287                                                            1, vec_cond_cast);
5288       gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5289       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5290
5291       /* Convert the reduced value back to the result type and set as the
5292          result.  */
5293       gimple_seq stmts = NULL;
5294       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5295                                data_reduc);
5296       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5297       scalar_results.safe_push (new_temp);
5298     }
5299   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5300            && reduc_fn == IFN_LAST)
5301     {
5302       /* Condition reduction without supported IFN_REDUC_MAX.  Generate
5303          idx = 0;
5304          idx_val = induction_index[0];
5305          val = data_reduc[0];
5306          for (idx = 0, val = init, i = 0; i < nelts; ++i)
5307            if (induction_index[i] > idx_val)
5308              val = data_reduc[i], idx_val = induction_index[i];
5309          return val;  */
5310
5311       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5312       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5313       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5314       poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5315       /* Enforced by vectorizable_reduction, which ensures we have target
5316          support before allowing a conditional reduction on variable-length
5317          vectors.  */
5318       unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5319       tree idx_val = NULL_TREE, val = NULL_TREE;
5320       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5321         {
5322           tree old_idx_val = idx_val;
5323           tree old_val = val;
5324           idx_val = make_ssa_name (idx_eltype);
5325           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5326                                              build3 (BIT_FIELD_REF, idx_eltype,
5327                                                      induction_index,
5328                                                      bitsize_int (el_size),
5329                                                      bitsize_int (off)));
5330           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5331           val = make_ssa_name (data_eltype);
5332           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5333                                              build3 (BIT_FIELD_REF,
5334                                                      data_eltype,
5335                                                      new_phi_result,
5336                                                      bitsize_int (el_size),
5337                                                      bitsize_int (off)));
5338           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5339           if (off != 0)
5340             {
5341               tree new_idx_val = idx_val;
5342               tree new_val = val;
5343               if (off != v_size - el_size)
5344                 {
5345                   new_idx_val = make_ssa_name (idx_eltype);
5346                   epilog_stmt = gimple_build_assign (new_idx_val,
5347                                                      MAX_EXPR, idx_val,
5348                                                      old_idx_val);
5349                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5350                 }
5351               new_val = make_ssa_name (data_eltype);
5352               epilog_stmt = gimple_build_assign (new_val,
5353                                                  COND_EXPR,
5354                                                  build2 (GT_EXPR,
5355                                                          boolean_type_node,
5356                                                          idx_val,
5357                                                          old_idx_val),
5358                                                  val, old_val);
5359               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5360               idx_val = new_idx_val;
5361               val = new_val;
5362             }
5363         }
5364       /* Convert the reduced value back to the result type and set as the
5365          result.  */
5366       gimple_seq stmts = NULL;
5367       val = gimple_convert (&stmts, scalar_type, val);
5368       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5369       scalar_results.safe_push (val);
5370     }
5371
5372   /* 2.3 Create the reduction code, using one of the three schemes described
5373          above. In SLP we simply need to extract all the elements from the
5374          vector (without reducing them), so we use scalar shifts.  */
5375   else if (reduc_fn != IFN_LAST && !slp_reduc)
5376     {
5377       tree tmp;
5378       tree vec_elem_type;
5379
5380       /* Case 1:  Create:
5381          v_out2 = reduc_expr <v_out1>  */
5382
5383       if (dump_enabled_p ())
5384         dump_printf_loc (MSG_NOTE, vect_location,
5385                          "Reduce using direct vector reduction.\n");
5386
5387       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5388       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5389         {
5390           tree tmp_dest
5391             = vect_create_destination_var (scalar_dest, vec_elem_type);
5392           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5393                                                     new_phi_result);
5394           gimple_set_lhs (epilog_stmt, tmp_dest);
5395           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5396           gimple_set_lhs (epilog_stmt, new_temp);
5397           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5398
5399           epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5400                                              new_temp);
5401         }
5402       else
5403         {
5404           epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5405                                                     new_phi_result);
5406           gimple_set_lhs (epilog_stmt, new_scalar_dest);
5407         }
5408
5409       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5410       gimple_set_lhs (epilog_stmt, new_temp);
5411       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5412
5413       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5414            == INTEGER_INDUC_COND_REDUCTION)
5415           && !operand_equal_p (initial_def, induc_val, 0))
5416         {
5417           /* Earlier we set the initial value to be a vector if induc_val
5418              values.  Check the result and if it is induc_val then replace
5419              with the original initial value, unless induc_val is
5420              the same as initial_def already.  */
5421           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5422                                   induc_val);
5423
5424           tmp = make_ssa_name (new_scalar_dest);
5425           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5426                                              initial_def, new_temp);
5427           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5428           new_temp = tmp;
5429         }
5430
5431       scalar_results.safe_push (new_temp);
5432     }
5433   else if (direct_slp_reduc)
5434     {
5435       /* Here we create one vector for each of the GROUP_SIZE results,
5436          with the elements for other SLP statements replaced with the
5437          neutral value.  We can then do a normal reduction on each vector.  */
5438
5439       /* Enforced by vectorizable_reduction.  */
5440       gcc_assert (new_phis.length () == 1);
5441       gcc_assert (pow2p_hwi (group_size));
5442
5443       slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5444       vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5445       gimple_seq seq = NULL;
5446
5447       /* Build a vector {0, 1, 2, ...}, with the same number of elements
5448          and the same element size as VECTYPE.  */
5449       tree index = build_index_vector (vectype, 0, 1);
5450       tree index_type = TREE_TYPE (index);
5451       tree index_elt_type = TREE_TYPE (index_type);
5452       tree mask_type = build_same_sized_truth_vector_type (index_type);
5453
5454       /* Create a vector that, for each element, identifies which of
5455          the GROUP_SIZE results should use it.  */
5456       tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5457       index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5458                             build_vector_from_val (index_type, index_mask));
5459
5460       /* Get a neutral vector value.  This is simply a splat of the neutral
5461          scalar value if we have one, otherwise the initial scalar value
5462          is itself a neutral value.  */
5463       tree vector_identity = NULL_TREE;
5464       if (neutral_op)
5465         vector_identity = gimple_build_vector_from_val (&seq, vectype,
5466                                                         neutral_op);
5467       for (unsigned int i = 0; i < group_size; ++i)
5468         {
5469           /* If there's no univeral neutral value, we can use the
5470              initial scalar value from the original PHI.  This is used
5471              for MIN and MAX reduction, for example.  */
5472           if (!neutral_op)
5473             {
5474               tree scalar_value
5475                 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5476                                          loop_preheader_edge (loop));
5477               vector_identity = gimple_build_vector_from_val (&seq, vectype,
5478                                                               scalar_value);
5479             }
5480
5481           /* Calculate the equivalent of:
5482
5483              sel[j] = (index[j] == i);
5484
5485              which selects the elements of NEW_PHI_RESULT that should
5486              be included in the result.  */
5487           tree compare_val = build_int_cst (index_elt_type, i);
5488           compare_val = build_vector_from_val (index_type, compare_val);
5489           tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5490                                    index, compare_val);
5491
5492           /* Calculate the equivalent of:
5493
5494              vec = seq ? new_phi_result : vector_identity;
5495
5496              VEC is now suitable for a full vector reduction.  */
5497           tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5498                                    sel, new_phi_result, vector_identity);
5499
5500           /* Do the reduction and convert it to the appropriate type.  */
5501           gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5502           tree scalar = make_ssa_name (TREE_TYPE (vectype));
5503           gimple_call_set_lhs (call, scalar);
5504           gimple_seq_add_stmt (&seq, call);
5505           scalar = gimple_convert (&seq, scalar_type, scalar);
5506           scalar_results.safe_push (scalar);
5507         }
5508       gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5509     }
5510   else
5511     {
5512       bool reduce_with_shift;
5513       tree vec_temp;
5514
5515       /* COND reductions all do the final reduction with MAX_EXPR
5516          or MIN_EXPR.  */
5517       if (code == COND_EXPR)
5518         {
5519           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5520               == INTEGER_INDUC_COND_REDUCTION)
5521             code = induc_code;
5522           else
5523             code = MAX_EXPR;
5524         }
5525
5526       /* See if the target wants to do the final (shift) reduction
5527          in a vector mode of smaller size and first reduce upper/lower
5528          halves against each other.  */
5529       enum machine_mode mode1 = mode;
5530       tree vectype1 = vectype;
5531       unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5532       unsigned sz1 = sz;
5533       if (!slp_reduc
5534           && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5535         sz1 = GET_MODE_SIZE (mode1).to_constant ();
5536
5537       vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5538       reduce_with_shift = have_whole_vector_shift (mode1);
5539       if (!VECTOR_MODE_P (mode1))
5540         reduce_with_shift = false;
5541       else
5542         {
5543           optab optab = optab_for_tree_code (code, vectype1, optab_default);
5544           if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5545             reduce_with_shift = false;
5546         }
5547
5548       /* First reduce the vector to the desired vector size we should
5549          do shift reduction on by combining upper and lower halves.  */
5550       new_temp = new_phi_result;
5551       while (sz > sz1)
5552         {
5553           gcc_assert (!slp_reduc);
5554           sz /= 2;
5555           vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5556
5557           /* The target has to make sure we support lowpart/highpart
5558              extraction, either via direct vector extract or through
5559              an integer mode punning.  */
5560           tree dst1, dst2;
5561           if (convert_optab_handler (vec_extract_optab,
5562                                      TYPE_MODE (TREE_TYPE (new_temp)),
5563                                      TYPE_MODE (vectype1))
5564               != CODE_FOR_nothing)
5565             {
5566               /* Extract sub-vectors directly once vec_extract becomes
5567                  a conversion optab.  */
5568               dst1 = make_ssa_name (vectype1);
5569               epilog_stmt
5570                   = gimple_build_assign (dst1, BIT_FIELD_REF,
5571                                          build3 (BIT_FIELD_REF, vectype1,
5572                                                  new_temp, TYPE_SIZE (vectype1),
5573                                                  bitsize_int (0)));
5574               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5575               dst2 =  make_ssa_name (vectype1);
5576               epilog_stmt
5577                   = gimple_build_assign (dst2, BIT_FIELD_REF,
5578                                          build3 (BIT_FIELD_REF, vectype1,
5579                                                  new_temp, TYPE_SIZE (vectype1),
5580                                                  bitsize_int (sz * BITS_PER_UNIT)));
5581               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5582             }
5583           else
5584             {
5585               /* Extract via punning to appropriately sized integer mode
5586                  vector.  */
5587               tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5588                                                             1);
5589               tree etype = build_vector_type (eltype, 2);
5590               gcc_assert (convert_optab_handler (vec_extract_optab,
5591                                                  TYPE_MODE (etype),
5592                                                  TYPE_MODE (eltype))
5593                           != CODE_FOR_nothing);
5594               tree tem = make_ssa_name (etype);
5595               epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5596                                                  build1 (VIEW_CONVERT_EXPR,
5597                                                          etype, new_temp));
5598               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5599               new_temp = tem;
5600               tem = make_ssa_name (eltype);
5601               epilog_stmt
5602                   = gimple_build_assign (tem, BIT_FIELD_REF,
5603                                          build3 (BIT_FIELD_REF, eltype,
5604                                                  new_temp, TYPE_SIZE (eltype),
5605                                                  bitsize_int (0)));
5606               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5607               dst1 = make_ssa_name (vectype1);
5608               epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5609                                                  build1 (VIEW_CONVERT_EXPR,
5610                                                          vectype1, tem));
5611               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5612               tem = make_ssa_name (eltype);
5613               epilog_stmt
5614                   = gimple_build_assign (tem, BIT_FIELD_REF,
5615                                          build3 (BIT_FIELD_REF, eltype,
5616                                                  new_temp, TYPE_SIZE (eltype),
5617                                                  bitsize_int (sz * BITS_PER_UNIT)));
5618               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5619               dst2 =  make_ssa_name (vectype1);
5620               epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5621                                                  build1 (VIEW_CONVERT_EXPR,
5622                                                          vectype1, tem));
5623               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5624             }
5625
5626           new_temp = make_ssa_name (vectype1);
5627           epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5628           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5629         }
5630
5631       if (reduce_with_shift && !slp_reduc)
5632         {
5633           int element_bitsize = tree_to_uhwi (bitsize);
5634           /* Enforced by vectorizable_reduction, which disallows SLP reductions
5635              for variable-length vectors and also requires direct target support
5636              for loop reductions.  */
5637           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5638           int nelements = vec_size_in_bits / element_bitsize;
5639           vec_perm_builder sel;
5640           vec_perm_indices indices;
5641
5642           int elt_offset;
5643
5644           tree zero_vec = build_zero_cst (vectype1);
5645           /* Case 2: Create:
5646              for (offset = nelements/2; offset >= 1; offset/=2)
5647                 {
5648                   Create:  va' = vec_shift <va, offset>
5649                   Create:  va = vop <va, va'>
5650                 }  */
5651
5652           tree rhs;
5653
5654           if (dump_enabled_p ())
5655             dump_printf_loc (MSG_NOTE, vect_location,
5656                              "Reduce using vector shifts\n");
5657
5658           mode1 = TYPE_MODE (vectype1);
5659           vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5660           for (elt_offset = nelements / 2;
5661                elt_offset >= 1;
5662                elt_offset /= 2)
5663             {
5664               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5665               indices.new_vector (sel, 2, nelements);
5666               tree mask = vect_gen_perm_mask_any (vectype1, indices);
5667               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5668                                                  new_temp, zero_vec, mask);
5669               new_name = make_ssa_name (vec_dest, epilog_stmt);
5670               gimple_assign_set_lhs (epilog_stmt, new_name);
5671               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5672
5673               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5674                                                  new_temp);
5675               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5676               gimple_assign_set_lhs (epilog_stmt, new_temp);
5677               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5678             }
5679
5680           /* 2.4  Extract the final scalar result.  Create:
5681              s_out3 = extract_field <v_out2, bitpos>  */
5682
5683           if (dump_enabled_p ())
5684             dump_printf_loc (MSG_NOTE, vect_location,
5685                              "extract scalar result\n");
5686
5687           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5688                         bitsize, bitsize_zero_node);
5689           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5690           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5691           gimple_assign_set_lhs (epilog_stmt, new_temp);
5692           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5693           scalar_results.safe_push (new_temp);
5694         }
5695       else
5696         {
5697           /* Case 3: Create:
5698              s = extract_field <v_out2, 0>
5699              for (offset = element_size;
5700                   offset < vector_size;
5701                   offset += element_size;)
5702                {
5703                  Create:  s' = extract_field <v_out2, offset>
5704                  Create:  s = op <s, s'>  // For non SLP cases
5705                }  */
5706
5707           if (dump_enabled_p ())
5708             dump_printf_loc (MSG_NOTE, vect_location,
5709                              "Reduce using scalar code.\n");
5710
5711           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5712           int element_bitsize = tree_to_uhwi (bitsize);
5713           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5714             {
5715               int bit_offset;
5716               if (gimple_code (new_phi) == GIMPLE_PHI)
5717                 vec_temp = PHI_RESULT (new_phi);
5718               else
5719                 vec_temp = gimple_assign_lhs (new_phi);
5720               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5721                                  bitsize_zero_node);
5722               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5723               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5724               gimple_assign_set_lhs (epilog_stmt, new_temp);
5725               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5726
5727               /* In SLP we don't need to apply reduction operation, so we just
5728                  collect s' values in SCALAR_RESULTS.  */
5729               if (slp_reduc)
5730                 scalar_results.safe_push (new_temp);
5731
5732               for (bit_offset = element_bitsize;
5733                    bit_offset < vec_size_in_bits;
5734                    bit_offset += element_bitsize)
5735                 {
5736                   tree bitpos = bitsize_int (bit_offset);
5737                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5738                                      bitsize, bitpos);
5739
5740                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5741                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5742                   gimple_assign_set_lhs (epilog_stmt, new_name);
5743                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5744
5745                   if (slp_reduc)
5746                     {
5747                       /* In SLP we don't need to apply reduction operation, so
5748                          we just collect s' values in SCALAR_RESULTS.  */
5749                       new_temp = new_name;
5750                       scalar_results.safe_push (new_name);
5751                     }
5752                   else
5753                     {
5754                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5755                                                          new_name, new_temp);
5756                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5757                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5758                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5759                     }
5760                 }
5761             }
5762
5763           /* The only case where we need to reduce scalar results in SLP, is
5764              unrolling.  If the size of SCALAR_RESULTS is greater than
5765              GROUP_SIZE, we reduce them combining elements modulo
5766              GROUP_SIZE.  */
5767           if (slp_reduc)
5768             {
5769               tree res, first_res, new_res;
5770               gimple *new_stmt;
5771
5772               /* Reduce multiple scalar results in case of SLP unrolling.  */
5773               for (j = group_size; scalar_results.iterate (j, &res);
5774                    j++)
5775                 {
5776                   first_res = scalar_results[j % group_size];
5777                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5778                                                   first_res, res);
5779                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5780                   gimple_assign_set_lhs (new_stmt, new_res);
5781                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5782                   scalar_results[j % group_size] = new_res;
5783                 }
5784             }
5785           else
5786             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5787             scalar_results.safe_push (new_temp);
5788         }
5789
5790       if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5791            == INTEGER_INDUC_COND_REDUCTION)
5792           && !operand_equal_p (initial_def, induc_val, 0))
5793         {
5794           /* Earlier we set the initial value to be a vector if induc_val
5795              values.  Check the result and if it is induc_val then replace
5796              with the original initial value, unless induc_val is
5797              the same as initial_def already.  */
5798           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5799                                   induc_val);
5800
5801           tree tmp = make_ssa_name (new_scalar_dest);
5802           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5803                                              initial_def, new_temp);
5804           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5805           scalar_results[0] = tmp;
5806         }
5807     }
5808
5809 vect_finalize_reduction:
5810
5811   if (double_reduc)
5812     loop = loop->inner;
5813
5814   /* 2.5 Adjust the final result by the initial value of the reduction
5815          variable. (When such adjustment is not needed, then
5816          'adjustment_def' is zero).  For example, if code is PLUS we create:
5817          new_temp = loop_exit_def + adjustment_def  */
5818
5819   if (adjustment_def)
5820     {
5821       gcc_assert (!slp_reduc);
5822       if (nested_in_vect_loop)
5823         {
5824           new_phi = new_phis[0];
5825           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5826           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5827           new_dest = vect_create_destination_var (scalar_dest, vectype);
5828         }
5829       else
5830         {
5831           new_temp = scalar_results[0];
5832           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5833           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5834           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5835         }
5836
5837       epilog_stmt = gimple_build_assign (new_dest, expr);
5838       new_temp = make_ssa_name (new_dest, epilog_stmt);
5839       gimple_assign_set_lhs (epilog_stmt, new_temp);
5840       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5841       if (nested_in_vect_loop)
5842         {
5843           set_vinfo_for_stmt (epilog_stmt,
5844                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5845           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5846                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5847
5848           if (!double_reduc)
5849             scalar_results.quick_push (new_temp);
5850           else
5851             scalar_results[0] = new_temp;
5852         }
5853       else
5854         scalar_results[0] = new_temp;
5855
5856       new_phis[0] = epilog_stmt;
5857     }
5858
5859   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5860           phis with new adjusted scalar results, i.e., replace use <s_out0>
5861           with use <s_out4>.
5862
5863      Transform:
5864         loop_exit:
5865           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5866           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5867           v_out2 = reduce <v_out1>
5868           s_out3 = extract_field <v_out2, 0>
5869           s_out4 = adjust_result <s_out3>
5870           use <s_out0>
5871           use <s_out0>
5872
5873      into:
5874
5875         loop_exit:
5876           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5877           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5878           v_out2 = reduce <v_out1>
5879           s_out3 = extract_field <v_out2, 0>
5880           s_out4 = adjust_result <s_out3>
5881           use <s_out4>
5882           use <s_out4> */
5883
5884
5885   /* In SLP reduction chain we reduce vector results into one vector if
5886      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5887      the last stmt in the reduction chain, since we are looking for the loop
5888      exit phi node.  */
5889   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5890     {
5891       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5892       /* Handle reduction patterns.  */
5893       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5894         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5895
5896       scalar_dest = gimple_assign_lhs (dest_stmt);
5897       group_size = 1;
5898     }
5899
5900   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5901      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5902      need to match SCALAR_RESULTS with corresponding statements.  The first
5903      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5904      the first vector stmt, etc.
5905      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5906   if (group_size > new_phis.length ())
5907     {
5908       ratio = group_size / new_phis.length ();
5909       gcc_assert (!(group_size % new_phis.length ()));
5910     }
5911   else
5912     ratio = 1;
5913
5914   for (k = 0; k < group_size; k++)
5915     {
5916       if (k % ratio == 0)
5917         {
5918           epilog_stmt = new_phis[k / ratio];
5919           reduction_phi = reduction_phis[k / ratio];
5920           if (double_reduc)
5921             inner_phi = inner_phis[k / ratio];
5922         }
5923
5924       if (slp_reduc)
5925         {
5926           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5927
5928           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5929           /* SLP statements can't participate in patterns.  */
5930           gcc_assert (!orig_stmt);
5931           scalar_dest = gimple_assign_lhs (current_stmt);
5932         }
5933
5934       phis.create (3);
5935       /* Find the loop-closed-use at the loop exit of the original scalar
5936          result.  (The reduction result is expected to have two immediate uses -
5937          one at the latch block, and one at the loop exit).  */
5938       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5939         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5940             && !is_gimple_debug (USE_STMT (use_p)))
5941           phis.safe_push (USE_STMT (use_p));
5942
5943       /* While we expect to have found an exit_phi because of loop-closed-ssa
5944          form we can end up without one if the scalar cycle is dead.  */
5945
5946       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5947         {
5948           if (outer_loop)
5949             {
5950               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5951               gphi *vect_phi;
5952
5953               /* FORNOW. Currently not supporting the case that an inner-loop
5954                  reduction is not used in the outer-loop (but only outside the
5955                  outer-loop), unless it is double reduction.  */
5956               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5957                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5958                           || double_reduc);
5959
5960               if (double_reduc)
5961                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5962               else
5963                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5964               if (!double_reduc
5965                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5966                       != vect_double_reduction_def)
5967                 continue;
5968
5969               /* Handle double reduction:
5970
5971                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5972                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5973                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5974                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5975
5976                  At that point the regular reduction (stmt2 and stmt3) is
5977                  already vectorized, as well as the exit phi node, stmt4.
5978                  Here we vectorize the phi node of double reduction, stmt1, and
5979                  update all relevant statements.  */
5980
5981               /* Go through all the uses of s2 to find double reduction phi
5982                  node, i.e., stmt1 above.  */
5983               orig_name = PHI_RESULT (exit_phi);
5984               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5985                 {
5986                   stmt_vec_info use_stmt_vinfo;
5987                   stmt_vec_info new_phi_vinfo;
5988                   tree vect_phi_init, preheader_arg, vect_phi_res;
5989                   basic_block bb = gimple_bb (use_stmt);
5990                   gimple *use;
5991
5992                   /* Check that USE_STMT is really double reduction phi
5993                      node.  */
5994                   if (gimple_code (use_stmt) != GIMPLE_PHI
5995                       || gimple_phi_num_args (use_stmt) != 2
5996                       || bb->loop_father != outer_loop)
5997                     continue;
5998                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5999                   if (!use_stmt_vinfo
6000                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
6001                           != vect_double_reduction_def)
6002                     continue;
6003
6004                   /* Create vector phi node for double reduction:
6005                      vs1 = phi <vs0, vs2>
6006                      vs1 was created previously in this function by a call to
6007                        vect_get_vec_def_for_operand and is stored in
6008                        vec_initial_def;
6009                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
6010                      vs0 is created here.  */
6011
6012                   /* Create vector phi node.  */
6013                   vect_phi = create_phi_node (vec_initial_def, bb);
6014                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
6015                                     loop_vec_info_for_loop (outer_loop));
6016                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
6017
6018                   /* Create vs0 - initial def of the double reduction phi.  */
6019                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
6020                                              loop_preheader_edge (outer_loop));
6021                   vect_phi_init = get_initial_def_for_reduction
6022                     (stmt, preheader_arg, NULL);
6023
6024                   /* Update phi node arguments with vs0 and vs2.  */
6025                   add_phi_arg (vect_phi, vect_phi_init,
6026                                loop_preheader_edge (outer_loop),
6027                                UNKNOWN_LOCATION);
6028                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
6029                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6030                   if (dump_enabled_p ())
6031                     {
6032                       dump_printf_loc (MSG_NOTE, vect_location,
6033                                        "created double reduction phi node: ");
6034                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6035                     }
6036
6037                   vect_phi_res = PHI_RESULT (vect_phi);
6038
6039                   /* Replace the use, i.e., set the correct vs1 in the regular
6040                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
6041                      loop is redundant.  */
6042                   use = reduction_phi;
6043                   for (j = 0; j < ncopies; j++)
6044                     {
6045                       edge pr_edge = loop_preheader_edge (loop);
6046                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6047                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6048                     }
6049                 }
6050             }
6051         }
6052
6053       phis.release ();
6054       if (nested_in_vect_loop)
6055         {
6056           if (double_reduc)
6057             loop = outer_loop;
6058           else
6059             continue;
6060         }
6061
6062       phis.create (3);
6063       /* Find the loop-closed-use at the loop exit of the original scalar
6064          result.  (The reduction result is expected to have two immediate uses,
6065          one at the latch block, and one at the loop exit).  For double
6066          reductions we are looking for exit phis of the outer loop.  */
6067       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6068         {
6069           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6070             {
6071               if (!is_gimple_debug (USE_STMT (use_p)))
6072                 phis.safe_push (USE_STMT (use_p));
6073             }
6074           else
6075             {
6076               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6077                 {
6078                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
6079
6080                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6081                     {
6082                       if (!flow_bb_inside_loop_p (loop,
6083                                              gimple_bb (USE_STMT (phi_use_p)))
6084                           && !is_gimple_debug (USE_STMT (phi_use_p)))
6085                         phis.safe_push (USE_STMT (phi_use_p));
6086                     }
6087                 }
6088             }
6089         }
6090
6091       FOR_EACH_VEC_ELT (phis, i, exit_phi)
6092         {
6093           /* Replace the uses:  */
6094           orig_name = PHI_RESULT (exit_phi);
6095           scalar_result = scalar_results[k];
6096           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6097             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6098               SET_USE (use_p, scalar_result);
6099         }
6100
6101       phis.release ();
6102     }
6103 }
6104
6105 /* Return a vector of type VECTYPE that is equal to the vector select
6106    operation "MASK ? VEC : IDENTITY".  Insert the select statements
6107    before GSI.  */
6108
6109 static tree
6110 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6111                      tree vec, tree identity)
6112 {
6113   tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6114   gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6115                                           mask, vec, identity);
6116   gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6117   return cond;
6118 }
6119
6120 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6121    order, starting with LHS.  Insert the extraction statements before GSI and
6122    associate the new scalar SSA names with variable SCALAR_DEST.
6123    Return the SSA name for the result.  */
6124
6125 static tree
6126 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6127                        tree_code code, tree lhs, tree vector_rhs)
6128 {
6129   tree vectype = TREE_TYPE (vector_rhs);
6130   tree scalar_type = TREE_TYPE (vectype);
6131   tree bitsize = TYPE_SIZE (scalar_type);
6132   unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6133   unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6134
6135   for (unsigned HOST_WIDE_INT bit_offset = 0;
6136        bit_offset < vec_size_in_bits;
6137        bit_offset += element_bitsize)
6138     {
6139       tree bitpos = bitsize_int (bit_offset);
6140       tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6141                          bitsize, bitpos);
6142
6143       gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6144       rhs = make_ssa_name (scalar_dest, stmt);
6145       gimple_assign_set_lhs (stmt, rhs);
6146       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6147
6148       stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6149       tree new_name = make_ssa_name (scalar_dest, stmt);
6150       gimple_assign_set_lhs (stmt, new_name);
6151       gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6152       lhs = new_name;
6153     }
6154   return lhs;
6155 }
6156
6157 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION).  STMT is the
6158    statement that sets the live-out value.  REDUC_DEF_STMT is the phi
6159    statement.  CODE is the operation performed by STMT and OPS are
6160    its scalar operands.  REDUC_INDEX is the index of the operand in
6161    OPS that is set by REDUC_DEF_STMT.  REDUC_FN is the function that
6162    implements in-order reduction, or IFN_LAST if we should open-code it.
6163    VECTYPE_IN is the type of the vector input.  MASKS specifies the masks
6164    that should be used to control the operation in a fully-masked loop.  */
6165
6166 static bool
6167 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6168                                gimple **vec_stmt, slp_tree slp_node,
6169                                gimple *reduc_def_stmt,
6170                                tree_code code, internal_fn reduc_fn,
6171                                tree ops[3], tree vectype_in,
6172                                int reduc_index, vec_loop_masks *masks)
6173 {
6174   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6175   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6176   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6177   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6178   gimple *new_stmt = NULL;
6179
6180   int ncopies;
6181   if (slp_node)
6182     ncopies = 1;
6183   else
6184     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6185
6186   gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6187   gcc_assert (ncopies == 1);
6188   gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6189   gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6190   gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6191               == FOLD_LEFT_REDUCTION);
6192
6193   if (slp_node)
6194     gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6195                           TYPE_VECTOR_SUBPARTS (vectype_in)));
6196
6197   tree op0 = ops[1 - reduc_index];
6198
6199   int group_size = 1;
6200   gimple *scalar_dest_def;
6201   auto_vec<tree> vec_oprnds0;
6202   if (slp_node)
6203     {
6204       vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6205       group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6206       scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6207     }
6208   else
6209     {
6210       tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6211       vec_oprnds0.create (1);
6212       vec_oprnds0.quick_push (loop_vec_def0);
6213       scalar_dest_def = stmt;
6214     }
6215
6216   tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6217   tree scalar_type = TREE_TYPE (scalar_dest);
6218   tree reduc_var = gimple_phi_result (reduc_def_stmt);
6219
6220   int vec_num = vec_oprnds0.length ();
6221   gcc_assert (vec_num == 1 || slp_node);
6222   tree vec_elem_type = TREE_TYPE (vectype_out);
6223   gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6224
6225   tree vector_identity = NULL_TREE;
6226   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6227     vector_identity = build_zero_cst (vectype_out);
6228
6229   tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6230   int i;
6231   tree def0;
6232   FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6233     {
6234       tree mask = NULL_TREE;
6235       if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6236         mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6237
6238       /* Handle MINUS by adding the negative.  */
6239       if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6240         {
6241           tree negated = make_ssa_name (vectype_out);
6242           new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6243           gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6244           def0 = negated;
6245         }
6246
6247       if (mask)
6248         def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6249                                     vector_identity);
6250
6251       /* On the first iteration the input is simply the scalar phi
6252          result, and for subsequent iterations it is the output of
6253          the preceding operation.  */
6254       if (reduc_fn != IFN_LAST)
6255         {
6256           new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6257           /* For chained SLP reductions the output of the previous reduction
6258              operation serves as the input of the next. For the final statement
6259              the output cannot be a temporary - we reuse the original
6260              scalar destination of the last statement.  */
6261           if (i != vec_num - 1)
6262             {
6263               gimple_set_lhs (new_stmt, scalar_dest_var);
6264               reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6265               gimple_set_lhs (new_stmt, reduc_var);
6266             }
6267         }
6268       else
6269         {
6270           reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6271                                              reduc_var, def0);
6272           new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6273           /* Remove the statement, so that we can use the same code paths
6274              as for statements that we've just created.  */
6275           gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6276           gsi_remove (&tmp_gsi, false);
6277         }
6278
6279       if (i == vec_num - 1)
6280         {
6281           gimple_set_lhs (new_stmt, scalar_dest);
6282           vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6283         }
6284       else
6285         vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6286
6287       if (slp_node)
6288         SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6289     }
6290
6291   if (!slp_node)
6292     STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6293
6294   return true;
6295 }
6296
6297 /* Function is_nonwrapping_integer_induction.
6298
6299    Check if STMT (which is part of loop LOOP) both increments and
6300    does not cause overflow.  */
6301
6302 static bool
6303 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6304 {
6305   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6306   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6307   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6308   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6309   widest_int ni, max_loop_value, lhs_max;
6310   bool overflow = false;
6311
6312   /* Make sure the loop is integer based.  */
6313   if (TREE_CODE (base) != INTEGER_CST
6314       || TREE_CODE (step) != INTEGER_CST)
6315     return false;
6316
6317   /* Check that the max size of the loop will not wrap.  */
6318
6319   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6320     return true;
6321
6322   if (! max_stmt_executions (loop, &ni))
6323     return false;
6324
6325   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6326                             &overflow);
6327   if (overflow)
6328     return false;
6329
6330   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6331                             TYPE_SIGN (lhs_type), &overflow);
6332   if (overflow)
6333     return false;
6334
6335   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6336           <= TYPE_PRECISION (lhs_type));
6337 }
6338
6339 /* Function vectorizable_reduction.
6340
6341    Check if STMT performs a reduction operation that can be vectorized.
6342    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6343    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6344    Return FALSE if not a vectorizable STMT, TRUE otherwise.
6345
6346    This function also handles reduction idioms (patterns) that have been
6347    recognized in advance during vect_pattern_recog.  In this case, STMT may be
6348    of this form:
6349      X = pattern_expr (arg0, arg1, ..., X)
6350    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
6351    sequence that had been detected and replaced by the pattern-stmt (STMT).
6352
6353    This function also handles reduction of condition expressions, for example:
6354      for (int i = 0; i < N; i++)
6355        if (a[i] < value)
6356          last = a[i];
6357    This is handled by vectorising the loop and creating an additional vector
6358    containing the loop indexes for which "a[i] < value" was true.  In the
6359    function epilogue this is reduced to a single max value and then used to
6360    index into the vector of results.
6361
6362    In some cases of reduction patterns, the type of the reduction variable X is
6363    different than the type of the other arguments of STMT.
6364    In such cases, the vectype that is used when transforming STMT into a vector
6365    stmt is different than the vectype that is used to determine the
6366    vectorization factor, because it consists of a different number of elements
6367    than the actual number of elements that are being operated upon in parallel.
6368
6369    For example, consider an accumulation of shorts into an int accumulator.
6370    On some targets it's possible to vectorize this pattern operating on 8
6371    shorts at a time (hence, the vectype for purposes of determining the
6372    vectorization factor should be V8HI); on the other hand, the vectype that
6373    is used to create the vector form is actually V4SI (the type of the result).
6374
6375    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6376    indicates what is the actual level of parallelism (V8HI in the example), so
6377    that the right vectorization factor would be derived.  This vectype
6378    corresponds to the type of arguments to the reduction stmt, and should *NOT*
6379    be used to create the vectorized stmt.  The right vectype for the vectorized
6380    stmt is obtained from the type of the result X:
6381         get_vectype_for_scalar_type (TREE_TYPE (X))
6382
6383    This means that, contrary to "regular" reductions (or "regular" stmts in
6384    general), the following equation:
6385       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6386    does *NOT* necessarily hold for reduction patterns.  */
6387
6388 bool
6389 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6390                         gimple **vec_stmt, slp_tree slp_node,
6391                         slp_instance slp_node_instance)
6392 {
6393   tree vec_dest;
6394   tree scalar_dest;
6395   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6396   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6397   tree vectype_in = NULL_TREE;
6398   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6399   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6400   enum tree_code code, orig_code;
6401   internal_fn reduc_fn;
6402   machine_mode vec_mode;
6403   int op_type;
6404   optab optab;
6405   tree new_temp = NULL_TREE;
6406   gimple *def_stmt;
6407   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6408   gimple *cond_reduc_def_stmt = NULL;
6409   enum tree_code cond_reduc_op_code = ERROR_MARK;
6410   tree scalar_type;
6411   bool is_simple_use;
6412   gimple *orig_stmt;
6413   stmt_vec_info orig_stmt_info = NULL;
6414   int i;
6415   int ncopies;
6416   int epilog_copies;
6417   stmt_vec_info prev_stmt_info, prev_phi_info;
6418   bool single_defuse_cycle = false;
6419   gimple *new_stmt = NULL;
6420   int j;
6421   tree ops[3];
6422   enum vect_def_type dts[3];
6423   bool nested_cycle = false, found_nested_cycle_def = false;
6424   bool double_reduc = false;
6425   basic_block def_bb;
6426   struct loop * def_stmt_loop, *outer_loop = NULL;
6427   tree def_arg;
6428   gimple *def_arg_stmt;
6429   auto_vec<tree> vec_oprnds0;
6430   auto_vec<tree> vec_oprnds1;
6431   auto_vec<tree> vec_oprnds2;
6432   auto_vec<tree> vect_defs;
6433   auto_vec<gimple *> phis;
6434   int vec_num;
6435   tree def0, tem;
6436   bool first_p = true;
6437   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6438   tree cond_reduc_val = NULL_TREE;
6439
6440   /* Make sure it was already recognized as a reduction computation.  */
6441   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6442       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6443     return false;
6444
6445   if (nested_in_vect_loop_p (loop, stmt))
6446     {
6447       outer_loop = loop;
6448       loop = loop->inner;
6449       nested_cycle = true;
6450     }
6451
6452   /* In case of reduction chain we switch to the first stmt in the chain, but
6453      we don't update STMT_INFO, since only the last stmt is marked as reduction
6454      and has reduction properties.  */
6455   if (GROUP_FIRST_ELEMENT (stmt_info)
6456       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6457     {
6458       stmt = GROUP_FIRST_ELEMENT (stmt_info);
6459       first_p = false;
6460     }
6461
6462   if (gimple_code (stmt) == GIMPLE_PHI)
6463     {
6464       /* Analysis is fully done on the reduction stmt invocation.  */
6465       if (! vec_stmt)
6466         {
6467           if (slp_node)
6468             slp_node_instance->reduc_phis = slp_node;
6469
6470           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6471           return true;
6472         }
6473
6474       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6475         /* Leave the scalar phi in place.  Note that checking
6476            STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6477            for reductions involving a single statement.  */
6478         return true;
6479
6480       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6481       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6482         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6483
6484       if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6485           == EXTRACT_LAST_REDUCTION)
6486         /* Leave the scalar phi in place.  */
6487         return true;
6488
6489       gcc_assert (is_gimple_assign (reduc_stmt));
6490       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6491         {
6492           tree op = gimple_op (reduc_stmt, k);
6493           if (op == gimple_phi_result (stmt))
6494             continue;
6495           if (k == 1
6496               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6497             continue;
6498           if (!vectype_in
6499               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6500                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6501             vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6502           break;
6503         }
6504       gcc_assert (vectype_in);
6505
6506       if (slp_node)
6507         ncopies = 1;
6508       else
6509         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6510
6511       use_operand_p use_p;
6512       gimple *use_stmt;
6513       if (ncopies > 1
6514           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6515               <= vect_used_only_live)
6516           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6517           && (use_stmt == reduc_stmt
6518               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6519                   == reduc_stmt)))
6520         single_defuse_cycle = true;
6521
6522       /* Create the destination vector  */
6523       scalar_dest = gimple_assign_lhs (reduc_stmt);
6524       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6525
6526       if (slp_node)
6527         /* The size vect_schedule_slp_instance computes is off for us.  */
6528         vec_num = vect_get_num_vectors
6529           (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6530            * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6531            vectype_in);
6532       else
6533         vec_num = 1;
6534
6535       /* Generate the reduction PHIs upfront.  */
6536       prev_phi_info = NULL;
6537       for (j = 0; j < ncopies; j++)
6538         {
6539           if (j == 0 || !single_defuse_cycle)
6540             {
6541               for (i = 0; i < vec_num; i++)
6542                 {
6543                   /* Create the reduction-phi that defines the reduction
6544                      operand.  */
6545                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
6546                   set_vinfo_for_stmt (new_phi,
6547                                       new_stmt_vec_info (new_phi, loop_vinfo));
6548
6549                   if (slp_node)
6550                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6551                   else
6552                     {
6553                       if (j == 0)
6554                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6555                       else
6556                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6557                       prev_phi_info = vinfo_for_stmt (new_phi);
6558                     }
6559                 }
6560             }
6561         }
6562
6563       return true;
6564     }
6565
6566   /* 1. Is vectorizable reduction?  */
6567   /* Not supportable if the reduction variable is used in the loop, unless
6568      it's a reduction chain.  */
6569   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6570       && !GROUP_FIRST_ELEMENT (stmt_info))
6571     return false;
6572
6573   /* Reductions that are not used even in an enclosing outer-loop,
6574      are expected to be "live" (used out of the loop).  */
6575   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6576       && !STMT_VINFO_LIVE_P (stmt_info))
6577     return false;
6578
6579   /* 2. Has this been recognized as a reduction pattern?
6580
6581      Check if STMT represents a pattern that has been recognized
6582      in earlier analysis stages.  For stmts that represent a pattern,
6583      the STMT_VINFO_RELATED_STMT field records the last stmt in
6584      the original sequence that constitutes the pattern.  */
6585
6586   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6587   if (orig_stmt)
6588     {
6589       orig_stmt_info = vinfo_for_stmt (orig_stmt);
6590       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6591       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6592     }
6593
6594   /* 3. Check the operands of the operation.  The first operands are defined
6595         inside the loop body. The last operand is the reduction variable,
6596         which is defined by the loop-header-phi.  */
6597
6598   gcc_assert (is_gimple_assign (stmt));
6599
6600   /* Flatten RHS.  */
6601   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6602     {
6603     case GIMPLE_BINARY_RHS:
6604       code = gimple_assign_rhs_code (stmt);
6605       op_type = TREE_CODE_LENGTH (code);
6606       gcc_assert (op_type == binary_op);
6607       ops[0] = gimple_assign_rhs1 (stmt);
6608       ops[1] = gimple_assign_rhs2 (stmt);
6609       break;
6610
6611     case GIMPLE_TERNARY_RHS:
6612       code = gimple_assign_rhs_code (stmt);
6613       op_type = TREE_CODE_LENGTH (code);
6614       gcc_assert (op_type == ternary_op);
6615       ops[0] = gimple_assign_rhs1 (stmt);
6616       ops[1] = gimple_assign_rhs2 (stmt);
6617       ops[2] = gimple_assign_rhs3 (stmt);
6618       break;
6619
6620     case GIMPLE_UNARY_RHS:
6621       return false;
6622
6623     default:
6624       gcc_unreachable ();
6625     }
6626
6627   if (code == COND_EXPR && slp_node)
6628     return false;
6629
6630   scalar_dest = gimple_assign_lhs (stmt);
6631   scalar_type = TREE_TYPE (scalar_dest);
6632   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6633       && !SCALAR_FLOAT_TYPE_P (scalar_type))
6634     return false;
6635
6636   /* Do not try to vectorize bit-precision reductions.  */
6637   if (!type_has_mode_precision_p (scalar_type))
6638     return false;
6639
6640   /* All uses but the last are expected to be defined in the loop.
6641      The last use is the reduction variable.  In case of nested cycle this
6642      assumption is not true: we use reduc_index to record the index of the
6643      reduction variable.  */
6644   gimple *reduc_def_stmt = NULL;
6645   int reduc_index = -1;
6646   for (i = 0; i < op_type; i++)
6647     {
6648       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
6649       if (i == 0 && code == COND_EXPR)
6650         continue;
6651
6652       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6653                                           &def_stmt, &dts[i], &tem);
6654       dt = dts[i];
6655       gcc_assert (is_simple_use);
6656       if (dt == vect_reduction_def)
6657         {
6658           reduc_def_stmt = def_stmt;
6659           reduc_index = i;
6660           continue;
6661         }
6662       else if (tem)
6663         {
6664           /* To properly compute ncopies we are interested in the widest
6665              input type in case we're looking at a widening accumulation.  */
6666           if (!vectype_in
6667               || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6668                   < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6669             vectype_in = tem;
6670         }
6671
6672       if (dt != vect_internal_def
6673           && dt != vect_external_def
6674           && dt != vect_constant_def
6675           && dt != vect_induction_def
6676           && !(dt == vect_nested_cycle && nested_cycle))
6677         return false;
6678
6679       if (dt == vect_nested_cycle)
6680         {
6681           found_nested_cycle_def = true;
6682           reduc_def_stmt = def_stmt;
6683           reduc_index = i;
6684         }
6685
6686       if (i == 1 && code == COND_EXPR)
6687         {
6688           /* Record how value of COND_EXPR is defined.  */
6689           if (dt == vect_constant_def)
6690             {
6691               cond_reduc_dt = dt;
6692               cond_reduc_val = ops[i];
6693             }
6694           if (dt == vect_induction_def
6695               && def_stmt != NULL
6696               && is_nonwrapping_integer_induction (def_stmt, loop))
6697             {
6698               cond_reduc_dt = dt;
6699               cond_reduc_def_stmt = def_stmt;
6700             }
6701         }
6702     }
6703
6704   if (!vectype_in)
6705     vectype_in = vectype_out;
6706
6707   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6708      directy used in stmt.  */
6709   if (reduc_index == -1)
6710     {
6711       if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6712         {
6713           if (dump_enabled_p ())
6714             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6715                              "in-order reduction chain without SLP.\n");
6716           return false;
6717         }
6718
6719       if (orig_stmt)
6720         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6721       else
6722         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6723     }
6724
6725   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6726     return false;
6727
6728   if (!(reduc_index == -1
6729         || dts[reduc_index] == vect_reduction_def
6730         || dts[reduc_index] == vect_nested_cycle
6731         || ((dts[reduc_index] == vect_internal_def
6732              || dts[reduc_index] == vect_external_def
6733              || dts[reduc_index] == vect_constant_def
6734              || dts[reduc_index] == vect_induction_def)
6735             && nested_cycle && found_nested_cycle_def)))
6736     {
6737       /* For pattern recognized stmts, orig_stmt might be a reduction,
6738          but some helper statements for the pattern might not, or
6739          might be COND_EXPRs with reduction uses in the condition.  */
6740       gcc_assert (orig_stmt);
6741       return false;
6742     }
6743
6744   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6745   enum vect_reduction_type v_reduc_type
6746     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6747   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6748
6749   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6750   /* If we have a condition reduction, see if we can simplify it further.  */
6751   if (v_reduc_type == COND_REDUCTION)
6752     {
6753       /* Loop peeling modifies initial value of reduction PHI, which
6754          makes the reduction stmt to be transformed different to the
6755          original stmt analyzed.  We need to record reduction code for
6756          CONST_COND_REDUCTION type reduction at analyzing stage, thus
6757          it can be used directly at transform stage.  */
6758       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6759           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6760         {
6761           /* Also set the reduction type to CONST_COND_REDUCTION.  */
6762           gcc_assert (cond_reduc_dt == vect_constant_def);
6763           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6764         }
6765       else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6766                                                vectype_in, OPTIMIZE_FOR_SPEED))
6767         {
6768           if (dump_enabled_p ())
6769             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6770                              "optimizing condition reduction with"
6771                              " FOLD_EXTRACT_LAST.\n");
6772           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6773         }
6774       else if (cond_reduc_dt == vect_induction_def)
6775         {
6776           stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6777           tree base
6778             = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6779           tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6780
6781           gcc_assert (TREE_CODE (base) == INTEGER_CST
6782                       && TREE_CODE (step) == INTEGER_CST);
6783           cond_reduc_val = NULL_TREE;
6784           /* Find a suitable value, for MAX_EXPR below base, for MIN_EXPR
6785              above base; punt if base is the minimum value of the type for
6786              MAX_EXPR or maximum value of the type for MIN_EXPR for now.  */
6787           if (tree_int_cst_sgn (step) == -1)
6788             {
6789               cond_reduc_op_code = MIN_EXPR;
6790               if (tree_int_cst_sgn (base) == -1)
6791                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6792               else if (tree_int_cst_lt (base,
6793                                         TYPE_MAX_VALUE (TREE_TYPE (base))))
6794                 cond_reduc_val
6795                   = int_const_binop (PLUS_EXPR, base, integer_one_node);
6796             }
6797           else
6798             {
6799               cond_reduc_op_code = MAX_EXPR;
6800               if (tree_int_cst_sgn (base) == 1)
6801                 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6802               else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6803                                         base))
6804                 cond_reduc_val
6805                   = int_const_binop (MINUS_EXPR, base, integer_one_node);
6806             }
6807           if (cond_reduc_val)
6808             {
6809               if (dump_enabled_p ())
6810                 dump_printf_loc (MSG_NOTE, vect_location,
6811                                  "condition expression based on "
6812                                  "integer induction.\n");
6813               STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6814                 = INTEGER_INDUC_COND_REDUCTION;
6815             }
6816         }
6817       else if (cond_reduc_dt == vect_constant_def)
6818         {
6819           enum vect_def_type cond_initial_dt;
6820           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6821           tree cond_initial_val
6822             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6823
6824           gcc_assert (cond_reduc_val != NULL_TREE);
6825           vect_is_simple_use (cond_initial_val, loop_vinfo,
6826                               &def_stmt, &cond_initial_dt);
6827           if (cond_initial_dt == vect_constant_def
6828               && types_compatible_p (TREE_TYPE (cond_initial_val),
6829                                      TREE_TYPE (cond_reduc_val)))
6830             {
6831               tree e = fold_binary (LE_EXPR, boolean_type_node,
6832                                     cond_initial_val, cond_reduc_val);
6833               if (e && (integer_onep (e) || integer_zerop (e)))
6834                 {
6835                   if (dump_enabled_p ())
6836                     dump_printf_loc (MSG_NOTE, vect_location,
6837                                      "condition expression based on "
6838                                      "compile time constant.\n");
6839                   /* Record reduction code at analysis stage.  */
6840                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6841                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6842                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6843                     = CONST_COND_REDUCTION;
6844                 }
6845             }
6846         }
6847     }
6848
6849   if (orig_stmt)
6850     gcc_assert (tmp == orig_stmt
6851                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6852   else
6853     /* We changed STMT to be the first stmt in reduction chain, hence we
6854        check that in this case the first element in the chain is STMT.  */
6855     gcc_assert (stmt == tmp
6856                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6857
6858   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6859     return false;
6860
6861   if (slp_node)
6862     ncopies = 1;
6863   else
6864     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6865
6866   gcc_assert (ncopies >= 1);
6867
6868   vec_mode = TYPE_MODE (vectype_in);
6869   poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6870
6871   if (code == COND_EXPR)
6872     {
6873       /* Only call during the analysis stage, otherwise we'll lose
6874          STMT_VINFO_TYPE.  */
6875       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6876                                                 ops[reduc_index], 0, NULL))
6877         {
6878           if (dump_enabled_p ())
6879             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6880                              "unsupported condition in reduction\n");
6881           return false;
6882         }
6883     }
6884   else
6885     {
6886       /* 4. Supportable by target?  */
6887
6888       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6889           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6890         {
6891           /* Shifts and rotates are only supported by vectorizable_shifts,
6892              not vectorizable_reduction.  */
6893           if (dump_enabled_p ())
6894             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6895                              "unsupported shift or rotation.\n");
6896           return false;
6897         }
6898
6899       /* 4.1. check support for the operation in the loop  */
6900       optab = optab_for_tree_code (code, vectype_in, optab_default);
6901       if (!optab)
6902         {
6903           if (dump_enabled_p ())
6904             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6905                              "no optab.\n");
6906
6907           return false;
6908         }
6909
6910       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6911         {
6912           if (dump_enabled_p ())
6913             dump_printf (MSG_NOTE, "op not supported by target.\n");
6914
6915           if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6916               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6917             return false;
6918
6919           if (dump_enabled_p ())
6920             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6921         }
6922
6923       /* Worthwhile without SIMD support?  */
6924       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6925           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6926         {
6927           if (dump_enabled_p ())
6928             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6929                              "not worthwhile without SIMD support.\n");
6930
6931           return false;
6932         }
6933     }
6934
6935   /* 4.2. Check support for the epilog operation.
6936
6937           If STMT represents a reduction pattern, then the type of the
6938           reduction variable may be different than the type of the rest
6939           of the arguments.  For example, consider the case of accumulation
6940           of shorts into an int accumulator; The original code:
6941                         S1: int_a = (int) short_a;
6942           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6943
6944           was replaced with:
6945                         STMT: int_acc = widen_sum <short_a, int_acc>
6946
6947           This means that:
6948           1. The tree-code that is used to create the vector operation in the
6949              epilog code (that reduces the partial results) is not the
6950              tree-code of STMT, but is rather the tree-code of the original
6951              stmt from the pattern that STMT is replacing.  I.e, in the example
6952              above we want to use 'widen_sum' in the loop, but 'plus' in the
6953              epilog.
6954           2. The type (mode) we use to check available target support
6955              for the vector operation to be created in the *epilog*, is
6956              determined by the type of the reduction variable (in the example
6957              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6958              However the type (mode) we use to check available target support
6959              for the vector operation to be created *inside the loop*, is
6960              determined by the type of the other arguments to STMT (in the
6961              example we'd check this: optab_handler (widen_sum_optab,
6962              vect_short_mode)).
6963
6964           This is contrary to "regular" reductions, in which the types of all
6965           the arguments are the same as the type of the reduction variable.
6966           For "regular" reductions we can therefore use the same vector type
6967           (and also the same tree-code) when generating the epilog code and
6968           when generating the code inside the loop.  */
6969
6970   vect_reduction_type reduction_type
6971     = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
6972   if (orig_stmt
6973       && (reduction_type == TREE_CODE_REDUCTION
6974           || reduction_type == FOLD_LEFT_REDUCTION))
6975     {
6976       /* This is a reduction pattern: get the vectype from the type of the
6977          reduction variable, and get the tree-code from orig_stmt.  */
6978       orig_code = gimple_assign_rhs_code (orig_stmt);
6979       gcc_assert (vectype_out);
6980       vec_mode = TYPE_MODE (vectype_out);
6981     }
6982   else
6983     {
6984       /* Regular reduction: use the same vectype and tree-code as used for
6985          the vector code inside the loop can be used for the epilog code. */
6986       orig_code = code;
6987
6988       if (code == MINUS_EXPR)
6989         orig_code = PLUS_EXPR;
6990
6991       /* For simple condition reductions, replace with the actual expression
6992          we want to base our reduction around.  */
6993       if (reduction_type == CONST_COND_REDUCTION)
6994         {
6995           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6996           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6997         }
6998       else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
6999         orig_code = cond_reduc_op_code;
7000     }
7001
7002   if (nested_cycle)
7003     {
7004       def_bb = gimple_bb (reduc_def_stmt);
7005       def_stmt_loop = def_bb->loop_father;
7006       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
7007                                        loop_preheader_edge (def_stmt_loop));
7008       if (TREE_CODE (def_arg) == SSA_NAME
7009           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
7010           && gimple_code (def_arg_stmt) == GIMPLE_PHI
7011           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
7012           && vinfo_for_stmt (def_arg_stmt)
7013           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
7014               == vect_double_reduction_def)
7015         double_reduc = true;
7016     }
7017
7018   reduc_fn = IFN_LAST;
7019
7020   if (reduction_type == TREE_CODE_REDUCTION
7021       || reduction_type == FOLD_LEFT_REDUCTION
7022       || reduction_type == INTEGER_INDUC_COND_REDUCTION
7023       || reduction_type == CONST_COND_REDUCTION)
7024     {
7025       if (reduction_type == FOLD_LEFT_REDUCTION
7026           ? fold_left_reduction_fn (orig_code, &reduc_fn)
7027           : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7028         {
7029           if (reduc_fn != IFN_LAST
7030               && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7031                                                   OPTIMIZE_FOR_SPEED))
7032             {
7033               if (dump_enabled_p ())
7034                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7035                                  "reduc op not supported by target.\n");
7036
7037               reduc_fn = IFN_LAST;
7038             }
7039         }
7040       else
7041         {
7042           if (!nested_cycle || double_reduc)
7043             {
7044               if (dump_enabled_p ())
7045                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7046                                  "no reduc code for scalar code.\n");
7047
7048               return false;
7049             }
7050         }
7051     }
7052   else if (reduction_type == COND_REDUCTION)
7053     {
7054       int scalar_precision
7055         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7056       cr_index_scalar_type = make_unsigned_type (scalar_precision);
7057       cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7058                                                 nunits_out);
7059
7060       if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7061                                           OPTIMIZE_FOR_SPEED))
7062         reduc_fn = IFN_REDUC_MAX;
7063     }
7064
7065   if (reduction_type != EXTRACT_LAST_REDUCTION
7066       && reduc_fn == IFN_LAST
7067       && !nunits_out.is_constant ())
7068     {
7069       if (dump_enabled_p ())
7070         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7071                          "missing target support for reduction on"
7072                          " variable-length vectors.\n");
7073       return false;
7074     }
7075
7076   if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7077       && ncopies > 1)
7078     {
7079       if (dump_enabled_p ())
7080         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7081                          "multiple types in double reduction or condition "
7082                          "reduction.\n");
7083       return false;
7084     }
7085
7086   /* For SLP reductions, see if there is a neutral value we can use.  */
7087   tree neutral_op = NULL_TREE;
7088   if (slp_node)
7089     neutral_op
7090       = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7091                                       GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7092
7093   if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7094     {
7095       /* We can't support in-order reductions of code such as this:
7096
7097            for (int i = 0; i < n1; ++i)
7098              for (int j = 0; j < n2; ++j)
7099                l += a[j];
7100
7101          since GCC effectively transforms the loop when vectorizing:
7102
7103            for (int i = 0; i < n1 / VF; ++i)
7104              for (int j = 0; j < n2; ++j)
7105                for (int k = 0; k < VF; ++k)
7106                  l += a[j];
7107
7108          which is a reassociation of the original operation.  */
7109       if (dump_enabled_p ())
7110         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7111                          "in-order double reduction not supported.\n");
7112
7113       return false;
7114     }
7115
7116   if (reduction_type == FOLD_LEFT_REDUCTION
7117       && slp_node
7118       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7119     {
7120       /* We cannot use in-order reductions in this case because there is
7121          an implicit reassociation of the operations involved.  */
7122       if (dump_enabled_p ())
7123         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7124                          "in-order unchained SLP reductions not supported.\n");
7125       return false;
7126     }
7127
7128   /* For double reductions, and for SLP reductions with a neutral value,
7129      we construct a variable-length initial vector by loading a vector
7130      full of the neutral value and then shift-and-inserting the start
7131      values into the low-numbered elements.  */
7132   if ((double_reduc || neutral_op)
7133       && !nunits_out.is_constant ()
7134       && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7135                                           vectype_out, OPTIMIZE_FOR_SPEED))
7136     {
7137       if (dump_enabled_p ())
7138         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7139                          "reduction on variable-length vectors requires"
7140                          " target support for a vector-shift-and-insert"
7141                          " operation.\n");
7142       return false;
7143     }
7144
7145   /* Check extra constraints for variable-length unchained SLP reductions.  */
7146   if (STMT_SLP_TYPE (stmt_info)
7147       && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7148       && !nunits_out.is_constant ())
7149     {
7150       /* We checked above that we could build the initial vector when
7151          there's a neutral element value.  Check here for the case in
7152          which each SLP statement has its own initial value and in which
7153          that value needs to be repeated for every instance of the
7154          statement within the initial vector.  */
7155       unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7156       scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7157       if (!neutral_op
7158           && !can_duplicate_and_interleave_p (group_size, elt_mode))
7159         {
7160           if (dump_enabled_p ())
7161             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7162                              "unsupported form of SLP reduction for"
7163                              " variable-length vectors: cannot build"
7164                              " initial vector.\n");
7165           return false;
7166         }
7167       /* The epilogue code relies on the number of elements being a multiple
7168          of the group size.  The duplicate-and-interleave approach to setting
7169          up the the initial vector does too.  */
7170       if (!multiple_p (nunits_out, group_size))
7171         {
7172           if (dump_enabled_p ())
7173             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7174                              "unsupported form of SLP reduction for"
7175                              " variable-length vectors: the vector size"
7176                              " is not a multiple of the number of results.\n");
7177           return false;
7178         }
7179     }
7180
7181   /* In case of widenning multiplication by a constant, we update the type
7182      of the constant to be the type of the other operand.  We check that the
7183      constant fits the type in the pattern recognition pass.  */
7184   if (code == DOT_PROD_EXPR
7185       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7186     {
7187       if (TREE_CODE (ops[0]) == INTEGER_CST)
7188         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7189       else if (TREE_CODE (ops[1]) == INTEGER_CST)
7190         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7191       else
7192         {
7193           if (dump_enabled_p ())
7194             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7195                              "invalid types in dot-prod\n");
7196
7197           return false;
7198         }
7199     }
7200
7201   if (reduction_type == COND_REDUCTION)
7202     {
7203       widest_int ni;
7204
7205       if (! max_loop_iterations (loop, &ni))
7206         {
7207           if (dump_enabled_p ())
7208             dump_printf_loc (MSG_NOTE, vect_location,
7209                              "loop count not known, cannot create cond "
7210                              "reduction.\n");
7211           return false;
7212         }
7213       /* Convert backedges to iterations.  */
7214       ni += 1;
7215
7216       /* The additional index will be the same type as the condition.  Check
7217          that the loop can fit into this less one (because we'll use up the
7218          zero slot for when there are no matches).  */
7219       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7220       if (wi::geu_p (ni, wi::to_widest (max_index)))
7221         {
7222           if (dump_enabled_p ())
7223             dump_printf_loc (MSG_NOTE, vect_location,
7224                              "loop size is greater than data size.\n");
7225           return false;
7226         }
7227     }
7228
7229   /* In case the vectorization factor (VF) is bigger than the number
7230      of elements that we can fit in a vectype (nunits), we have to generate
7231      more than one vector stmt - i.e - we need to "unroll" the
7232      vector stmt by a factor VF/nunits.  For more details see documentation
7233      in vectorizable_operation.  */
7234
7235   /* If the reduction is used in an outer loop we need to generate
7236      VF intermediate results, like so (e.g. for ncopies=2):
7237         r0 = phi (init, r0)
7238         r1 = phi (init, r1)
7239         r0 = x0 + r0;
7240         r1 = x1 + r1;
7241     (i.e. we generate VF results in 2 registers).
7242     In this case we have a separate def-use cycle for each copy, and therefore
7243     for each copy we get the vector def for the reduction variable from the
7244     respective phi node created for this copy.
7245
7246     Otherwise (the reduction is unused in the loop nest), we can combine
7247     together intermediate results, like so (e.g. for ncopies=2):
7248         r = phi (init, r)
7249         r = x0 + r;
7250         r = x1 + r;
7251    (i.e. we generate VF/2 results in a single register).
7252    In this case for each copy we get the vector def for the reduction variable
7253    from the vectorized reduction operation generated in the previous iteration.
7254
7255    This only works when we see both the reduction PHI and its only consumer
7256    in vectorizable_reduction and there are no intermediate stmts
7257    participating.  */
7258   use_operand_p use_p;
7259   gimple *use_stmt;
7260   if (ncopies > 1
7261       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7262       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7263       && (use_stmt == stmt
7264           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7265     {
7266       single_defuse_cycle = true;
7267       epilog_copies = 1;
7268     }
7269   else
7270     epilog_copies = ncopies;
7271
7272   /* If the reduction stmt is one of the patterns that have lane
7273      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
7274   if ((ncopies > 1
7275        && ! single_defuse_cycle)
7276       && (code == DOT_PROD_EXPR
7277           || code == WIDEN_SUM_EXPR
7278           || code == SAD_EXPR))
7279     {
7280       if (dump_enabled_p ())
7281         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7282                          "multi def-use cycle not possible for lane-reducing "
7283                          "reduction operation\n");
7284       return false;
7285     }
7286
7287   if (slp_node)
7288     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7289   else
7290     vec_num = 1;
7291
7292   internal_fn cond_fn = get_conditional_internal_fn (code);
7293   vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7294
7295   if (!vec_stmt) /* transformation not required.  */
7296     {
7297       if (first_p)
7298         vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7299       if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7300         {
7301           if (reduction_type != FOLD_LEFT_REDUCTION
7302               && (cond_fn == IFN_LAST
7303                   || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7304                                                       OPTIMIZE_FOR_SPEED)))
7305             {
7306               if (dump_enabled_p ())
7307                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7308                                  "can't use a fully-masked loop because no"
7309                                  " conditional operation is available.\n");
7310               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7311             }
7312           else if (reduc_index == -1)
7313             {
7314               if (dump_enabled_p ())
7315                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7316                                  "can't use a fully-masked loop for chained"
7317                                  " reductions.\n");
7318               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7319             }
7320           else
7321             vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7322                                    vectype_in);
7323         }
7324       if (dump_enabled_p ()
7325           && reduction_type == FOLD_LEFT_REDUCTION)
7326         dump_printf_loc (MSG_NOTE, vect_location,
7327                          "using an in-order (fold-left) reduction.\n");
7328       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7329       return true;
7330     }
7331
7332   /* Transform.  */
7333
7334   if (dump_enabled_p ())
7335     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7336
7337   /* FORNOW: Multiple types are not supported for condition.  */
7338   if (code == COND_EXPR)
7339     gcc_assert (ncopies == 1);
7340
7341   bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7342
7343   if (reduction_type == FOLD_LEFT_REDUCTION)
7344     return vectorize_fold_left_reduction
7345       (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7346        reduc_fn, ops, vectype_in, reduc_index, masks);
7347
7348   if (reduction_type == EXTRACT_LAST_REDUCTION)
7349     {
7350       gcc_assert (!slp_node);
7351       return vectorizable_condition (stmt, gsi, vec_stmt,
7352                                      NULL, reduc_index, NULL);
7353     }
7354
7355   /* Create the destination vector  */
7356   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7357
7358   prev_stmt_info = NULL;
7359   prev_phi_info = NULL;
7360   if (!slp_node)
7361     {
7362       vec_oprnds0.create (1);
7363       vec_oprnds1.create (1);
7364       if (op_type == ternary_op)
7365         vec_oprnds2.create (1);
7366     }
7367
7368   phis.create (vec_num);
7369   vect_defs.create (vec_num);
7370   if (!slp_node)
7371     vect_defs.quick_push (NULL_TREE);
7372
7373   if (slp_node)
7374     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7375   else
7376     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7377
7378   for (j = 0; j < ncopies; j++)
7379     {
7380       if (code == COND_EXPR)
7381         {
7382           gcc_assert (!slp_node);
7383           vectorizable_condition (stmt, gsi, vec_stmt,
7384                                   PHI_RESULT (phis[0]),
7385                                   reduc_index, NULL);
7386           /* Multiple types are not supported for condition.  */
7387           break;
7388         }
7389
7390       /* Handle uses.  */
7391       if (j == 0)
7392         {
7393           if (slp_node)
7394             {
7395               /* Get vec defs for all the operands except the reduction index,
7396                  ensuring the ordering of the ops in the vector is kept.  */
7397               auto_vec<tree, 3> slp_ops;
7398               auto_vec<vec<tree>, 3> vec_defs;
7399
7400               slp_ops.quick_push (ops[0]);
7401               slp_ops.quick_push (ops[1]);
7402               if (op_type == ternary_op)
7403                 slp_ops.quick_push (ops[2]);
7404
7405               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7406
7407               vec_oprnds0.safe_splice (vec_defs[0]);
7408               vec_defs[0].release ();
7409               vec_oprnds1.safe_splice (vec_defs[1]);
7410               vec_defs[1].release ();
7411               if (op_type == ternary_op)
7412                 {
7413                   vec_oprnds2.safe_splice (vec_defs[2]);
7414                   vec_defs[2].release ();
7415                 }
7416             }
7417           else
7418             {
7419               vec_oprnds0.quick_push
7420                 (vect_get_vec_def_for_operand (ops[0], stmt));
7421               vec_oprnds1.quick_push
7422                 (vect_get_vec_def_for_operand (ops[1], stmt));
7423               if (op_type == ternary_op)
7424                 vec_oprnds2.quick_push
7425                   (vect_get_vec_def_for_operand (ops[2], stmt));
7426             }
7427         }
7428       else
7429         {
7430           if (!slp_node)
7431             {
7432               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7433
7434               if (single_defuse_cycle && reduc_index == 0)
7435                 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7436               else
7437                 vec_oprnds0[0]
7438                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7439               if (single_defuse_cycle && reduc_index == 1)
7440                 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7441               else
7442                 vec_oprnds1[0]
7443                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7444               if (op_type == ternary_op)
7445                 {
7446                   if (single_defuse_cycle && reduc_index == 2)
7447                     vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7448                   else
7449                     vec_oprnds2[0]
7450                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7451                 }
7452             }
7453         }
7454
7455       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7456         {
7457           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7458           if (masked_loop_p)
7459             {
7460               /* Make sure that the reduction accumulator is vop[0].  */
7461               if (reduc_index == 1)
7462                 {
7463                   gcc_assert (commutative_tree_code (code));
7464                   std::swap (vop[0], vop[1]);
7465                 }
7466               tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7467                                               vectype_in, i * ncopies + j);
7468               gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7469                                                         vop[0], vop[1]);
7470               new_temp = make_ssa_name (vec_dest, call);
7471               gimple_call_set_lhs (call, new_temp);
7472               gimple_call_set_nothrow (call, true);
7473               new_stmt = call;
7474             }
7475           else
7476             {
7477               if (op_type == ternary_op)
7478                 vop[2] = vec_oprnds2[i];
7479
7480               new_temp = make_ssa_name (vec_dest, new_stmt);
7481               new_stmt = gimple_build_assign (new_temp, code,
7482                                               vop[0], vop[1], vop[2]);
7483             }
7484           vect_finish_stmt_generation (stmt, new_stmt, gsi);
7485
7486           if (slp_node)
7487             {
7488               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7489               vect_defs.quick_push (new_temp);
7490             }
7491           else
7492             vect_defs[0] = new_temp;
7493         }
7494
7495       if (slp_node)
7496         continue;
7497
7498       if (j == 0)
7499         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7500       else
7501         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7502
7503       prev_stmt_info = vinfo_for_stmt (new_stmt);
7504     }
7505
7506   /* Finalize the reduction-phi (set its arguments) and create the
7507      epilog reduction code.  */
7508   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7509     vect_defs[0] = gimple_get_lhs (*vec_stmt);
7510
7511   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7512                                     epilog_copies, reduc_fn, phis,
7513                                     double_reduc, slp_node, slp_node_instance,
7514                                     cond_reduc_val, cond_reduc_op_code,
7515                                     neutral_op);
7516
7517   return true;
7518 }
7519
7520 /* Function vect_min_worthwhile_factor.
7521
7522    For a loop where we could vectorize the operation indicated by CODE,
7523    return the minimum vectorization factor that makes it worthwhile
7524    to use generic vectors.  */
7525 static unsigned int
7526 vect_min_worthwhile_factor (enum tree_code code)
7527 {
7528   switch (code)
7529     {
7530     case PLUS_EXPR:
7531     case MINUS_EXPR:
7532     case NEGATE_EXPR:
7533       return 4;
7534
7535     case BIT_AND_EXPR:
7536     case BIT_IOR_EXPR:
7537     case BIT_XOR_EXPR:
7538     case BIT_NOT_EXPR:
7539       return 2;
7540
7541     default:
7542       return INT_MAX;
7543     }
7544 }
7545
7546 /* Return true if VINFO indicates we are doing loop vectorization and if
7547    it is worth decomposing CODE operations into scalar operations for
7548    that loop's vectorization factor.  */
7549
7550 bool
7551 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7552 {
7553   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7554   unsigned HOST_WIDE_INT value;
7555   return (loop_vinfo
7556           && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7557           && value >= vect_min_worthwhile_factor (code));
7558 }
7559
7560 /* Function vectorizable_induction
7561
7562    Check if PHI performs an induction computation that can be vectorized.
7563    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7564    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7565    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
7566
7567 bool
7568 vectorizable_induction (gimple *phi,
7569                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7570                         gimple **vec_stmt, slp_tree slp_node)
7571 {
7572   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7573   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7574   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7575   unsigned ncopies;
7576   bool nested_in_vect_loop = false;
7577   struct loop *iv_loop;
7578   tree vec_def;
7579   edge pe = loop_preheader_edge (loop);
7580   basic_block new_bb;
7581   tree new_vec, vec_init, vec_step, t;
7582   tree new_name;
7583   gimple *new_stmt;
7584   gphi *induction_phi;
7585   tree induc_def, vec_dest;
7586   tree init_expr, step_expr;
7587   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7588   unsigned i;
7589   tree expr;
7590   gimple_seq stmts;
7591   imm_use_iterator imm_iter;
7592   use_operand_p use_p;
7593   gimple *exit_phi;
7594   edge latch_e;
7595   tree loop_arg;
7596   gimple_stmt_iterator si;
7597   basic_block bb = gimple_bb (phi);
7598
7599   if (gimple_code (phi) != GIMPLE_PHI)
7600     return false;
7601
7602   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7603     return false;
7604
7605   /* Make sure it was recognized as induction computation.  */
7606   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7607     return false;
7608
7609   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7610   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7611
7612   if (slp_node)
7613     ncopies = 1;
7614   else
7615     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7616   gcc_assert (ncopies >= 1);
7617
7618   /* FORNOW. These restrictions should be relaxed.  */
7619   if (nested_in_vect_loop_p (loop, phi))
7620     {
7621       imm_use_iterator imm_iter;
7622       use_operand_p use_p;
7623       gimple *exit_phi;
7624       edge latch_e;
7625       tree loop_arg;
7626
7627       if (ncopies > 1)
7628         {
7629           if (dump_enabled_p ())
7630             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7631                              "multiple types in nested loop.\n");
7632           return false;
7633         }
7634
7635       /* FORNOW: outer loop induction with SLP not supported.  */
7636       if (STMT_SLP_TYPE (stmt_info))
7637         return false;
7638
7639       exit_phi = NULL;
7640       latch_e = loop_latch_edge (loop->inner);
7641       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7642       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7643         {
7644           gimple *use_stmt = USE_STMT (use_p);
7645           if (is_gimple_debug (use_stmt))
7646             continue;
7647
7648           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7649             {
7650               exit_phi = use_stmt;
7651               break;
7652             }
7653         }
7654       if (exit_phi)
7655         {
7656           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
7657           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7658                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7659             {
7660               if (dump_enabled_p ())
7661                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7662                                  "inner-loop induction only used outside "
7663                                  "of the outer vectorized loop.\n");
7664               return false;
7665             }
7666         }
7667
7668       nested_in_vect_loop = true;
7669       iv_loop = loop->inner;
7670     }
7671   else
7672     iv_loop = loop;
7673   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7674
7675   if (slp_node && !nunits.is_constant ())
7676     {
7677       /* The current SLP code creates the initial value element-by-element.  */
7678       if (dump_enabled_p ())
7679         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7680                          "SLP induction not supported for variable-length"
7681                          " vectors.\n");
7682       return false;
7683     }
7684
7685   if (!vec_stmt) /* transformation not required.  */
7686     {
7687       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7688       if (dump_enabled_p ())
7689         dump_printf_loc (MSG_NOTE, vect_location,
7690                          "=== vectorizable_induction ===\n");
7691       vect_model_induction_cost (stmt_info, ncopies);
7692       return true;
7693     }
7694
7695   /* Transform.  */
7696
7697   /* Compute a vector variable, initialized with the first VF values of
7698      the induction variable.  E.g., for an iv with IV_PHI='X' and
7699      evolution S, for a vector of 4 units, we want to compute:
7700      [X, X + S, X + 2*S, X + 3*S].  */
7701
7702   if (dump_enabled_p ())
7703     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7704
7705   latch_e = loop_latch_edge (iv_loop);
7706   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7707
7708   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7709   gcc_assert (step_expr != NULL_TREE);
7710
7711   pe = loop_preheader_edge (iv_loop);
7712   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7713                                      loop_preheader_edge (iv_loop));
7714
7715   stmts = NULL;
7716   if (!nested_in_vect_loop)
7717     {
7718       /* Convert the initial value to the desired type.  */
7719       tree new_type = TREE_TYPE (vectype);
7720       init_expr = gimple_convert (&stmts, new_type, init_expr);
7721
7722       /* If we are using the loop mask to "peel" for alignment then we need
7723          to adjust the start value here.  */
7724       tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7725       if (skip_niters != NULL_TREE)
7726         {
7727           if (FLOAT_TYPE_P (vectype))
7728             skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7729                                         skip_niters);
7730           else
7731             skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7732           tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7733                                          skip_niters, step_expr);
7734           init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7735                                     init_expr, skip_step);
7736         }
7737     }
7738
7739   /* Convert the step to the desired type.  */
7740   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7741
7742   if (stmts)
7743     {
7744       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7745       gcc_assert (!new_bb);
7746     }
7747
7748   /* Find the first insertion point in the BB.  */
7749   si = gsi_after_labels (bb);
7750
7751   /* For SLP induction we have to generate several IVs as for example
7752      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7753      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
7754      [VF*S, VF*S, VF*S, VF*S] for all.  */
7755   if (slp_node)
7756     {
7757       /* Enforced above.  */
7758       unsigned int const_nunits = nunits.to_constant ();
7759
7760       /* Generate [VF*S, VF*S, ... ].  */
7761       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7762         {
7763           expr = build_int_cst (integer_type_node, vf);
7764           expr = fold_convert (TREE_TYPE (step_expr), expr);
7765         }
7766       else
7767         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7768       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7769                               expr, step_expr);
7770       if (! CONSTANT_CLASS_P (new_name))
7771         new_name = vect_init_vector (phi, new_name,
7772                                      TREE_TYPE (step_expr), NULL);
7773       new_vec = build_vector_from_val (vectype, new_name);
7774       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7775
7776       /* Now generate the IVs.  */
7777       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7778       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7779       unsigned elts = const_nunits * nvects;
7780       unsigned nivs = least_common_multiple (group_size,
7781                                              const_nunits) / const_nunits;
7782       gcc_assert (elts % group_size == 0);
7783       tree elt = init_expr;
7784       unsigned ivn;
7785       for (ivn = 0; ivn < nivs; ++ivn)
7786         {
7787           tree_vector_builder elts (vectype, const_nunits, 1);
7788           stmts = NULL;
7789           for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7790             {
7791               if (ivn*const_nunits + eltn >= group_size
7792                   && (ivn * const_nunits + eltn) % group_size == 0)
7793                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7794                                     elt, step_expr);
7795               elts.quick_push (elt);
7796             }
7797           vec_init = gimple_build_vector (&stmts, &elts);
7798           if (stmts)
7799             {
7800               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7801               gcc_assert (!new_bb);
7802             }
7803
7804           /* Create the induction-phi that defines the induction-operand.  */
7805           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7806           induction_phi = create_phi_node (vec_dest, iv_loop->header);
7807           set_vinfo_for_stmt (induction_phi,
7808                               new_stmt_vec_info (induction_phi, loop_vinfo));
7809           induc_def = PHI_RESULT (induction_phi);
7810
7811           /* Create the iv update inside the loop  */
7812           vec_def = make_ssa_name (vec_dest);
7813           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7814           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7815           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7816
7817           /* Set the arguments of the phi node:  */
7818           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7819           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7820                        UNKNOWN_LOCATION);
7821
7822           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7823         }
7824
7825       /* Re-use IVs when we can.  */
7826       if (ivn < nvects)
7827         {
7828           unsigned vfp
7829             = least_common_multiple (group_size, const_nunits) / group_size;
7830           /* Generate [VF'*S, VF'*S, ... ].  */
7831           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7832             {
7833               expr = build_int_cst (integer_type_node, vfp);
7834               expr = fold_convert (TREE_TYPE (step_expr), expr);
7835             }
7836           else
7837             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7838           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7839                                   expr, step_expr);
7840           if (! CONSTANT_CLASS_P (new_name))
7841             new_name = vect_init_vector (phi, new_name,
7842                                          TREE_TYPE (step_expr), NULL);
7843           new_vec = build_vector_from_val (vectype, new_name);
7844           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7845           for (; ivn < nvects; ++ivn)
7846             {
7847               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7848               tree def;
7849               if (gimple_code (iv) == GIMPLE_PHI)
7850                 def = gimple_phi_result (iv);
7851               else
7852                 def = gimple_assign_lhs (iv);
7853               new_stmt = gimple_build_assign (make_ssa_name (vectype),
7854                                               PLUS_EXPR,
7855                                               def, vec_step);
7856               if (gimple_code (iv) == GIMPLE_PHI)
7857                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7858               else
7859                 {
7860                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7861                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7862                 }
7863               set_vinfo_for_stmt (new_stmt,
7864                                   new_stmt_vec_info (new_stmt, loop_vinfo));
7865               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7866             }
7867         }
7868
7869       return true;
7870     }
7871
7872   /* Create the vector that holds the initial_value of the induction.  */
7873   if (nested_in_vect_loop)
7874     {
7875       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
7876          been created during vectorization of previous stmts.  We obtain it
7877          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
7878       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7879       /* If the initial value is not of proper type, convert it.  */
7880       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7881         {
7882           new_stmt
7883             = gimple_build_assign (vect_get_new_ssa_name (vectype,
7884                                                           vect_simple_var,
7885                                                           "vec_iv_"),
7886                                    VIEW_CONVERT_EXPR,
7887                                    build1 (VIEW_CONVERT_EXPR, vectype,
7888                                            vec_init));
7889           vec_init = gimple_assign_lhs (new_stmt);
7890           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7891                                                  new_stmt);
7892           gcc_assert (!new_bb);
7893           set_vinfo_for_stmt (new_stmt,
7894                               new_stmt_vec_info (new_stmt, loop_vinfo));
7895         }
7896     }
7897   else
7898     {
7899       /* iv_loop is the loop to be vectorized. Create:
7900          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
7901       stmts = NULL;
7902       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7903
7904       unsigned HOST_WIDE_INT const_nunits;
7905       if (nunits.is_constant (&const_nunits))
7906         {
7907           tree_vector_builder elts (vectype, const_nunits, 1);
7908           elts.quick_push (new_name);
7909           for (i = 1; i < const_nunits; i++)
7910             {
7911               /* Create: new_name_i = new_name + step_expr  */
7912               new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7913                                        new_name, step_expr);
7914               elts.quick_push (new_name);
7915             }
7916           /* Create a vector from [new_name_0, new_name_1, ...,
7917              new_name_nunits-1]  */
7918           vec_init = gimple_build_vector (&stmts, &elts);
7919         }
7920       else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7921         /* Build the initial value directly from a VEC_SERIES_EXPR.  */
7922         vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7923                                  new_name, step_expr);
7924       else
7925         {
7926           /* Build:
7927                 [base, base, base, ...]
7928                 + (vectype) [0, 1, 2, ...] * [step, step, step, ...].  */
7929           gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7930           gcc_assert (flag_associative_math);
7931           tree index = build_index_vector (vectype, 0, 1);
7932           tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7933                                                         new_name);
7934           tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7935                                                         step_expr);
7936           vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
7937           vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
7938                                    vec_init, step_vec);
7939           vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
7940                                    vec_init, base_vec);
7941         }
7942
7943       if (stmts)
7944         {
7945           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7946           gcc_assert (!new_bb);
7947         }
7948     }
7949
7950
7951   /* Create the vector that holds the step of the induction.  */
7952   if (nested_in_vect_loop)
7953     /* iv_loop is nested in the loop to be vectorized. Generate:
7954        vec_step = [S, S, S, S]  */
7955     new_name = step_expr;
7956   else
7957     {
7958       /* iv_loop is the loop to be vectorized. Generate:
7959           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
7960       gimple_seq seq = NULL;
7961       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7962         {
7963           expr = build_int_cst (integer_type_node, vf);
7964           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
7965         }
7966       else
7967         expr = build_int_cst (TREE_TYPE (step_expr), vf);
7968       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
7969                                expr, step_expr);
7970       if (seq)
7971         {
7972           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
7973           gcc_assert (!new_bb);
7974         }
7975     }
7976
7977   t = unshare_expr (new_name);
7978   gcc_assert (CONSTANT_CLASS_P (new_name)
7979               || TREE_CODE (new_name) == SSA_NAME);
7980   new_vec = build_vector_from_val (vectype, t);
7981   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7982
7983
7984   /* Create the following def-use cycle:
7985      loop prolog:
7986          vec_init = ...
7987          vec_step = ...
7988      loop:
7989          vec_iv = PHI <vec_init, vec_loop>
7990          ...
7991          STMT
7992          ...
7993          vec_loop = vec_iv + vec_step;  */
7994
7995   /* Create the induction-phi that defines the induction-operand.  */
7996   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7997   induction_phi = create_phi_node (vec_dest, iv_loop->header);
7998   set_vinfo_for_stmt (induction_phi,
7999                       new_stmt_vec_info (induction_phi, loop_vinfo));
8000   induc_def = PHI_RESULT (induction_phi);
8001
8002   /* Create the iv update inside the loop  */
8003   vec_def = make_ssa_name (vec_dest);
8004   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
8005   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8006   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
8007
8008   /* Set the arguments of the phi node:  */
8009   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8010   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8011                UNKNOWN_LOCATION);
8012
8013   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
8014
8015   /* In case that vectorization factor (VF) is bigger than the number
8016      of elements that we can fit in a vectype (nunits), we have to generate
8017      more than one vector stmt - i.e - we need to "unroll" the
8018      vector stmt by a factor VF/nunits.  For more details see documentation
8019      in vectorizable_operation.  */
8020
8021   if (ncopies > 1)
8022     {
8023       gimple_seq seq = NULL;
8024       stmt_vec_info prev_stmt_vinfo;
8025       /* FORNOW. This restriction should be relaxed.  */
8026       gcc_assert (!nested_in_vect_loop);
8027
8028       /* Create the vector that holds the step of the induction.  */
8029       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8030         {
8031           expr = build_int_cst (integer_type_node, nunits);
8032           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8033         }
8034       else
8035         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8036       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8037                                expr, step_expr);
8038       if (seq)
8039         {
8040           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8041           gcc_assert (!new_bb);
8042         }
8043
8044       t = unshare_expr (new_name);
8045       gcc_assert (CONSTANT_CLASS_P (new_name)
8046                   || TREE_CODE (new_name) == SSA_NAME);
8047       new_vec = build_vector_from_val (vectype, t);
8048       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8049
8050       vec_def = induc_def;
8051       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8052       for (i = 1; i < ncopies; i++)
8053         {
8054           /* vec_i = vec_prev + vec_step  */
8055           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8056                                           vec_def, vec_step);
8057           vec_def = make_ssa_name (vec_dest, new_stmt);
8058           gimple_assign_set_lhs (new_stmt, vec_def);
8059
8060           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8061           set_vinfo_for_stmt (new_stmt,
8062                               new_stmt_vec_info (new_stmt, loop_vinfo));
8063           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8064           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8065         }
8066     }
8067
8068   if (nested_in_vect_loop)
8069     {
8070       /* Find the loop-closed exit-phi of the induction, and record
8071          the final vector of induction results:  */
8072       exit_phi = NULL;
8073       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8074         {
8075           gimple *use_stmt = USE_STMT (use_p);
8076           if (is_gimple_debug (use_stmt))
8077             continue;
8078
8079           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8080             {
8081               exit_phi = use_stmt;
8082               break;
8083             }
8084         }
8085       if (exit_phi)
8086         {
8087           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8088           /* FORNOW. Currently not supporting the case that an inner-loop induction
8089              is not used in the outer-loop (i.e. only outside the outer-loop).  */
8090           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8091                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
8092
8093           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8094           if (dump_enabled_p ())
8095             {
8096               dump_printf_loc (MSG_NOTE, vect_location,
8097                                "vector of inductions after inner-loop:");
8098               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8099             }
8100         }
8101     }
8102
8103
8104   if (dump_enabled_p ())
8105     {
8106       dump_printf_loc (MSG_NOTE, vect_location,
8107                        "transform induction: created def-use cycle: ");
8108       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8109       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8110                         SSA_NAME_DEF_STMT (vec_def), 0);
8111     }
8112
8113   return true;
8114 }
8115
8116 /* Function vectorizable_live_operation.
8117
8118    STMT computes a value that is used outside the loop.  Check if
8119    it can be supported.  */
8120
8121 bool
8122 vectorizable_live_operation (gimple *stmt,
8123                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8124                              slp_tree slp_node, int slp_index,
8125                              gimple **vec_stmt)
8126 {
8127   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8128   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8129   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8130   imm_use_iterator imm_iter;
8131   tree lhs, lhs_type, bitsize, vec_bitsize;
8132   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8133   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8134   int ncopies;
8135   gimple *use_stmt;
8136   auto_vec<tree> vec_oprnds;
8137   int vec_entry = 0;
8138   poly_uint64 vec_index = 0;
8139
8140   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8141
8142   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8143     return false;
8144
8145   /* FORNOW.  CHECKME.  */
8146   if (nested_in_vect_loop_p (loop, stmt))
8147     return false;
8148
8149   /* If STMT is not relevant and it is a simple assignment and its inputs are
8150      invariant then it can remain in place, unvectorized.  The original last
8151      scalar value that it computes will be used.  */
8152   if (!STMT_VINFO_RELEVANT_P (stmt_info))
8153     {
8154       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8155       if (dump_enabled_p ())
8156         dump_printf_loc (MSG_NOTE, vect_location,
8157                          "statement is simple and uses invariant.  Leaving in "
8158                          "place.\n");
8159       return true;
8160     }
8161
8162   if (slp_node)
8163     ncopies = 1;
8164   else
8165     ncopies = vect_get_num_copies (loop_vinfo, vectype);
8166
8167   if (slp_node)
8168     {
8169       gcc_assert (slp_index >= 0);
8170
8171       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8172       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8173
8174       /* Get the last occurrence of the scalar index from the concatenation of
8175          all the slp vectors. Calculate which slp vector it is and the index
8176          within.  */
8177       poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8178
8179       /* Calculate which vector contains the result, and which lane of
8180          that vector we need.  */
8181       if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8182         {
8183           if (dump_enabled_p ())
8184             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8185                              "Cannot determine which vector holds the"
8186                              " final result.\n");
8187           return false;
8188         }
8189     }
8190
8191   if (!vec_stmt)
8192     {
8193       /* No transformation required.  */
8194       if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8195         {
8196           if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8197                                                OPTIMIZE_FOR_SPEED))
8198             {
8199               if (dump_enabled_p ())
8200                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8201                                  "can't use a fully-masked loop because "
8202                                  "the target doesn't support extract last "
8203                                  "reduction.\n");
8204               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8205             }
8206           else if (slp_node)
8207             {
8208               if (dump_enabled_p ())
8209                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8210                                  "can't use a fully-masked loop because an "
8211                                  "SLP statement is live after the loop.\n");
8212               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8213             }
8214           else if (ncopies > 1)
8215             {
8216               if (dump_enabled_p ())
8217                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8218                                  "can't use a fully-masked loop because"
8219                                  " ncopies is greater than 1.\n");
8220               LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8221             }
8222           else
8223             {
8224               gcc_assert (ncopies == 1 && !slp_node);
8225               vect_record_loop_mask (loop_vinfo,
8226                                      &LOOP_VINFO_MASKS (loop_vinfo),
8227                                      1, vectype);
8228             }
8229         }
8230       return true;
8231     }
8232
8233   /* If stmt has a related stmt, then use that for getting the lhs.  */
8234   if (is_pattern_stmt_p (stmt_info))
8235     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8236
8237   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8238         : gimple_get_lhs (stmt);
8239   lhs_type = TREE_TYPE (lhs);
8240
8241   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8242              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8243              : TYPE_SIZE (TREE_TYPE (vectype)));
8244   vec_bitsize = TYPE_SIZE (vectype);
8245
8246   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
8247   tree vec_lhs, bitstart;
8248   if (slp_node)
8249     {
8250       gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8251
8252       /* Get the correct slp vectorized stmt.  */
8253       gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8254       if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8255         vec_lhs = gimple_phi_result (phi);
8256       else
8257         vec_lhs = gimple_get_lhs (vec_stmt);
8258
8259       /* Get entry to use.  */
8260       bitstart = bitsize_int (vec_index);
8261       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8262     }
8263   else
8264     {
8265       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8266       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8267       gcc_checking_assert (ncopies == 1
8268                            || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8269
8270       /* For multiple copies, get the last copy.  */
8271       for (int i = 1; i < ncopies; ++i)
8272         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8273                                                   vec_lhs);
8274
8275       /* Get the last lane in the vector.  */
8276       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8277     }
8278
8279   gimple_seq stmts = NULL;
8280   tree new_tree;
8281   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8282     {
8283       /* Emit:
8284
8285            SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8286
8287          where VEC_LHS is the vectorized live-out result and MASK is
8288          the loop mask for the final iteration.  */
8289       gcc_assert (ncopies == 1 && !slp_node);
8290       tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8291       tree scalar_res = make_ssa_name (scalar_type);
8292       tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8293                                       1, vectype, 0);
8294       gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8295                                                     2, mask, vec_lhs);
8296       gimple_call_set_lhs (new_stmt, scalar_res);
8297       gimple_seq_add_stmt (&stmts, new_stmt);
8298
8299       /* Convert the extracted vector element to the required scalar type.  */
8300       new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8301     }
8302   else
8303     {
8304       tree bftype = TREE_TYPE (vectype);
8305       if (VECTOR_BOOLEAN_TYPE_P (vectype))
8306         bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8307       new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8308       new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8309                                        &stmts, true, NULL_TREE);
8310     }
8311
8312   if (stmts)
8313     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8314
8315   /* Replace use of lhs with newly computed result.  If the use stmt is a
8316      single arg PHI, just replace all uses of PHI result.  It's necessary
8317      because lcssa PHI defining lhs may be before newly inserted stmt.  */
8318   use_operand_p use_p;
8319   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8320     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8321         && !is_gimple_debug (use_stmt))
8322     {
8323       if (gimple_code (use_stmt) == GIMPLE_PHI
8324           && gimple_phi_num_args (use_stmt) == 1)
8325         {
8326           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8327         }
8328       else
8329         {
8330           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8331             SET_USE (use_p, new_tree);
8332         }
8333       update_stmt (use_stmt);
8334     }
8335
8336   return true;
8337 }
8338
8339 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
8340
8341 static void
8342 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8343 {
8344   ssa_op_iter op_iter;
8345   imm_use_iterator imm_iter;
8346   def_operand_p def_p;
8347   gimple *ustmt;
8348
8349   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8350     {
8351       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8352         {
8353           basic_block bb;
8354
8355           if (!is_gimple_debug (ustmt))
8356             continue;
8357
8358           bb = gimple_bb (ustmt);
8359
8360           if (!flow_bb_inside_loop_p (loop, bb))
8361             {
8362               if (gimple_debug_bind_p (ustmt))
8363                 {
8364                   if (dump_enabled_p ())
8365                     dump_printf_loc (MSG_NOTE, vect_location,
8366                                      "killing debug use\n");
8367
8368                   gimple_debug_bind_reset_value (ustmt);
8369                   update_stmt (ustmt);
8370                 }
8371               else
8372                 gcc_unreachable ();
8373             }
8374         }
8375     }
8376 }
8377
8378 /* Given loop represented by LOOP_VINFO, return true if computation of
8379    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8380    otherwise.  */
8381
8382 static bool
8383 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8384 {
8385   /* Constant case.  */
8386   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8387     {
8388       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8389       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8390
8391       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8392       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8393       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8394         return true;
8395     }
8396
8397   widest_int max;
8398   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8399   /* Check the upper bound of loop niters.  */
8400   if (get_max_loop_iterations (loop, &max))
8401     {
8402       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8403       signop sgn = TYPE_SIGN (type);
8404       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8405       if (max < type_max)
8406         return true;
8407     }
8408   return false;
8409 }
8410
8411 /* Return a mask type with half the number of elements as TYPE.  */
8412
8413 tree
8414 vect_halve_mask_nunits (tree type)
8415 {
8416   poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8417   return build_truth_vector_type (nunits, current_vector_size);
8418 }
8419
8420 /* Return a mask type with twice as many elements as TYPE.  */
8421
8422 tree
8423 vect_double_mask_nunits (tree type)
8424 {
8425   poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8426   return build_truth_vector_type (nunits, current_vector_size);
8427 }
8428
8429 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8430    contain a sequence of NVECTORS masks that each control a vector of type
8431    VECTYPE.  */
8432
8433 void
8434 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8435                        unsigned int nvectors, tree vectype)
8436 {
8437   gcc_assert (nvectors != 0);
8438   if (masks->length () < nvectors)
8439     masks->safe_grow_cleared (nvectors);
8440   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8441   /* The number of scalars per iteration and the number of vectors are
8442      both compile-time constants.  */
8443   unsigned int nscalars_per_iter
8444     = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8445                  LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
8446   if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8447     {
8448       rgm->max_nscalars_per_iter = nscalars_per_iter;
8449       rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8450     }
8451 }
8452
8453 /* Given a complete set of masks MASKS, extract mask number INDEX
8454    for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8455    where 0 <= INDEX < NVECTORS.  Insert any set-up statements before GSI.
8456
8457    See the comment above vec_loop_masks for more details about the mask
8458    arrangement.  */
8459
8460 tree
8461 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8462                     unsigned int nvectors, tree vectype, unsigned int index)
8463 {
8464   rgroup_masks *rgm = &(*masks)[nvectors - 1];
8465   tree mask_type = rgm->mask_type;
8466
8467   /* Populate the rgroup's mask array, if this is the first time we've
8468      used it.  */
8469   if (rgm->masks.is_empty ())
8470     {
8471       rgm->masks.safe_grow_cleared (nvectors);
8472       for (unsigned int i = 0; i < nvectors; ++i)
8473         {
8474           tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8475           /* Provide a dummy definition until the real one is available.  */
8476           SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8477           rgm->masks[i] = mask;
8478         }
8479     }
8480
8481   tree mask = rgm->masks[index];
8482   if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8483                 TYPE_VECTOR_SUBPARTS (vectype)))
8484     {
8485       /* A loop mask for data type X can be reused for data type Y
8486          if X has N times more elements than Y and if Y's elements
8487          are N times bigger than X's.  In this case each sequence
8488          of N elements in the loop mask will be all-zero or all-one.
8489          We can then view-convert the mask so that each sequence of
8490          N elements is replaced by a single element.  */
8491       gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8492                               TYPE_VECTOR_SUBPARTS (vectype)));
8493       gimple_seq seq = NULL;
8494       mask_type = build_same_sized_truth_vector_type (vectype);
8495       mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8496       if (seq)
8497         gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8498     }
8499   return mask;
8500 }
8501
8502 /* Scale profiling counters by estimation for LOOP which is vectorized
8503    by factor VF.  */
8504
8505 static void
8506 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8507 {
8508   edge preheader = loop_preheader_edge (loop);
8509   /* Reduce loop iterations by the vectorization factor.  */
8510   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8511   profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8512
8513   if (freq_h.nonzero_p ())
8514     {
8515       profile_probability p;
8516
8517       /* Avoid dropping loop body profile counter to 0 because of zero count
8518          in loop's preheader.  */
8519       if (!(freq_e == profile_count::zero ()))
8520         freq_e = freq_e.force_nonzero ();
8521       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8522       scale_loop_frequencies (loop, p);
8523     }
8524
8525   edge exit_e = single_exit (loop);
8526   exit_e->probability = profile_probability::always ()
8527                                  .apply_scale (1, new_est_niter + 1);
8528
8529   edge exit_l = single_pred_edge (loop->latch);
8530   profile_probability prob = exit_l->probability;
8531   exit_l->probability = exit_e->probability.invert ();
8532   if (prob.initialized_p () && exit_l->probability.initialized_p ())
8533     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8534 }
8535
8536 /* Function vect_transform_loop.
8537
8538    The analysis phase has determined that the loop is vectorizable.
8539    Vectorize the loop - created vectorized stmts to replace the scalar
8540    stmts in the loop, and update the loop exit condition.
8541    Returns scalar epilogue loop if any.  */
8542
8543 struct loop *
8544 vect_transform_loop (loop_vec_info loop_vinfo)
8545 {
8546   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8547   struct loop *epilogue = NULL;
8548   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8549   int nbbs = loop->num_nodes;
8550   int i;
8551   tree niters_vector = NULL_TREE;
8552   tree step_vector = NULL_TREE;
8553   tree niters_vector_mult_vf = NULL_TREE;
8554   poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8555   unsigned int lowest_vf = constant_lower_bound (vf);
8556   bool grouped_store;
8557   bool slp_scheduled = false;
8558   gimple *stmt, *pattern_stmt;
8559   gimple_seq pattern_def_seq = NULL;
8560   gimple_stmt_iterator pattern_def_si = gsi_none ();
8561   bool transform_pattern_stmt = false;
8562   bool check_profitability = false;
8563   unsigned int th;
8564
8565   if (dump_enabled_p ())
8566     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8567
8568   /* Use the more conservative vectorization threshold.  If the number
8569      of iterations is constant assume the cost check has been performed
8570      by our caller.  If the threshold makes all loops profitable that
8571      run at least the (estimated) vectorization factor number of times
8572      checking is pointless, too.  */
8573   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8574   if (th >= vect_vf_for_cost (loop_vinfo)
8575       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8576     {
8577       if (dump_enabled_p ())
8578         dump_printf_loc (MSG_NOTE, vect_location,
8579                          "Profitability threshold is %d loop iterations.\n",
8580                          th);
8581       check_profitability = true;
8582     }
8583
8584   /* Make sure there exists a single-predecessor exit bb.  Do this before
8585      versioning.   */
8586   edge e = single_exit (loop);
8587   if (! single_pred_p (e->dest))
8588     {
8589       split_loop_exit_edge (e);
8590       if (dump_enabled_p ())
8591         dump_printf (MSG_NOTE, "split exit edge\n");
8592     }
8593
8594   /* Version the loop first, if required, so the profitability check
8595      comes first.  */
8596
8597   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8598     {
8599       poly_uint64 versioning_threshold
8600         = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8601       if (check_profitability
8602           && ordered_p (poly_uint64 (th), versioning_threshold))
8603         {
8604           versioning_threshold = ordered_max (poly_uint64 (th),
8605                                               versioning_threshold);
8606           check_profitability = false;
8607         }
8608       vect_loop_versioning (loop_vinfo, th, check_profitability,
8609                             versioning_threshold);
8610       check_profitability = false;
8611     }
8612
8613   /* Make sure there exists a single-predecessor exit bb also on the
8614      scalar loop copy.  Do this after versioning but before peeling
8615      so CFG structure is fine for both scalar and if-converted loop
8616      to make slpeel_duplicate_current_defs_from_edges face matched
8617      loop closed PHI nodes on the exit.  */
8618   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8619     {
8620       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8621       if (! single_pred_p (e->dest))
8622         {
8623           split_loop_exit_edge (e);
8624           if (dump_enabled_p ())
8625             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8626         }
8627     }
8628
8629   tree niters = vect_build_loop_niters (loop_vinfo);
8630   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8631   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8632   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8633   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8634                               &step_vector, &niters_vector_mult_vf, th,
8635                               check_profitability, niters_no_overflow);
8636
8637   if (niters_vector == NULL_TREE)
8638     {
8639       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8640           && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8641           && known_eq (lowest_vf, vf))
8642         {
8643           niters_vector
8644             = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8645                              LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8646           step_vector = build_one_cst (TREE_TYPE (niters));
8647         }
8648       else
8649         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8650                                      &step_vector, niters_no_overflow);
8651     }
8652
8653   /* 1) Make sure the loop header has exactly two entries
8654      2) Make sure we have a preheader basic block.  */
8655
8656   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8657
8658   split_edge (loop_preheader_edge (loop));
8659
8660   if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8661       && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8662     /* This will deal with any possible peeling.  */
8663     vect_prepare_for_masked_peels (loop_vinfo);
8664
8665   /* FORNOW: the vectorizer supports only loops which body consist
8666      of one basic block (header + empty latch). When the vectorizer will
8667      support more involved loop forms, the order by which the BBs are
8668      traversed need to be reconsidered.  */
8669
8670   for (i = 0; i < nbbs; i++)
8671     {
8672       basic_block bb = bbs[i];
8673       stmt_vec_info stmt_info;
8674
8675       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8676            gsi_next (&si))
8677         {
8678           gphi *phi = si.phi ();
8679           if (dump_enabled_p ())
8680             {
8681               dump_printf_loc (MSG_NOTE, vect_location,
8682                                "------>vectorizing phi: ");
8683               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8684             }
8685           stmt_info = vinfo_for_stmt (phi);
8686           if (!stmt_info)
8687             continue;
8688
8689           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8690             vect_loop_kill_debug_uses (loop, phi);
8691
8692           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8693               && !STMT_VINFO_LIVE_P (stmt_info))
8694             continue;
8695
8696           if (STMT_VINFO_VECTYPE (stmt_info)
8697               && (maybe_ne
8698                   (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8699               && dump_enabled_p ())
8700             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8701
8702           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8703                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8704                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8705               && ! PURE_SLP_STMT (stmt_info))
8706             {
8707               if (dump_enabled_p ())
8708                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8709               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8710             }
8711         }
8712
8713       pattern_stmt = NULL;
8714       for (gimple_stmt_iterator si = gsi_start_bb (bb);
8715            !gsi_end_p (si) || transform_pattern_stmt;)
8716         {
8717           bool is_store;
8718
8719           if (transform_pattern_stmt)
8720             stmt = pattern_stmt;
8721           else
8722             {
8723               stmt = gsi_stmt (si);
8724               /* During vectorization remove existing clobber stmts.  */
8725               if (gimple_clobber_p (stmt))
8726                 {
8727                   unlink_stmt_vdef (stmt);
8728                   gsi_remove (&si, true);
8729                   release_defs (stmt);
8730                   continue;
8731                 }
8732             }
8733
8734           if (dump_enabled_p ())
8735             {
8736               dump_printf_loc (MSG_NOTE, vect_location,
8737                                "------>vectorizing statement: ");
8738               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8739             }
8740
8741           stmt_info = vinfo_for_stmt (stmt);
8742
8743           /* vector stmts created in the outer-loop during vectorization of
8744              stmts in an inner-loop may not have a stmt_info, and do not
8745              need to be vectorized.  */
8746           if (!stmt_info)
8747             {
8748               gsi_next (&si);
8749               continue;
8750             }
8751
8752           if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8753             vect_loop_kill_debug_uses (loop, stmt);
8754
8755           if (!STMT_VINFO_RELEVANT_P (stmt_info)
8756               && !STMT_VINFO_LIVE_P (stmt_info))
8757             {
8758               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8759                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8760                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8761                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8762                 {
8763                   stmt = pattern_stmt;
8764                   stmt_info = vinfo_for_stmt (stmt);
8765                 }
8766               else
8767                 {
8768                   gsi_next (&si);
8769                   continue;
8770                 }
8771             }
8772           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8773                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8774                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8775                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8776             transform_pattern_stmt = true;
8777
8778           /* If pattern statement has def stmts, vectorize them too.  */
8779           if (is_pattern_stmt_p (stmt_info))
8780             {
8781               if (pattern_def_seq == NULL)
8782                 {
8783                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8784                   pattern_def_si = gsi_start (pattern_def_seq);
8785                 }
8786               else if (!gsi_end_p (pattern_def_si))
8787                 gsi_next (&pattern_def_si);
8788               if (pattern_def_seq != NULL)
8789                 {
8790                   gimple *pattern_def_stmt = NULL;
8791                   stmt_vec_info pattern_def_stmt_info = NULL;
8792
8793                   while (!gsi_end_p (pattern_def_si))
8794                     {
8795                       pattern_def_stmt = gsi_stmt (pattern_def_si);
8796                       pattern_def_stmt_info
8797                         = vinfo_for_stmt (pattern_def_stmt);
8798                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8799                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8800                         break;
8801                       gsi_next (&pattern_def_si);
8802                     }
8803
8804                   if (!gsi_end_p (pattern_def_si))
8805                     {
8806                       if (dump_enabled_p ())
8807                         {
8808                           dump_printf_loc (MSG_NOTE, vect_location,
8809                                            "==> vectorizing pattern def "
8810                                            "stmt: ");
8811                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8812                                             pattern_def_stmt, 0);
8813                         }
8814
8815                       stmt = pattern_def_stmt;
8816                       stmt_info = pattern_def_stmt_info;
8817                     }
8818                   else
8819                     {
8820                       pattern_def_si = gsi_none ();
8821                       transform_pattern_stmt = false;
8822                     }
8823                 }
8824               else
8825                 transform_pattern_stmt = false;
8826             }
8827
8828           if (STMT_VINFO_VECTYPE (stmt_info))
8829             {
8830               poly_uint64 nunits
8831                 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8832               if (!STMT_SLP_TYPE (stmt_info)
8833                   && maybe_ne (nunits, vf)
8834                   && dump_enabled_p ())
8835                   /* For SLP VF is set according to unrolling factor, and not
8836                      to vector size, hence for SLP this print is not valid.  */
8837                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8838             }
8839
8840           /* SLP. Schedule all the SLP instances when the first SLP stmt is
8841              reached.  */
8842           if (STMT_SLP_TYPE (stmt_info))
8843             {
8844               if (!slp_scheduled)
8845                 {
8846                   slp_scheduled = true;
8847
8848                   if (dump_enabled_p ())
8849                     dump_printf_loc (MSG_NOTE, vect_location,
8850                                      "=== scheduling SLP instances ===\n");
8851
8852                   vect_schedule_slp (loop_vinfo);
8853                 }
8854
8855               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
8856               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8857                 {
8858                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8859                     {
8860                       pattern_def_seq = NULL;
8861                       gsi_next (&si);
8862                     }
8863                   continue;
8864                 }
8865             }
8866
8867           /* -------- vectorize statement ------------ */
8868           if (dump_enabled_p ())
8869             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8870
8871           grouped_store = false;
8872           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8873           if (is_store)
8874             {
8875               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8876                 {
8877                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
8878                      interleaving chain was completed - free all the stores in
8879                      the chain.  */
8880                   gsi_next (&si);
8881                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
8882                 }
8883               else
8884                 {
8885                   /* Free the attached stmt_vec_info and remove the stmt.  */
8886                   gimple *store = gsi_stmt (si);
8887                   free_stmt_vec_info (store);
8888                   unlink_stmt_vdef (store);
8889                   gsi_remove (&si, true);
8890                   release_defs (store);
8891                 }
8892
8893               /* Stores can only appear at the end of pattern statements.  */
8894               gcc_assert (!transform_pattern_stmt);
8895               pattern_def_seq = NULL;
8896             }
8897           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8898             {
8899               pattern_def_seq = NULL;
8900               gsi_next (&si);
8901             }
8902         }                       /* stmts in BB */
8903
8904       /* Stub out scalar statements that must not survive vectorization.
8905          Doing this here helps with grouped statements, or statements that
8906          are involved in patterns.  */
8907       for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8908            !gsi_end_p (gsi); gsi_next (&gsi))
8909         {
8910           gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8911           if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8912             {
8913               tree lhs = gimple_get_lhs (call);
8914               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8915                 {
8916                   tree zero = build_zero_cst (TREE_TYPE (lhs));
8917                   gimple *new_stmt = gimple_build_assign (lhs, zero);
8918                   gsi_replace (&gsi, new_stmt, true);
8919                 }
8920             }
8921         }
8922     }                           /* BBs in loop */
8923
8924   /* The vectorization factor is always > 1, so if we use an IV increment of 1.
8925      a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8926   if (integer_onep (step_vector))
8927     niters_no_overflow = true;
8928   vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8929                            niters_vector_mult_vf, !niters_no_overflow);
8930
8931   unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8932   scale_profile_for_vect_loop (loop, assumed_vf);
8933
8934   /* True if the final iteration might not handle a full vector's
8935      worth of scalar iterations.  */
8936   bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
8937   /* The minimum number of iterations performed by the epilogue.  This
8938      is 1 when peeling for gaps because we always need a final scalar
8939      iteration.  */
8940   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
8941   /* +1 to convert latch counts to loop iteration counts,
8942      -min_epilogue_iters to remove iterations that cannot be performed
8943        by the vector code.  */
8944   int bias_for_lowest = 1 - min_epilogue_iters;
8945   int bias_for_assumed = bias_for_lowest;
8946   int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
8947   if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8948     {
8949       /* When the amount of peeling is known at compile time, the first
8950          iteration will have exactly alignment_npeels active elements.
8951          In the worst case it will have at least one.  */
8952       int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
8953       bias_for_lowest += lowest_vf - min_first_active;
8954       bias_for_assumed += assumed_vf - min_first_active;
8955     }
8956   /* In these calculations the "- 1" converts loop iteration counts
8957      back to latch counts.  */
8958   if (loop->any_upper_bound)
8959     loop->nb_iterations_upper_bound
8960       = (final_iter_may_be_partial
8961          ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
8962                           lowest_vf) - 1
8963          : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
8964                            lowest_vf) - 1);
8965   if (loop->any_likely_upper_bound)
8966     loop->nb_iterations_likely_upper_bound
8967       = (final_iter_may_be_partial
8968          ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
8969                           + bias_for_lowest, lowest_vf) - 1
8970          : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
8971                            + bias_for_lowest, lowest_vf) - 1);
8972   if (loop->any_estimate)
8973     loop->nb_iterations_estimate
8974       = (final_iter_may_be_partial
8975          ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
8976                           assumed_vf) - 1
8977          : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
8978                            assumed_vf) - 1);
8979
8980   if (dump_enabled_p ())
8981     {
8982       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
8983         {
8984           dump_printf_loc (MSG_NOTE, vect_location,
8985                            "LOOP VECTORIZED\n");
8986           if (loop->inner)
8987             dump_printf_loc (MSG_NOTE, vect_location,
8988                              "OUTER LOOP VECTORIZED\n");
8989           dump_printf (MSG_NOTE, "\n");
8990         }
8991       else
8992         {
8993           dump_printf_loc (MSG_NOTE, vect_location,
8994                            "LOOP EPILOGUE VECTORIZED (VS=");
8995           dump_dec (MSG_NOTE, current_vector_size);
8996           dump_printf (MSG_NOTE, ")\n");
8997         }
8998     }
8999
9000   /* Free SLP instances here because otherwise stmt reference counting
9001      won't work.  */
9002   slp_instance instance;
9003   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9004     vect_free_slp_instance (instance);
9005   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9006   /* Clear-up safelen field since its value is invalid after vectorization
9007      since vectorized loop can have loop-carried dependencies.  */
9008   loop->safelen = 0;
9009
9010   /* Don't vectorize epilogue for epilogue.  */
9011   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9012     epilogue = NULL;
9013
9014   if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
9015     epilogue = NULL;
9016
9017   if (epilogue)
9018     {
9019       auto_vector_sizes vector_sizes;
9020       targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
9021       unsigned int next_size = 0;
9022
9023       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9024           && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
9025           && known_eq (vf, lowest_vf))
9026         {
9027           unsigned int eiters
9028             = (LOOP_VINFO_INT_NITERS (loop_vinfo)
9029                - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
9030           eiters = eiters % lowest_vf;
9031           epilogue->nb_iterations_upper_bound = eiters - 1;
9032
9033           unsigned int ratio;
9034           while (next_size < vector_sizes.length ()
9035                  && !(constant_multiple_p (current_vector_size,
9036                                            vector_sizes[next_size], &ratio)
9037                       && eiters >= lowest_vf / ratio))
9038             next_size += 1;
9039         }
9040       else
9041         while (next_size < vector_sizes.length ()
9042                && maybe_lt (current_vector_size, vector_sizes[next_size]))
9043           next_size += 1;
9044
9045       if (next_size == vector_sizes.length ())
9046         epilogue = NULL;
9047     }
9048
9049   if (epilogue)
9050     {
9051       epilogue->force_vectorize = loop->force_vectorize;
9052       epilogue->safelen = loop->safelen;
9053       epilogue->dont_vectorize = false;
9054
9055       /* We may need to if-convert epilogue to vectorize it.  */
9056       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9057         tree_if_conversion (epilogue);
9058     }
9059
9060   return epilogue;
9061 }
9062
9063 /* The code below is trying to perform simple optimization - revert
9064    if-conversion for masked stores, i.e. if the mask of a store is zero
9065    do not perform it and all stored value producers also if possible.
9066    For example,
9067      for (i=0; i<n; i++)
9068        if (c[i])
9069         {
9070           p1[i] += 1;
9071           p2[i] = p3[i] +2;
9072         }
9073    this transformation will produce the following semi-hammock:
9074
9075    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9076      {
9077        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9078        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9079        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9080        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9081        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9082        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9083      }
9084 */
9085
9086 void
9087 optimize_mask_stores (struct loop *loop)
9088 {
9089   basic_block *bbs = get_loop_body (loop);
9090   unsigned nbbs = loop->num_nodes;
9091   unsigned i;
9092   basic_block bb;
9093   struct loop *bb_loop;
9094   gimple_stmt_iterator gsi;
9095   gimple *stmt;
9096   auto_vec<gimple *> worklist;
9097
9098   vect_location = find_loop_location (loop);
9099   /* Pick up all masked stores in loop if any.  */
9100   for (i = 0; i < nbbs; i++)
9101     {
9102       bb = bbs[i];
9103       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9104            gsi_next (&gsi))
9105         {
9106           stmt = gsi_stmt (gsi);
9107           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9108             worklist.safe_push (stmt);
9109         }
9110     }
9111
9112   free (bbs);
9113   if (worklist.is_empty ())
9114     return;
9115
9116   /* Loop has masked stores.  */
9117   while (!worklist.is_empty ())
9118     {
9119       gimple *last, *last_store;
9120       edge e, efalse;
9121       tree mask;
9122       basic_block store_bb, join_bb;
9123       gimple_stmt_iterator gsi_to;
9124       tree vdef, new_vdef;
9125       gphi *phi;
9126       tree vectype;
9127       tree zero;
9128
9129       last = worklist.pop ();
9130       mask = gimple_call_arg (last, 2);
9131       bb = gimple_bb (last);
9132       /* Create then_bb and if-then structure in CFG, then_bb belongs to
9133          the same loop as if_bb.  It could be different to LOOP when two
9134          level loop-nest is vectorized and mask_store belongs to the inner
9135          one.  */
9136       e = split_block (bb, last);
9137       bb_loop = bb->loop_father;
9138       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9139       join_bb = e->dest;
9140       store_bb = create_empty_bb (bb);
9141       add_bb_to_loop (store_bb, bb_loop);
9142       e->flags = EDGE_TRUE_VALUE;
9143       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9144       /* Put STORE_BB to likely part.  */
9145       efalse->probability = profile_probability::unlikely ();
9146       store_bb->count = efalse->count ();
9147       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9148       if (dom_info_available_p (CDI_DOMINATORS))
9149         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9150       if (dump_enabled_p ())
9151         dump_printf_loc (MSG_NOTE, vect_location,
9152                          "Create new block %d to sink mask stores.",
9153                          store_bb->index);
9154       /* Create vector comparison with boolean result.  */
9155       vectype = TREE_TYPE (mask);
9156       zero = build_zero_cst (vectype);
9157       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9158       gsi = gsi_last_bb (bb);
9159       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9160       /* Create new PHI node for vdef of the last masked store:
9161          .MEM_2 = VDEF <.MEM_1>
9162          will be converted to
9163          .MEM.3 = VDEF <.MEM_1>
9164          and new PHI node will be created in join bb
9165          .MEM_2 = PHI <.MEM_1, .MEM_3>
9166       */
9167       vdef = gimple_vdef (last);
9168       new_vdef = make_ssa_name (gimple_vop (cfun), last);
9169       gimple_set_vdef (last, new_vdef);
9170       phi = create_phi_node (vdef, join_bb);
9171       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9172
9173       /* Put all masked stores with the same mask to STORE_BB if possible.  */
9174       while (true)
9175         {
9176           gimple_stmt_iterator gsi_from;
9177           gimple *stmt1 = NULL;
9178
9179           /* Move masked store to STORE_BB.  */
9180           last_store = last;
9181           gsi = gsi_for_stmt (last);
9182           gsi_from = gsi;
9183           /* Shift GSI to the previous stmt for further traversal.  */
9184           gsi_prev (&gsi);
9185           gsi_to = gsi_start_bb (store_bb);
9186           gsi_move_before (&gsi_from, &gsi_to);
9187           /* Setup GSI_TO to the non-empty block start.  */
9188           gsi_to = gsi_start_bb (store_bb);
9189           if (dump_enabled_p ())
9190             {
9191               dump_printf_loc (MSG_NOTE, vect_location,
9192                                "Move stmt to created bb\n");
9193               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9194             }
9195           /* Move all stored value producers if possible.  */
9196           while (!gsi_end_p (gsi))
9197             {
9198               tree lhs;
9199               imm_use_iterator imm_iter;
9200               use_operand_p use_p;
9201               bool res;
9202
9203               /* Skip debug statements.  */
9204               if (is_gimple_debug (gsi_stmt (gsi)))
9205                 {
9206                   gsi_prev (&gsi);
9207                   continue;
9208                 }
9209               stmt1 = gsi_stmt (gsi);
9210               /* Do not consider statements writing to memory or having
9211                  volatile operand.  */
9212               if (gimple_vdef (stmt1)
9213                   || gimple_has_volatile_ops (stmt1))
9214                 break;
9215               gsi_from = gsi;
9216               gsi_prev (&gsi);
9217               lhs = gimple_get_lhs (stmt1);
9218               if (!lhs)
9219                 break;
9220
9221               /* LHS of vectorized stmt must be SSA_NAME.  */
9222               if (TREE_CODE (lhs) != SSA_NAME)
9223                 break;
9224
9225               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9226                 {
9227                   /* Remove dead scalar statement.  */
9228                   if (has_zero_uses (lhs))
9229                     {
9230                       gsi_remove (&gsi_from, true);
9231                       continue;
9232                     }
9233                 }
9234
9235               /* Check that LHS does not have uses outside of STORE_BB.  */
9236               res = true;
9237               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9238                 {
9239                   gimple *use_stmt;
9240                   use_stmt = USE_STMT (use_p);
9241                   if (is_gimple_debug (use_stmt))
9242                     continue;
9243                   if (gimple_bb (use_stmt) != store_bb)
9244                     {
9245                       res = false;
9246                       break;
9247                     }
9248                 }
9249               if (!res)
9250                 break;
9251
9252               if (gimple_vuse (stmt1)
9253                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
9254                 break;
9255
9256               /* Can move STMT1 to STORE_BB.  */
9257               if (dump_enabled_p ())
9258                 {
9259                   dump_printf_loc (MSG_NOTE, vect_location,
9260                                    "Move stmt to created bb\n");
9261                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9262                 }
9263               gsi_move_before (&gsi_from, &gsi_to);
9264               /* Shift GSI_TO for further insertion.  */
9265               gsi_prev (&gsi_to);
9266             }
9267           /* Put other masked stores with the same mask to STORE_BB.  */
9268           if (worklist.is_empty ()
9269               || gimple_call_arg (worklist.last (), 2) != mask
9270               || worklist.last () != stmt1)
9271             break;
9272           last = worklist.pop ();
9273         }
9274       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
9275     }
9276 }