[official-gcc.git] / gcc / tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2018 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
53 #include "internal-fn.h"
54 #include "tree-vector-builder.h"
55 #include "vec-perm-indices.h"
56 #include "tree-eh.h"
58 /* Loop Vectorization Pass.
60 This pass tries to vectorize loops.
62 For example, the vectorizer transforms the following simple loop:
64 short a[N]; short b[N]; short c[N]; int i;
66 for (i=0; i<N; i++){
67 a[i] = b[i] + c[i];
70 as if it was manually vectorized by rewriting the source code into:
72 typedef int __attribute__((mode(V8HI))) v8hi;
73 short a[N]; short b[N]; short c[N]; int i;
74 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
75 v8hi va, vb, vc;
77 for (i=0; i<N/8; i++){
78 vb = pb[i];
79 vc = pc[i];
80 va = vb + vc;
81 pa[i] = va;
84 The main entry to this pass is vectorize_loops(), in which
85 the vectorizer applies a set of analyses on a given set of loops,
86 followed by the actual vectorization transformation for the loops that
87 had successfully passed the analysis phase.
88 Throughout this pass we make a distinction between two types of
89 data: scalars (which are represented by SSA_NAMES), and memory references
90 ("data-refs"). These two types of data require different handling both
91 during analysis and transformation. The types of data-refs that the
92 vectorizer currently supports are ARRAY_REFs whose base is an array DECL
93 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
94 accesses are required to have a simple (consecutive) access pattern.
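   For example (an illustration of the restriction above, not an exhaustive
   list): the access a[i] in

     for (i=0; i<N; i++)
       sum += a[i];

   has the required consecutive pattern, whereas an access whose stride is
   not 1, such as a[2*i], does not.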
96 Analysis phase:
97 ===============
98 The driver for the analysis phase is vect_analyze_loop().
99 It applies a set of analyses, some of which rely on the scalar evolution
100 analyzer (scev) developed by Sebastian Pop.
102 During the analysis phase the vectorizer records some information
103 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
104 loop, as well as general information about the loop as a whole, which is
105 recorded in a "loop_vec_info" struct attached to each loop.
107 Transformation phase:
108 =====================
109 The loop transformation phase scans all the stmts in the loop, and
110 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
111 the loop that needs to be vectorized. It inserts the vector code sequence
112 just before the scalar stmt S, and records a pointer to the vector code
113 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
114 attached to S). This pointer will be used for the vectorization of following
115 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
116 otherwise, we rely on dead code elimination for removing it.
118 For example, say stmt S1 was vectorized into stmt VS1:
120 VS1: vb = px[i];
121 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
122 S2: a = b;
124 To vectorize stmt S2, the vectorizer first finds the stmt that defines
125 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
126 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
127 resulting sequence would be:
129 VS1: vb = px[i];
130 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
131 VS2: va = vb;
132 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
134 Operands that are not SSA_NAMEs are data-refs that appear in
135 load/store operations (like 'x[i]' in S1), and are handled differently.
137 Target modeling:
138 =================
139 Currently the only target specific information that is used is the
140 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
141 Targets that can support different vector sizes will, for now, need
142 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
143 flexibility will be added in the future.
145 Since we only vectorize operations whose vector form can be
146 expressed using existing tree codes, to verify that an operation is
147 supported, the vectorizer checks the relevant optab at the relevant
148 machine_mode (e.g, optab_handler (add_optab, V8HImode)). If
149 the value found is CODE_FOR_nothing, then there's no target support, and
150 we can't vectorize the stmt.
152 For additional information on this project see:
153 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
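   For illustration (a hedged sketch, not part of the pass itself): the
   target-support check described above reduces to an optab query such as

     if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
       return false;

   i.e. if the lookup yields CODE_FOR_nothing there is no target support
   and the statement cannot be vectorized; add_optab and V8HImode stand in
   for the operation and vector mode of the statement being analyzed.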
156 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
158 /* Function vect_determine_vectorization_factor
160 Determine the vectorization factor (VF). VF is the number of data elements
161 that are operated upon in parallel in a single iteration of the vectorized
162 loop. For example, when vectorizing a loop that operates on 4-byte elements,
163 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
164 elements can fit in a single vector register.
166 We currently support vectorization of loops in which all types operated upon
167 are of the same size. Therefore this function currently sets VF according to
168 the size of the types operated upon, and fails if there are multiple sizes
169 in the loop.
171 VF is also the factor by which the loop iterations are strip-mined, e.g.:
172 original loop:
173 for (i=0; i<N; i++){
174 a[i] = b[i] + c[i];
177 vectorized loop:
178 for (i=0; i<N; i+=VF){
179 a[i:VF] = b[i:VF] + c[i:VF];
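   As a minimal sketch of the arithmetic described above (illustrative
   only; the real code below works on poly_uint64 values and vector types):

     unsigned vf = vector_size_in_bytes / element_size_in_bytes;

   e.g. 16-byte vectors operating on 4-byte elements give VF = 4, and the
   strip-mined loop then advances its induction variable by VF each
   iteration.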
183 static bool
184 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
186 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
187 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
188 unsigned nbbs = loop->num_nodes;
189 poly_uint64 vectorization_factor = 1;
190 tree scalar_type = NULL_TREE;
191 gphi *phi;
192 tree vectype;
193 stmt_vec_info stmt_info;
194 unsigned i;
195 HOST_WIDE_INT dummy;
196 gimple *stmt, *pattern_stmt = NULL;
197 gimple_seq pattern_def_seq = NULL;
198 gimple_stmt_iterator pattern_def_si = gsi_none ();
199 bool analyze_pattern_stmt = false;
200 bool bool_result;
201 auto_vec<stmt_vec_info> mask_producers;
203 if (dump_enabled_p ())
204 dump_printf_loc (MSG_NOTE, vect_location,
205 "=== vect_determine_vectorization_factor ===\n");
207 for (i = 0; i < nbbs; i++)
209 basic_block bb = bbs[i];
211 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
212 gsi_next (&si))
214 phi = si.phi ();
215 stmt_info = vinfo_for_stmt (phi);
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
222 gcc_assert (stmt_info);
224 if (STMT_VINFO_RELEVANT_P (stmt_info)
225 || STMT_VINFO_LIVE_P (stmt_info))
227 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
228 scalar_type = TREE_TYPE (PHI_RESULT (phi));
230 if (dump_enabled_p ())
232 dump_printf_loc (MSG_NOTE, vect_location,
233 "get vectype for scalar type: ");
234 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
235 dump_printf (MSG_NOTE, "\n");
238 vectype = get_vectype_for_scalar_type (scalar_type);
239 if (!vectype)
241 if (dump_enabled_p ())
243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
244 "not vectorized: unsupported "
245 "data-type ");
246 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
247 scalar_type);
248 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
250 return false;
252 STMT_VINFO_VECTYPE (stmt_info) = vectype;
254 if (dump_enabled_p ())
256 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
257 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
258 dump_printf (MSG_NOTE, "\n");
261 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
264 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vectype));
265 dump_printf (MSG_NOTE, "\n");
268 vect_update_max_nunits (&vectorization_factor, vectype);
272 for (gimple_stmt_iterator si = gsi_start_bb (bb);
273 !gsi_end_p (si) || analyze_pattern_stmt;)
275 tree vf_vectype;
277 if (analyze_pattern_stmt)
278 stmt = pattern_stmt;
279 else
280 stmt = gsi_stmt (si);
282 stmt_info = vinfo_for_stmt (stmt);
284 if (dump_enabled_p ())
286 dump_printf_loc (MSG_NOTE, vect_location,
287 "==> examining statement: ");
288 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
291 gcc_assert (stmt_info);
293 /* Skip stmts which do not need to be vectorized. */
294 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
295 && !STMT_VINFO_LIVE_P (stmt_info))
296 || gimple_clobber_p (stmt))
298 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
303 stmt = pattern_stmt;
304 stmt_info = vinfo_for_stmt (pattern_stmt);
305 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE, vect_location,
308 "==> examining pattern statement: ");
309 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
312 else
314 if (dump_enabled_p ())
315 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
316 gsi_next (&si);
317 continue;
320 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
321 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
322 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
323 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
324 analyze_pattern_stmt = true;
326 /* If a pattern statement has def stmts, analyze them too. */
327 if (is_pattern_stmt_p (stmt_info))
329 if (pattern_def_seq == NULL)
331 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
332 pattern_def_si = gsi_start (pattern_def_seq);
334 else if (!gsi_end_p (pattern_def_si))
335 gsi_next (&pattern_def_si);
336 if (pattern_def_seq != NULL)
338 gimple *pattern_def_stmt = NULL;
339 stmt_vec_info pattern_def_stmt_info = NULL;
341 while (!gsi_end_p (pattern_def_si))
343 pattern_def_stmt = gsi_stmt (pattern_def_si);
344 pattern_def_stmt_info
345 = vinfo_for_stmt (pattern_def_stmt);
346 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
347 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
348 break;
349 gsi_next (&pattern_def_si);
352 if (!gsi_end_p (pattern_def_si))
354 if (dump_enabled_p ())
356 dump_printf_loc (MSG_NOTE, vect_location,
357 "==> examining pattern def stmt: ");
358 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
359 pattern_def_stmt, 0);
362 stmt = pattern_def_stmt;
363 stmt_info = pattern_def_stmt_info;
365 else
367 pattern_def_si = gsi_none ();
368 analyze_pattern_stmt = false;
371 else
372 analyze_pattern_stmt = false;
375 if (gimple_get_lhs (stmt) == NULL_TREE
376 /* MASK_STORE has no lhs, but is ok. */
377 && (!is_gimple_call (stmt)
378 || !gimple_call_internal_p (stmt)
379 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
381 if (is_gimple_call (stmt))
383 /* Ignore calls with no lhs. These must be calls to
384 #pragma omp simd functions, and what vectorization factor
385 it really needs can't be determined until
386 vectorizable_simd_clone_call. */
387 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
389 pattern_def_seq = NULL;
390 gsi_next (&si);
392 continue;
394 if (dump_enabled_p ())
396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
397 "not vectorized: irregular stmt.");
398 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
401 return false;
404 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
406 if (dump_enabled_p ())
408 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
409 "not vectorized: vector stmt in loop:");
410 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
412 return false;
415 bool_result = false;
417 if (STMT_VINFO_VECTYPE (stmt_info))
419 /* The only case in which a vectype has already been set is for stmts
420 that contain a dataref, or for "pattern-stmts" (stmts
421 generated by the vectorizer to represent/replace a certain
422 idiom). */
423 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
424 || is_pattern_stmt_p (stmt_info)
425 || !gsi_end_p (pattern_def_si));
426 vectype = STMT_VINFO_VECTYPE (stmt_info);
428 else
430 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
431 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
432 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
433 else
434 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
436 /* Bool ops don't participate in vectorization factor
437 computation. For comparisons, use the compared types to
438 compute a factor. */
439 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
440 && is_gimple_assign (stmt)
441 && gimple_assign_rhs_code (stmt) != COND_EXPR)
443 if (STMT_VINFO_RELEVANT_P (stmt_info)
444 || STMT_VINFO_LIVE_P (stmt_info))
445 mask_producers.safe_push (stmt_info);
446 bool_result = true;
448 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
449 == tcc_comparison
450 && !VECT_SCALAR_BOOLEAN_TYPE_P
451 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
452 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
453 else
455 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
457 pattern_def_seq = NULL;
458 gsi_next (&si);
460 continue;
464 if (dump_enabled_p ())
466 dump_printf_loc (MSG_NOTE, vect_location,
467 "get vectype for scalar type: ");
468 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
469 dump_printf (MSG_NOTE, "\n");
471 vectype = get_vectype_for_scalar_type (scalar_type);
472 if (!vectype)
474 if (dump_enabled_p ())
476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
477 "not vectorized: unsupported "
478 "data-type ");
479 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
480 scalar_type);
481 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
483 return false;
486 if (!bool_result)
487 STMT_VINFO_VECTYPE (stmt_info) = vectype;
489 if (dump_enabled_p ())
491 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
492 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
493 dump_printf (MSG_NOTE, "\n");
497 /* Don't try to compute the VF from scalar types if the stmt
498 produces a boolean vector. Use the result vectype instead. */
499 if (VECTOR_BOOLEAN_TYPE_P (vectype))
500 vf_vectype = vectype;
501 else
503 /* The vectorization factor is determined by the smallest
504 scalar type (or the largest vector size, but we only
505 support one vector size per loop). */
506 if (!bool_result)
507 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
508 &dummy);
509 if (dump_enabled_p ())
511 dump_printf_loc (MSG_NOTE, vect_location,
512 "get vectype for scalar type: ");
513 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
514 dump_printf (MSG_NOTE, "\n");
516 vf_vectype = get_vectype_for_scalar_type (scalar_type);
518 if (!vf_vectype)
520 if (dump_enabled_p ())
522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
523 "not vectorized: unsupported data-type ");
524 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
525 scalar_type);
526 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
528 return false;
531 if (maybe_ne (GET_MODE_SIZE (TYPE_MODE (vectype)),
532 GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
534 if (dump_enabled_p ())
536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
537 "not vectorized: different sized vector "
538 "types in statement, ");
539 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
540 vectype);
541 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
542 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
543 vf_vectype);
544 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
546 return false;
549 if (dump_enabled_p ())
551 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
552 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
553 dump_printf (MSG_NOTE, "\n");
556 if (dump_enabled_p ())
558 dump_printf_loc (MSG_NOTE, vect_location, "nunits = ");
559 dump_dec (MSG_NOTE, TYPE_VECTOR_SUBPARTS (vf_vectype));
560 dump_printf (MSG_NOTE, "\n");
563 vect_update_max_nunits (&vectorization_factor, vf_vectype);
565 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
567 pattern_def_seq = NULL;
568 gsi_next (&si);
573 /* TODO: Analyze cost. Decide if worth while to vectorize. */
574 if (dump_enabled_p ())
576 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = ");
577 dump_dec (MSG_NOTE, vectorization_factor);
578 dump_printf (MSG_NOTE, "\n");
581 if (known_le (vectorization_factor, 1U))
583 if (dump_enabled_p ())
584 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
585 "not vectorized: unsupported data-type\n");
586 return false;
588 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
590 for (i = 0; i < mask_producers.length (); i++)
592 tree mask_type = NULL;
594 stmt = STMT_VINFO_STMT (mask_producers[i]);
596 if (is_gimple_assign (stmt)
597 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
598 && !VECT_SCALAR_BOOLEAN_TYPE_P
599 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
601 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
602 mask_type = get_mask_type_for_scalar_type (scalar_type);
604 if (!mask_type)
606 if (dump_enabled_p ())
607 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
608 "not vectorized: unsupported mask\n");
609 return false;
612 else
614 tree rhs;
615 ssa_op_iter iter;
616 gimple *def_stmt;
617 enum vect_def_type dt;
619 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
621 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
622 &def_stmt, &dt, &vectype))
624 if (dump_enabled_p ())
626 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
627 "not vectorized: can't compute mask type "
628 "for statement, ");
629 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
632 return false;
635 /* No vectype probably means external definition.
636 Allow it in case there is another operand which
637 allows us to determine the mask type. */
638 if (!vectype)
639 continue;
641 if (!mask_type)
642 mask_type = vectype;
643 else if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
644 TYPE_VECTOR_SUBPARTS (vectype)))
646 if (dump_enabled_p ())
648 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
649 "not vectorized: different sized masks "
650 "types in statement, ");
651 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
652 mask_type);
653 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
654 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
655 vectype);
656 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
658 return false;
660 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
661 != VECTOR_BOOLEAN_TYPE_P (vectype))
663 if (dump_enabled_p ())
665 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
666 "not vectorized: mixed mask and "
667 "nonmask vector types in statement, ");
668 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
669 mask_type);
670 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
671 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
672 vectype);
673 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
675 return false;
679 /* We may compare a boolean value loaded as a vector of integers.
680 Fix mask_type in such a case. */
681 if (mask_type
682 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
683 && gimple_code (stmt) == GIMPLE_ASSIGN
684 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
685 mask_type = build_same_sized_truth_vector_type (mask_type);
688 /* Having no mask_type should mean a loop-invariant predicate.
689 This is probably a subject for optimization in
690 if-conversion. */
691 if (!mask_type)
693 if (dump_enabled_p ())
695 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
696 "not vectorized: can't compute mask type "
697 "for statement, ");
698 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
701 return false;
704 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
707 return true;
711 /* Function vect_is_simple_iv_evolution.
713 FORNOW: A simple evolution of an induction variable in the loop is
714 considered a polynomial evolution. */
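/* For example (an illustrative sketch; the chrec notation is the one used
   by the scalar evolution analyzer):

     for (i = init; i < n; i += step)
       ...

   gives the access function {init, +, step}_loop, a degree-1 polynomial;
   *INIT and *STEP are then set to init and step.  An evolution whose step
   itself varies in the loop (degree >= 2) is rejected as not "simple".  */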
716 static bool
717 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
718 tree * step)
720 tree init_expr;
721 tree step_expr;
722 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
723 basic_block bb;
725 /* When there is no evolution in this loop, the evolution function
726 is not "simple". */
727 if (evolution_part == NULL_TREE)
728 return false;
730 /* When the evolution is a polynomial of degree >= 2
731 the evolution function is not "simple". */
732 if (tree_is_chrec (evolution_part))
733 return false;
735 step_expr = evolution_part;
736 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
738 if (dump_enabled_p ())
740 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
741 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
742 dump_printf (MSG_NOTE, ", init: ");
743 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
744 dump_printf (MSG_NOTE, "\n");
747 *init = init_expr;
748 *step = step_expr;
750 if (TREE_CODE (step_expr) != INTEGER_CST
751 && (TREE_CODE (step_expr) != SSA_NAME
752 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
753 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
754 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
755 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
756 || !flag_associative_math)))
757 && (TREE_CODE (step_expr) != REAL_CST
758 || !flag_associative_math))
760 if (dump_enabled_p ())
761 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
762 "step unknown.\n");
763 return false;
766 return true;
769 /* Function vect_analyze_scalar_cycles_1.
771 Examine the cross iteration def-use cycles of scalar variables
772 in LOOP. LOOP_VINFO represents the loop that is now being
773 considered for vectorization (can be LOOP, or an outer-loop
774 enclosing LOOP). */
776 static void
777 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
779 basic_block bb = loop->header;
780 tree init, step;
781 auto_vec<gimple *, 64> worklist;
782 gphi_iterator gsi;
783 bool double_reduc;
785 if (dump_enabled_p ())
786 dump_printf_loc (MSG_NOTE, vect_location,
787 "=== vect_analyze_scalar_cycles ===\n");
789 /* First - identify all inductions. Reduction detection assumes that all the
790 inductions have been identified, therefore, this order must not be
791 changed. */
792 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
794 gphi *phi = gsi.phi ();
795 tree access_fn = NULL;
796 tree def = PHI_RESULT (phi);
797 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
799 if (dump_enabled_p ())
801 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
802 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
805 /* Skip virtual phi's. The data dependences that are associated with
806 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
807 if (virtual_operand_p (def))
808 continue;
810 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
812 /* Analyze the evolution function. */
813 access_fn = analyze_scalar_evolution (loop, def);
814 if (access_fn)
816 STRIP_NOPS (access_fn);
817 if (dump_enabled_p ())
819 dump_printf_loc (MSG_NOTE, vect_location,
820 "Access function of PHI: ");
821 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
822 dump_printf (MSG_NOTE, "\n");
824 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
825 = initial_condition_in_loop_num (access_fn, loop->num);
826 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
827 = evolution_part_in_loop_num (access_fn, loop->num);
830 if (!access_fn
831 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
832 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
833 && TREE_CODE (step) != INTEGER_CST))
835 worklist.safe_push (phi);
836 continue;
839 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
840 != NULL_TREE);
841 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
843 if (dump_enabled_p ())
844 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
845 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
849 /* Second - identify all reductions and nested cycles. */
850 while (worklist.length () > 0)
852 gimple *phi = worklist.pop ();
853 tree def = PHI_RESULT (phi);
854 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
855 gimple *reduc_stmt;
857 if (dump_enabled_p ())
859 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
860 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
863 gcc_assert (!virtual_operand_p (def)
864 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
866 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
867 &double_reduc, false);
868 if (reduc_stmt)
870 if (double_reduc)
872 if (dump_enabled_p ())
873 dump_printf_loc (MSG_NOTE, vect_location,
874 "Detected double reduction.\n");
876 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
877 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
878 vect_double_reduction_def;
880 else
882 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
884 if (dump_enabled_p ())
885 dump_printf_loc (MSG_NOTE, vect_location,
886 "Detected vectorizable nested cycle.\n");
888 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
889 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
890 vect_nested_cycle;
892 else
894 if (dump_enabled_p ())
895 dump_printf_loc (MSG_NOTE, vect_location,
896 "Detected reduction.\n");
898 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
899 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
900 vect_reduction_def;
901 /* Store the reduction cycles for possible vectorization in
902 loop-aware SLP if it was not detected as a reduction
903 chain. */
904 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
905 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
909 else
910 if (dump_enabled_p ())
911 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
912 "Unknown def-use cycle pattern.\n");
917 /* Function vect_analyze_scalar_cycles.
919 Examine the cross iteration def-use cycles of scalar variables, by
920 analyzing the loop-header PHIs of scalar variables. Classify each
921 cycle as one of the following: invariant, induction, reduction, unknown.
922 We do that for the loop represented by LOOP_VINFO, and also for its
923 inner-loop, if one exists.
924 Examples of scalar cycles:
926 Example1: reduction:
928 loop1:
929 for (i=0; i<N; i++)
930 sum += a[i];
932 Example2: induction:
934 loop2:
935 for (i=0; i<N; i++)
936 a[i] = i; */
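/* Example3: nested cycle (illustrative; only relevant when considering
   an outer loop for vectorization):

     loop3:
     for (i=0; i<N; i++)
       {
         x = 0;
         for (j=0; j<M; j++)
           x = x + b[i][j];
         a[i] = x;
       }

   Seen from the outer loop, the cross-iteration cycle of x in the inner
   loop is classified as a nested cycle.  */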
938 static void
939 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
941 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
943 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
945 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
946 Reductions in such an inner-loop therefore have different properties than
947 the reductions in the nest that gets vectorized:
948 1. When vectorized, they are executed in the same order as in the original
949 scalar loop, so we can't change the order of computation when
950 vectorizing them.
951 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
952 current checks are too strict. */
954 if (loop->inner)
955 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
958 /* Transfer group and reduction information from STMT to its pattern stmt. */
960 static void
961 vect_fixup_reduc_chain (gimple *stmt)
963 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
964 gimple *stmtp;
965 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
966 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
967 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
970 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
971 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
972 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
973 if (stmt)
974 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
975 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
977 while (stmt);
978 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
981 /* Fixup scalar cycles that now have their stmts detected as patterns. */
983 static void
984 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
986 gimple *first;
987 unsigned i;
989 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
990 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
992 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
993 while (next)
995 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
996 break;
997 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
999 /* If not all stmts in the chain are patterns, try to handle
1000 the chain without patterns. */
1001 if (! next)
1003 vect_fixup_reduc_chain (first);
1004 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
1005 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1010 /* Function vect_get_loop_niters.
1012 Determine how many iterations the loop executes and place the result
1013 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1014 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1015 niter information holds in ASSUMPTIONS.
1017 Return the loop exit condition. */
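/* For example (illustrative): for a loop such as

     for (i = 0; i < n; i++)
       ...

   with n known to be positive, NUMBER_OF_ITERATIONSM1 is n - 1 (the
   number of latch executions) and NUMBER_OF_ITERATIONS is n (the number
   of header executions, i.e. latch executions plus one).  */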
1020 static gcond *
1021 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1022 tree *number_of_iterations, tree *number_of_iterationsm1)
1024 edge exit = single_exit (loop);
1025 struct tree_niter_desc niter_desc;
1026 tree niter_assumptions, niter, may_be_zero;
1027 gcond *cond = get_loop_exit_condition (loop);
1029 *assumptions = boolean_true_node;
1030 *number_of_iterationsm1 = chrec_dont_know;
1031 *number_of_iterations = chrec_dont_know;
1032 if (dump_enabled_p ())
1033 dump_printf_loc (MSG_NOTE, vect_location,
1034 "=== get_loop_niters ===\n");
1036 if (!exit)
1037 return cond;
1039 niter = chrec_dont_know;
1040 may_be_zero = NULL_TREE;
1041 niter_assumptions = boolean_true_node;
1042 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1043 || chrec_contains_undetermined (niter_desc.niter))
1044 return cond;
1046 niter_assumptions = niter_desc.assumptions;
1047 may_be_zero = niter_desc.may_be_zero;
1048 niter = niter_desc.niter;
1050 if (may_be_zero && integer_zerop (may_be_zero))
1051 may_be_zero = NULL_TREE;
1053 if (may_be_zero)
1055 if (COMPARISON_CLASS_P (may_be_zero))
1057 /* Try to combine may_be_zero with assumptions; this can simplify
1058 the computation of the niter expression. */
1059 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1060 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1061 niter_assumptions,
1062 fold_build1 (TRUTH_NOT_EXPR,
1063 boolean_type_node,
1064 may_be_zero));
1065 else
1066 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1067 build_int_cst (TREE_TYPE (niter), 0),
1068 rewrite_to_non_trapping_overflow (niter));
1070 may_be_zero = NULL_TREE;
1072 else if (integer_nonzerop (may_be_zero))
1074 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1075 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1076 return cond;
1078 else
1079 return cond;
1082 *assumptions = niter_assumptions;
1083 *number_of_iterationsm1 = niter;
1085 /* We want the number of loop header executions which is the number
1086 of latch executions plus one.
1087 ??? For UINT_MAX latch executions this number overflows to zero
1088 for loops like do { n++; } while (n != 0); */
1089 if (niter && !chrec_contains_undetermined (niter))
1090 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1091 build_int_cst (TREE_TYPE (niter), 1));
1092 *number_of_iterations = niter;
1094 return cond;
1097 /* Function bb_in_loop_p
1099 Used as predicate for dfs order traversal of the loop bbs. */
1101 static bool
1102 bb_in_loop_p (const_basic_block bb, const void *data)
1104 const struct loop *const loop = (const struct loop *)data;
1105 if (flow_bb_inside_loop_p (loop, bb))
1106 return true;
1107 return false;
1111 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1112 stmt_vec_info structs for all the stmts in LOOP_IN. */
1114 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1115 : vec_info (vec_info::loop, init_cost (loop_in)),
1116 loop (loop_in),
1117 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1118 num_itersm1 (NULL_TREE),
1119 num_iters (NULL_TREE),
1120 num_iters_unchanged (NULL_TREE),
1121 num_iters_assumptions (NULL_TREE),
1122 th (0),
1123 versioning_threshold (0),
1124 vectorization_factor (0),
1125 max_vectorization_factor (0),
1126 mask_skip_niters (NULL_TREE),
1127 mask_compare_type (NULL_TREE),
1128 unaligned_dr (NULL),
1129 peeling_for_alignment (0),
1130 ptr_mask (0),
1131 ivexpr_map (NULL),
1132 slp_unrolling_factor (1),
1133 single_scalar_iteration_cost (0),
1134 vectorizable (false),
1135 can_fully_mask_p (true),
1136 fully_masked_p (false),
1137 peeling_for_gaps (false),
1138 peeling_for_niter (false),
1139 operands_swapped (false),
1140 no_data_dependencies (false),
1141 has_mask_store (false),
1142 scalar_loop (NULL),
1143 orig_loop_info (NULL)
1145 /* Create/Update stmt_info for all stmts in the loop. */
1146 basic_block *body = get_loop_body (loop);
1147 for (unsigned int i = 0; i < loop->num_nodes; i++)
1149 basic_block bb = body[i];
1150 gimple_stmt_iterator si;
1152 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1154 gimple *phi = gsi_stmt (si);
1155 gimple_set_uid (phi, 0);
1156 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1159 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1161 gimple *stmt = gsi_stmt (si);
1162 gimple_set_uid (stmt, 0);
1163 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1166 free (body);
1168 /* CHECKME: We want to visit all BBs before their successors (except for
1169 latch blocks, for which this assertion wouldn't hold). In the simple
1170 case of the loop forms we allow, a dfs order of the BBs would be the same
1171 as reversed postorder traversal, so we are safe. */
1173 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1174 bbs, loop->num_nodes, loop);
1175 gcc_assert (nbbs == loop->num_nodes);
1178 /* Free all levels of MASKS. */
1180 void
1181 release_vec_loop_masks (vec_loop_masks *masks)
1183 rgroup_masks *rgm;
1184 unsigned int i;
1185 FOR_EACH_VEC_ELT (*masks, i, rgm)
1186 rgm->masks.release ();
1187 masks->release ();
1190 /* Free all memory used by the _loop_vec_info, as well as all the
1191 stmt_vec_info structs of all the stmts in the loop. */
1193 _loop_vec_info::~_loop_vec_info ()
1195 int nbbs;
1196 gimple_stmt_iterator si;
1197 int j;
1199 nbbs = loop->num_nodes;
1200 for (j = 0; j < nbbs; j++)
1202 basic_block bb = bbs[j];
1203 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1204 free_stmt_vec_info (gsi_stmt (si));
1206 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1208 gimple *stmt = gsi_stmt (si);
1210 /* We may have broken canonical form by moving a constant
1211 into RHS1 of a commutative op. Fix such occurrences. */
1212 if (operands_swapped && is_gimple_assign (stmt))
1214 enum tree_code code = gimple_assign_rhs_code (stmt);
1216 if ((code == PLUS_EXPR
1217 || code == POINTER_PLUS_EXPR
1218 || code == MULT_EXPR)
1219 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1220 swap_ssa_operands (stmt,
1221 gimple_assign_rhs1_ptr (stmt),
1222 gimple_assign_rhs2_ptr (stmt));
1223 else if (code == COND_EXPR
1224 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1226 tree cond_expr = gimple_assign_rhs1 (stmt);
1227 enum tree_code cond_code = TREE_CODE (cond_expr);
1229 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1231 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1232 0));
1233 cond_code = invert_tree_comparison (cond_code,
1234 honor_nans);
1235 if (cond_code != ERROR_MARK)
1237 TREE_SET_CODE (cond_expr, cond_code);
1238 swap_ssa_operands (stmt,
1239 gimple_assign_rhs2_ptr (stmt),
1240 gimple_assign_rhs3_ptr (stmt));
1246 /* Free stmt_vec_info. */
1247 free_stmt_vec_info (stmt);
1248 gsi_next (&si);
1252 free (bbs);
1254 release_vec_loop_masks (&masks);
1255 delete ivexpr_map;
1257 loop->aux = NULL;
1260 /* Return an invariant or register for EXPR and emit necessary
1261 computations in the LOOP_VINFO loop preheader. */
1263 tree
1264 cse_and_gimplify_to_preheader (loop_vec_info loop_vinfo, tree expr)
1266 if (is_gimple_reg (expr)
1267 || is_gimple_min_invariant (expr))
1268 return expr;
1270 if (! loop_vinfo->ivexpr_map)
1271 loop_vinfo->ivexpr_map = new hash_map<tree_operand_hash, tree>;
1272 tree &cached = loop_vinfo->ivexpr_map->get_or_insert (expr);
1273 if (! cached)
1275 gimple_seq stmts = NULL;
1276 cached = force_gimple_operand (unshare_expr (expr),
1277 &stmts, true, NULL_TREE);
1278 if (stmts)
1280 edge e = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
1281 gsi_insert_seq_on_edge_immediate (e, stmts);
1284 return cached;
1287 /* Return true if we can use CMP_TYPE as the comparison type to produce
1288 all masks required to mask LOOP_VINFO. */
1290 static bool
1291 can_produce_all_loop_masks_p (loop_vec_info loop_vinfo, tree cmp_type)
1293 rgroup_masks *rgm;
1294 unsigned int i;
1295 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1296 if (rgm->mask_type != NULL_TREE
1297 && !direct_internal_fn_supported_p (IFN_WHILE_ULT,
1298 cmp_type, rgm->mask_type,
1299 OPTIMIZE_FOR_SPEED))
1300 return false;
1301 return true;
1304 /* Calculate the maximum number of scalars per iteration for every
1305 rgroup in LOOP_VINFO. */
1307 static unsigned int
1308 vect_get_max_nscalars_per_iter (loop_vec_info loop_vinfo)
1310 unsigned int res = 1;
1311 unsigned int i;
1312 rgroup_masks *rgm;
1313 FOR_EACH_VEC_ELT (LOOP_VINFO_MASKS (loop_vinfo), i, rgm)
1314 res = MAX (res, rgm->max_nscalars_per_iter);
1315 return res;
1318 /* Each statement in LOOP_VINFO can be masked where necessary. Check
1319 whether we can actually generate the masks required. Return true if so,
1320 storing the type of the scalar IV in LOOP_VINFO_MASK_COMPARE_TYPE. */
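/* Conceptually (an illustrative sketch, not the generated code), a
   fully-masked loop executes as

     for (i = 0; i < n; i += VF)
       {
         mask = WHILE_ULT (i, n);   (lane j is active iff i + j < n)
         ... loads, stores and live-outs predicated on "mask" ...
       }

   so the comparison type chosen here must be wide enough to represent the
   largest value the scalar IV can reach, scaled by the maximum number of
   scalars per iteration for replicated rgroup masks.  */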
1322 static bool
1323 vect_verify_full_masking (loop_vec_info loop_vinfo)
1325 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1326 unsigned int min_ni_width;
1328 /* Use a normal loop if there are no statements that need masking.
1329 This only happens in rare degenerate cases: it means that the loop
1330 has no loads, no stores, and no live-out values. */
1331 if (LOOP_VINFO_MASKS (loop_vinfo).is_empty ())
1332 return false;
1334 /* Get the maximum number of iterations that is representable
1335 in the counter type. */
1336 tree ni_type = TREE_TYPE (LOOP_VINFO_NITERSM1 (loop_vinfo));
1337 widest_int max_ni = wi::to_widest (TYPE_MAX_VALUE (ni_type)) + 1;
1339 /* Get a more refined estimate for the number of iterations. */
1340 widest_int max_back_edges;
1341 if (max_loop_iterations (loop, &max_back_edges))
1342 max_ni = wi::smin (max_ni, max_back_edges + 1);
1344 /* Account for rgroup masks, in which each bit is replicated N times. */
1345 max_ni *= vect_get_max_nscalars_per_iter (loop_vinfo);
1347 /* Work out how many bits we need to represent the limit. */
1348 min_ni_width = wi::min_precision (max_ni, UNSIGNED);
1350 /* Find a scalar mode for which WHILE_ULT is supported. */
1351 opt_scalar_int_mode cmp_mode_iter;
1352 tree cmp_type = NULL_TREE;
1353 FOR_EACH_MODE_IN_CLASS (cmp_mode_iter, MODE_INT)
1355 unsigned int cmp_bits = GET_MODE_BITSIZE (cmp_mode_iter.require ());
1356 if (cmp_bits >= min_ni_width
1357 && targetm.scalar_mode_supported_p (cmp_mode_iter.require ()))
1359 tree this_type = build_nonstandard_integer_type (cmp_bits, true);
1360 if (this_type
1361 && can_produce_all_loop_masks_p (loop_vinfo, this_type))
1363 /* Although we could stop as soon as we find a valid mode,
1364 it's often better to continue until we hit Pmode, since the
1365 operands to the WHILE are more likely to be reusable in
1366 address calculations. */
1367 cmp_type = this_type;
1368 if (cmp_bits >= GET_MODE_BITSIZE (Pmode))
1369 break;
1374 if (!cmp_type)
1375 return false;
1377 LOOP_VINFO_MASK_COMPARE_TYPE (loop_vinfo) = cmp_type;
1378 return true;
1381 /* Calculate the cost of one scalar iteration of the loop. */
1382 static void
1383 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1385 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1386 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1387 int nbbs = loop->num_nodes, factor;
1388 int innerloop_iters, i;
1390 /* Gather costs for statements in the scalar loop. */
1392 /* FORNOW. */
1393 innerloop_iters = 1;
1394 if (loop->inner)
1395 innerloop_iters = 50; /* FIXME */
1397 for (i = 0; i < nbbs; i++)
1399 gimple_stmt_iterator si;
1400 basic_block bb = bbs[i];
1402 if (bb->loop_father == loop->inner)
1403 factor = innerloop_iters;
1404 else
1405 factor = 1;
1407 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1409 gimple *stmt = gsi_stmt (si);
1410 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1412 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1413 continue;
1415 /* Skip stmts that are not vectorized inside the loop. */
1416 if (stmt_info
1417 && !STMT_VINFO_RELEVANT_P (stmt_info)
1418 && (!STMT_VINFO_LIVE_P (stmt_info)
1419 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1420 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1421 continue;
1423 vect_cost_for_stmt kind;
1424 if (STMT_VINFO_DATA_REF (stmt_info))
1426 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1427 kind = scalar_load;
1428 else
1429 kind = scalar_store;
1431 else
1432 kind = scalar_stmt;
1434 record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1435 factor, kind, stmt_info, 0, vect_prologue);
1439 /* Now accumulate cost. */
1440 void *target_cost_data = init_cost (loop);
1441 stmt_info_for_cost *si;
1442 int j;
1443 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1444 j, si)
1446 struct _stmt_vec_info *stmt_info
1447 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
1448 (void) add_stmt_cost (target_cost_data, si->count,
1449 si->kind, stmt_info, si->misalign,
1450 vect_body);
1452 unsigned dummy, body_cost = 0;
1453 finish_cost (target_cost_data, &dummy, &body_cost, &dummy);
1454 destroy_cost_data (target_cost_data);
1455 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = body_cost;
1459 /* Function vect_analyze_loop_form_1.
1461 Verify that certain CFG restrictions hold, including:
1462 - the loop has a pre-header
1463 - the loop has a single entry and exit
1464 - the loop exit condition is simple enough
1465 - the number of iterations can be analyzed, i.e., a countable loop. The
1466 niter could be analyzed under some assumptions. */
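/* For instance (illustrative), these restrictions admit a simple counted
   loop such as

     for (i = 0; i < n; i++)
       a[i] = b[i];

   which has a pre-header, a single exit and a computable iteration count,
   but reject loops with additional control flow in the body or with
   multiple exits.  */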
1468 bool
1469 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1470 tree *assumptions, tree *number_of_iterationsm1,
1471 tree *number_of_iterations, gcond **inner_loop_cond)
1473 if (dump_enabled_p ())
1474 dump_printf_loc (MSG_NOTE, vect_location,
1475 "=== vect_analyze_loop_form ===\n");
1477 /* Different restrictions apply when we are considering an inner-most loop,
1478 vs. an outer (nested) loop.
1479 (FORNOW. May want to relax some of these restrictions in the future). */
1481 if (!loop->inner)
1483 /* Inner-most loop. We currently require that the number of BBs is
1484 exactly 2 (the header and latch). Vectorizable inner-most loops
1485 look like this:
1487 (pre-header)
1489 header <--------+
1490 | | |
1491 | +--> latch --+
1493 (exit-bb) */
1495 if (loop->num_nodes != 2)
1497 if (dump_enabled_p ())
1498 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1499 "not vectorized: control flow in loop.\n");
1500 return false;
1503 if (empty_block_p (loop->header))
1505 if (dump_enabled_p ())
1506 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1507 "not vectorized: empty loop.\n");
1508 return false;
1511 else
1513 struct loop *innerloop = loop->inner;
1514 edge entryedge;
1516 /* Nested loop. We currently require that the loop is doubly-nested,
1517 contains a single inner loop, and the number of BBs is exactly 5.
1518 Vectorizable outer-loops look like this:
1520 (pre-header)
1522 header <---+
1524 inner-loop |
1526 tail ------+
1528 (exit-bb)
1530 The inner-loop has the properties expected of inner-most loops
1531 as described above. */
1533 if ((loop->inner)->inner || (loop->inner)->next)
1535 if (dump_enabled_p ())
1536 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537 "not vectorized: multiple nested loops.\n");
1538 return false;
1541 if (loop->num_nodes != 5)
1543 if (dump_enabled_p ())
1544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545 "not vectorized: control flow in loop.\n");
1546 return false;
1549 entryedge = loop_preheader_edge (innerloop);
1550 if (entryedge->src != loop->header
1551 || !single_exit (innerloop)
1552 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1554 if (dump_enabled_p ())
1555 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1556 "not vectorized: unsupported outerloop form.\n");
1557 return false;
1560 /* Analyze the inner-loop. */
1561 tree inner_niterm1, inner_niter, inner_assumptions;
1562 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1563 &inner_assumptions, &inner_niterm1,
1564 &inner_niter, NULL)
1565 /* Don't support analyzing niter under assumptions for inner
1566 loop. */
1567 || !integer_onep (inner_assumptions))
1569 if (dump_enabled_p ())
1570 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1571 "not vectorized: Bad inner loop.\n");
1572 return false;
1575 if (!expr_invariant_in_loop_p (loop, inner_niter))
1577 if (dump_enabled_p ())
1578 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1579 "not vectorized: inner-loop count not"
1580 " invariant.\n");
1581 return false;
1584 if (dump_enabled_p ())
1585 dump_printf_loc (MSG_NOTE, vect_location,
1586 "Considering outer-loop vectorization.\n");
1589 if (!single_exit (loop)
1590 || EDGE_COUNT (loop->header->preds) != 2)
1592 if (dump_enabled_p ())
1594 if (!single_exit (loop))
1595 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1596 "not vectorized: multiple exits.\n");
1597 else if (EDGE_COUNT (loop->header->preds) != 2)
1598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1599 "not vectorized: too many incoming edges.\n");
1601 return false;
1604 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1605 that the loop is represented as a do-while (with a proper if-guard
1606 before the loop if needed), where the loop header contains all the
1607 executable statements, and the latch is empty. */
1608 if (!empty_block_p (loop->latch)
1609 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1611 if (dump_enabled_p ())
1612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1613 "not vectorized: latch block not empty.\n");
1614 return false;
1617 /* Make sure the exit is not abnormal. */
1618 edge e = single_exit (loop);
1619 if (e->flags & EDGE_ABNORMAL)
1621 if (dump_enabled_p ())
1622 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1623 "not vectorized: abnormal loop exit edge.\n");
1624 return false;
1627 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1628 number_of_iterationsm1);
1629 if (!*loop_cond)
1631 if (dump_enabled_p ())
1632 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1633 "not vectorized: complicated exit condition.\n");
1634 return false;
1637 if (integer_zerop (*assumptions)
1638 || !*number_of_iterations
1639 || chrec_contains_undetermined (*number_of_iterations))
1641 if (dump_enabled_p ())
1642 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1643 "not vectorized: number of iterations cannot be "
1644 "computed.\n");
1645 return false;
1648 if (integer_zerop (*number_of_iterations))
1650 if (dump_enabled_p ())
1651 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1652 "not vectorized: number of iterations = 0.\n");
1653 return false;
1656 return true;
1659 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1661 loop_vec_info
1662 vect_analyze_loop_form (struct loop *loop)
1664 tree assumptions, number_of_iterations, number_of_iterationsm1;
1665 gcond *loop_cond, *inner_loop_cond = NULL;
1667 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1668 &assumptions, &number_of_iterationsm1,
1669 &number_of_iterations, &inner_loop_cond))
1670 return NULL;
1672 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1673 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1674 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1675 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1676 if (!integer_onep (assumptions))
1678 /* We consider vectorizing this loop by versioning it under
1679 some assumptions. In order to do this, we need to clear
1680 existing information computed by scev and niter analyzer. */
1681 scev_reset_htab ();
1682 free_numbers_of_iterations_estimates (loop);
1683 /* Also set a flag for this loop so that the following scev and niter
1684 analyses are done under the assumptions.
1685 loop_constraint_set (loop, LOOP_C_FINITE);
1686 /* Also record the assumptions for versioning. */
1687 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1690 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1692 if (dump_enabled_p ())
1694 dump_printf_loc (MSG_NOTE, vect_location,
1695 "Symbolic number of iterations is ");
1696 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1697 dump_printf (MSG_NOTE, "\n");
1701 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1702 if (inner_loop_cond)
1703 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1704 = loop_exit_ctrl_vec_info_type;
1706 gcc_assert (!loop->aux);
1707 loop->aux = loop_vinfo;
1708 return loop_vinfo;
1713 /* Scan the loop stmts and, depending on whether there are any non-SLP
1714 statements, update the vectorization factor. */
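/* For example (illustrative): if loop-based analysis chose a
   vectorization factor of 4 and the SLP instances require an unrolling
   factor of 2, the updated factor is their least common multiple, 4.
   If instead every relevant statement is covered by SLP, the SLP
   unrolling factor is used directly.  */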
1716 static void
1717 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1719 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1720 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1721 int nbbs = loop->num_nodes;
1722 poly_uint64 vectorization_factor;
1723 int i;
1725 if (dump_enabled_p ())
1726 dump_printf_loc (MSG_NOTE, vect_location,
1727 "=== vect_update_vf_for_slp ===\n");
1729 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1730 gcc_assert (known_ne (vectorization_factor, 0U));
1732 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1733 the vectorization factor of the loop is the unrolling factor required by
1734 the SLP instances. If that unrolling factor is 1, we say that we
1735 perform pure SLP on the loop; cross-iteration parallelism is not
1736 exploited. */
1737 bool only_slp_in_loop = true;
1738 for (i = 0; i < nbbs; i++)
1740 basic_block bb = bbs[i];
1741 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1742 gsi_next (&si))
1744 gimple *stmt = gsi_stmt (si);
1745 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1746 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1747 && STMT_VINFO_RELATED_STMT (stmt_info))
1749 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1750 stmt_info = vinfo_for_stmt (stmt);
1752 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1753 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1754 && !PURE_SLP_STMT (stmt_info))
1755 /* STMT needs both SLP and loop-based vectorization. */
1756 only_slp_in_loop = false;
1760 if (only_slp_in_loop)
1762 dump_printf_loc (MSG_NOTE, vect_location,
1763 "Loop contains only SLP stmts\n");
1764 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1766 else
1768 dump_printf_loc (MSG_NOTE, vect_location,
1769 "Loop contains SLP and non-SLP stmts\n");
1770 /* Both the vectorization factor and unroll factor have the form
1771 current_vector_size * X for some rational X, so they must have
1772 a common multiple. */
1773 vectorization_factor
1774 = force_common_multiple (vectorization_factor,
1775 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1778 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1779 if (dump_enabled_p ())
1781 dump_printf_loc (MSG_NOTE, vect_location,
1782 "Updating vectorization factor to ");
1783 dump_dec (MSG_NOTE, vectorization_factor);
1784 dump_printf (MSG_NOTE, ".\n");
1788 /* Return true if STMT_INFO describes a double reduction phi and if
1789 the other phi in the reduction is also relevant for vectorization.
1790 This rejects cases such as:
1792 outer1:
1793 x_1 = PHI <x_3(outer2), ...>;
1796 inner:
1797 x_2 = ...;
1800 outer2:
1801 x_3 = PHI <x_2(inner)>;
1803 if nothing in x_2 or elsewhere makes x_1 relevant. */
1805 static bool
1806 vect_active_double_reduction_p (stmt_vec_info stmt_info)
1808 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_double_reduction_def)
1809 return false;
1811 gimple *other_phi = STMT_VINFO_REDUC_DEF (stmt_info);
1812 return STMT_VINFO_RELEVANT_P (vinfo_for_stmt (other_phi));
1815 /* Function vect_analyze_loop_operations.
1817 Scan the loop stmts and make sure they are all vectorizable. */
1819 static bool
1820 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1822 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1823 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1824 int nbbs = loop->num_nodes;
1825 int i;
1826 stmt_vec_info stmt_info;
1827 bool need_to_vectorize = false;
1828 bool ok;
1830 if (dump_enabled_p ())
1831 dump_printf_loc (MSG_NOTE, vect_location,
1832 "=== vect_analyze_loop_operations ===\n");
1834 for (i = 0; i < nbbs; i++)
1836 basic_block bb = bbs[i];
1838 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1839 gsi_next (&si))
1841 gphi *phi = si.phi ();
1842 ok = true;
1844 stmt_info = vinfo_for_stmt (phi);
1845 if (dump_enabled_p ())
1847 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1848 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1850 if (virtual_operand_p (gimple_phi_result (phi)))
1851 continue;
1853 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1854 (i.e., a phi in the tail of the outer-loop). */
1855 if (! is_loop_header_bb_p (bb))
1857 /* FORNOW: we currently don't support the case that these phis
1858 are not used in the outerloop (unless it is double reduction,
1859 i.e., this phi is vect_reduction_def), because this case
1860 requires us to actually do something here.
1861 if (STMT_VINFO_LIVE_P (stmt_info)
1862 && !vect_active_double_reduction_p (stmt_info))
1864 if (dump_enabled_p ())
1865 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1866 "Unsupported loop-closed phi in "
1867 "outer-loop.\n");
1868 return false;
1871 /* If PHI is used in the outer loop, we check that its operand
1872 is defined in the inner loop. */
1873 if (STMT_VINFO_RELEVANT_P (stmt_info))
1875 tree phi_op;
1876 gimple *op_def_stmt;
1878 if (gimple_phi_num_args (phi) != 1)
1879 return false;
1881 phi_op = PHI_ARG_DEF (phi, 0);
1882 if (TREE_CODE (phi_op) != SSA_NAME)
1883 return false;
1885 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1886 if (gimple_nop_p (op_def_stmt)
1887 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1888 || !vinfo_for_stmt (op_def_stmt))
1889 return false;
1891 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1892 != vect_used_in_outer
1893 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1894 != vect_used_in_outer_by_reduction)
1895 return false;
1898 continue;
1901 gcc_assert (stmt_info);
1903 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1904 || STMT_VINFO_LIVE_P (stmt_info))
1905 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1907 /* A scalar-dependence cycle that we don't support. */
1908 if (dump_enabled_p ())
1909 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1910 "not vectorized: scalar dependence cycle.\n");
1911 return false;
1914 if (STMT_VINFO_RELEVANT_P (stmt_info))
1916 need_to_vectorize = true;
1917 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1918 && ! PURE_SLP_STMT (stmt_info))
1919 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1920 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1921 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1922 && ! PURE_SLP_STMT (stmt_info))
1923 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1926 /* SLP PHIs are tested by vect_slp_analyze_node_operations. */
1927 if (ok
1928 && STMT_VINFO_LIVE_P (stmt_info)
1929 && !PURE_SLP_STMT (stmt_info))
1930 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1932 if (!ok)
1934 if (dump_enabled_p ())
1936 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1937 "not vectorized: relevant phi not "
1938 "supported: ");
1939 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1941 return false;
1945 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1946 gsi_next (&si))
1948 gimple *stmt = gsi_stmt (si);
1949 if (!gimple_clobber_p (stmt)
1950 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1951 return false;
1953 } /* bbs */
1955 /* All operations in the loop are either irrelevant (they deal with loop
1956 control, or are dead), or only used outside the loop and can be moved
1957 out of the loop (e.g. invariants, inductions). The loop can be
1958 optimized away by scalar optimizations. We're better off not
1959 touching this loop. */
1960 if (!need_to_vectorize)
1962 if (dump_enabled_p ())
1963 dump_printf_loc (MSG_NOTE, vect_location,
1964 "All the computation can be taken out of the loop.\n");
1965 if (dump_enabled_p ())
1966 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1967 "not vectorized: redundant loop. no profit to "
1968 "vectorize.\n");
1969 return false;
1972 return true;
1975 /* Analyze the cost of the loop described by LOOP_VINFO. Decide if it
1976 is worthwhile to vectorize. Return 1 if definitely yes, 0 if
1977 definitely no, or -1 if it's worth retrying. */
1979 static int
1980 vect_analyze_loop_costing (loop_vec_info loop_vinfo)
1982 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1983 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
1985 /* Only fully-masked loops can have iteration counts less than the
1986 vectorization factor. */
1987 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
1989 HOST_WIDE_INT max_niter;
1991 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1992 max_niter = LOOP_VINFO_INT_NITERS (loop_vinfo);
1993 else
1994 max_niter = max_stmt_executions_int (loop);
1996 if (max_niter != -1
1997 && (unsigned HOST_WIDE_INT) max_niter < assumed_vf)
1999 if (dump_enabled_p ())
2000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2001 "not vectorized: iteration count smaller than "
2002 "vectorization factor.\n");
2003 return 0;
2007 int min_profitable_iters, min_profitable_estimate;
2008 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2009 &min_profitable_estimate);
2011 if (min_profitable_iters < 0)
2013 if (dump_enabled_p ())
2014 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2015 "not vectorized: vectorization not profitable.\n");
2016 if (dump_enabled_p ())
2017 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018 "not vectorized: vector version will never be "
2019 "profitable.\n");
2020 return -1;
2023 int min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2024 * assumed_vf);
2026 /* Use the cost model only if it is more conservative than the
2027 user-specified threshold. */
2028 unsigned int th = (unsigned) MAX (min_scalar_loop_bound,
2029 min_profitable_iters);
2031 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
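/* Worked example (illustrative numbers only): with
   --param min-vect-loop-bound=2 and an assumed vectorization factor of 4,
   min_scalar_loop_bound is 2 * 4 = 8; if the cost model computed
   min_profitable_iters == 5, the threshold used below is
   MAX (8, 5) == 8 iterations.  */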
2033 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2034 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2036 if (dump_enabled_p ())
2037 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2038 "not vectorized: vectorization not profitable.\n");
2039 if (dump_enabled_p ())
2040 dump_printf_loc (MSG_NOTE, vect_location,
2041 "not vectorized: iteration count smaller than user "
2042 "specified loop bound parameter or minimum profitable "
2043 "iterations (whichever is more conservative).\n");
2044 return 0;
2047 HOST_WIDE_INT estimated_niter = estimated_stmt_executions_int (loop);
2048 if (estimated_niter == -1)
2049 estimated_niter = likely_max_stmt_executions_int (loop);
2050 if (estimated_niter != -1
2051 && ((unsigned HOST_WIDE_INT) estimated_niter
2052 < MAX (th, (unsigned) min_profitable_estimate)))
2054 if (dump_enabled_p ())
2055 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2056 "not vectorized: estimated iteration count too "
2057 "small.\n");
2058 if (dump_enabled_p ())
2059 dump_printf_loc (MSG_NOTE, vect_location,
2060 "not vectorized: estimated iteration count smaller "
2061 "than specified loop bound parameter or minimum "
2062 "profitable iterations (whichever is more "
2063 "conservative).\n");
2064 return -1;
2067 return 1;
2071 /* Function vect_analyze_loop_2.
2073 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2074 for it. The different analyses will record information in the
2075 loop_vec_info struct. */
2076 static bool
2077 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
2079 bool ok;
2080 int res;
2081 unsigned int max_vf = MAX_VECTORIZATION_FACTOR;
2082 poly_uint64 min_vf = 2;
2083 unsigned int n_stmts = 0;
2085 /* The first group of checks is independent of the vector size. */
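/* Failures in this first group are treated as fatal: if FATAL is still
   set when we return, the caller (vect_analyze_loop) gives up instead of
   retrying the analysis with a different vector size.  */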
2086 fatal = true;
2088 /* Find all data references in the loop (which correspond to vdefs/vuses)
2089 and analyze their evolution in the loop. */
2091 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2093 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
2094 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
2096 if (dump_enabled_p ())
2097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098 "not vectorized: loop nest containing two "
2099 "or more consecutive inner loops cannot be "
2100 "vectorized\n");
2101 return false;
2104 for (unsigned i = 0; i < loop->num_nodes; i++)
2105 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
2106 !gsi_end_p (gsi); gsi_next (&gsi))
2108 gimple *stmt = gsi_stmt (gsi);
2109 if (is_gimple_debug (stmt))
2110 continue;
2111 ++n_stmts;
2112 if (!find_data_references_in_stmt (loop, stmt,
2113 &LOOP_VINFO_DATAREFS (loop_vinfo)))
2115 if (is_gimple_call (stmt) && loop->safelen)
2117 tree fndecl = gimple_call_fndecl (stmt), op;
2118 if (fndecl != NULL_TREE)
2120 cgraph_node *node = cgraph_node::get (fndecl);
2121 if (node != NULL && node->simd_clones != NULL)
2123 unsigned int j, n = gimple_call_num_args (stmt);
2124 for (j = 0; j < n; j++)
2126 op = gimple_call_arg (stmt, j);
2127 if (DECL_P (op)
2128 || (REFERENCE_CLASS_P (op)
2129 && get_base_address (op)))
2130 break;
2132 op = gimple_call_lhs (stmt);
2133 /* Ignore #pragma omp declare simd functions
2134 if they don't have data references in the
2135 call stmt itself. */
2136 if (j == n
2137 && !(op
2138 && (DECL_P (op)
2139 || (REFERENCE_CLASS_P (op)
2140 && get_base_address (op)))))
2141 continue;
2145 if (dump_enabled_p ())
2146 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2147 "not vectorized: loop contains function "
2148 "calls or data references that cannot "
2149 "be analyzed\n");
2150 return false;
2154 /* Analyze the data references and also adjust the minimal
2155 vectorization factor according to the loads and stores. */
2157 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
2158 if (!ok)
2160 if (dump_enabled_p ())
2161 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2162 "bad data references.\n");
2163 return false;
2166 /* Classify all cross-iteration scalar data-flow cycles.
2167 Cross-iteration cycles caused by virtual phis are analyzed separately. */
2168 vect_analyze_scalar_cycles (loop_vinfo);
2170 vect_pattern_recog (loop_vinfo);
2172 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
2174 /* Analyze the access patterns of the data-refs in the loop (consecutive,
2175 complex, etc.). FORNOW: Only handle consecutive access pattern. */
2177 ok = vect_analyze_data_ref_accesses (loop_vinfo);
2178 if (!ok)
2180 if (dump_enabled_p ())
2181 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2182 "bad data access.\n");
2183 return false;
2186 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
2188 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
2189 if (!ok)
2191 if (dump_enabled_p ())
2192 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2193 "unexpected pattern.\n");
2194 return false;
2197 /* The rest of the analysis below depends on the vector size in some way,
so from here on failures are no longer treated as fatal. */
2198 fatal = false;
2200 /* Analyze data dependences between the data-refs in the loop
2201 and adjust the maximum vectorization factor according to
2202 the dependences.
2203 FORNOW: fail at the first data dependence that we encounter. */
2205 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
2206 if (!ok
2207 || (max_vf != MAX_VECTORIZATION_FACTOR
2208 && maybe_lt (max_vf, min_vf)))
2210 if (dump_enabled_p ())
2211 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2212 "bad data dependence.\n");
2213 return false;
2215 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
2217 ok = vect_determine_vectorization_factor (loop_vinfo);
2218 if (!ok)
2220 if (dump_enabled_p ())
2221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2222 "can't determine vectorization factor.\n");
2223 return false;
2225 if (max_vf != MAX_VECTORIZATION_FACTOR
2226 && maybe_lt (max_vf, LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2228 if (dump_enabled_p ())
2229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230 "bad data dependence.\n");
2231 return false;
2234 /* Compute the scalar iteration cost. */
2235 vect_compute_single_scalar_iteration_cost (loop_vinfo);
2237 poly_uint64 saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2238 unsigned th;
2240 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
2241 ok = vect_analyze_slp (loop_vinfo, n_stmts);
2242 if (!ok)
2243 return false;
2245 /* If there are any SLP instances mark them as pure_slp. */
2246 bool slp = vect_make_slp_decision (loop_vinfo);
2247 if (slp)
2249 /* Find stmts that need to be both vectorized and SLPed. */
2250 vect_detect_hybrid_slp (loop_vinfo);
2252 /* Update the vectorization factor based on the SLP decision. */
2253 vect_update_vf_for_slp (loop_vinfo);
2256 bool saved_can_fully_mask_p = LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo);
2258 /* We don't expect to have to roll back to anything other than an empty
2259 set of rgroups. */
2260 gcc_assert (LOOP_VINFO_MASKS (loop_vinfo).is_empty ());
2262 /* This is the point where we can re-start analysis with SLP forced off. */
2263 start_over:
2265 /* Now the vectorization factor is final. */
2266 poly_uint64 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2267 gcc_assert (known_ne (vectorization_factor, 0U));
2269 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2271 dump_printf_loc (MSG_NOTE, vect_location,
2272 "vectorization_factor = ");
2273 dump_dec (MSG_NOTE, vectorization_factor);
2274 dump_printf (MSG_NOTE, ", niters = " HOST_WIDE_INT_PRINT_DEC "\n",
2275 LOOP_VINFO_INT_NITERS (loop_vinfo));
2278 HOST_WIDE_INT max_niter
2279 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2281 /* Analyze the alignment of the data-refs in the loop.
2282 Fail if a data reference is found that cannot be vectorized. */
2284 ok = vect_analyze_data_refs_alignment (loop_vinfo);
2285 if (!ok)
2287 if (dump_enabled_p ())
2288 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2289 "bad data alignment.\n");
2290 return false;
2293 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2294 It is important to call pruning after vect_analyze_data_ref_accesses,
2295 since we use grouping information gathered by interleaving analysis. */
2296 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2297 if (!ok)
2298 return false;
2300 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2301 vectorization. */
2302 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2304 /* This pass will decide on using loop versioning and/or loop peeling in
2305 order to enhance the alignment of data references in the loop. */
2306 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2307 if (!ok)
2309 if (dump_enabled_p ())
2310 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2311 "bad data alignment.\n");
2312 return false;
2316 if (slp)
2318 /* Analyze operations in the SLP instances. Note this may
2319 remove unsupported SLP instances which makes the above
2320 SLP kind detection invalid. */
2321 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2322 vect_slp_analyze_operations (loop_vinfo);
2323 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2324 goto again;
2327 /* Scan all the remaining operations in the loop that are not subject
2328 to SLP and make sure they are vectorizable. */
2329 ok = vect_analyze_loop_operations (loop_vinfo);
2330 if (!ok)
2332 if (dump_enabled_p ())
2333 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2334 "bad operation or unsupported loop bound.\n");
2335 return false;
2338 /* Decide whether to use a fully-masked loop for this vectorization
2339 factor. */
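/* In a fully-masked loop every vector statement is predicated on a loop
   mask, so the vector loop can handle iteration counts that are smaller
   than (or not a multiple of) the vectorization factor without a scalar
   epilogue.  */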
2340 LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
2341 = (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo)
2342 && vect_verify_full_masking (loop_vinfo));
2343 if (dump_enabled_p ())
2345 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2346 dump_printf_loc (MSG_NOTE, vect_location,
2347 "using a fully-masked loop.\n");
2348 else
2349 dump_printf_loc (MSG_NOTE, vect_location,
2350 "not using a fully-masked loop.\n");
2353 /* If an epilogue loop is required because of data accesses with gaps,
2354 one additional iteration needs to be peeled. Check if there are
2355 enough iterations for vectorization. */
2356 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2357 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2358 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2360 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2361 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2363 if (known_lt (wi::to_widest (scalar_niters), vf))
2365 if (dump_enabled_p ())
2366 dump_printf_loc (MSG_NOTE, vect_location,
2367 "loop has no enough iterations to support"
2368 " peeling for gaps.\n");
2369 return false;
2373 /* Check the costings of the loop make vectorizing worthwhile. */
2374 res = vect_analyze_loop_costing (loop_vinfo);
2375 if (res < 0)
2376 goto again;
2377 if (!res)
2379 if (dump_enabled_p ())
2380 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381 "Loop costings not worthwhile.\n");
2382 return false;
2385 /* Decide whether we need to create an epilogue loop to handle
2386 remaining scalar iterations. */
2387 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
2389 unsigned HOST_WIDE_INT const_vf;
2390 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2391 /* The main loop handles all iterations. */
2392 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2393 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2394 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2396 if (!multiple_p (LOOP_VINFO_INT_NITERS (loop_vinfo)
2397 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo),
2398 LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2399 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
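/* In the test below, tree_ctz gives the number of low-order bits of the
   iteration count that are known to be zero, so comparing it with
   exact_log2 of the constant VF checks whether the count is known to be
   a multiple of the (power-of-two) vectorization factor; if it is not,
   an epilogue may be needed for the leftover iterations.  */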
2401 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2402 || !LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&const_vf)
2403 || ((tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2404 < (unsigned) exact_log2 (const_vf))
2405 /* In case of versioning, check if the maximum number of
2406 iterations is greater than th. If they are identical,
2407 the epilogue is unnecessary. */
2408 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2409 || ((unsigned HOST_WIDE_INT) max_niter
2410 > (th / const_vf) * const_vf))))
2411 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2413 /* If an epilogue loop is required make sure we can create one. */
2414 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2415 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2417 if (dump_enabled_p ())
2418 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2419 if (!vect_can_advance_ivs_p (loop_vinfo)
2420 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2421 single_exit (LOOP_VINFO_LOOP
2422 (loop_vinfo))))
2424 if (dump_enabled_p ())
2425 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2426 "not vectorized: can't create required "
2427 "epilog loop\n");
2428 goto again;
2432 /* During peeling, we need to check whether the number of loop iterations
2433 is enough for both the peeled prologue loop and the vector loop. This
2434 check can be merged with the threshold check of loop versioning, so
2435 increase the threshold for this case if necessary. */
2436 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
2438 poly_uint64 niters_th = 0;
2440 if (!vect_use_loop_mask_for_alignment_p (loop_vinfo))
2442 /* Niters for peeled prolog loop. */
2443 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
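/* A negative value means the number of prologue iterations is not
   known at compile time; assume the worst case of one full vector's
   worth of iterations minus one.  */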
2445 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2446 tree vectype
2447 = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2448 niters_th += TYPE_VECTOR_SUBPARTS (vectype) - 1;
2450 else
2451 niters_th += LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2454 /* Niters for at least one iteration of vectorized loop. */
2455 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
2456 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2457 /* One additional iteration because of peeling for gap. */
2458 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2459 niters_th += 1;
2460 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = niters_th;
2463 gcc_assert (known_eq (vectorization_factor,
2464 LOOP_VINFO_VECT_FACTOR (loop_vinfo)));
2466 /* Ok to vectorize! */
2467 return true;
2469 again:
2470 /* Try again with SLP forced off, but if we didn't do any SLP there is
2471 no point in re-trying. */
2472 if (!slp)
2473 return false;
2475 /* If there are reduction chains re-trying will fail anyway. */
2476 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2477 return false;
2479 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2480 via interleaving or lane instructions. */
2481 slp_instance instance;
2482 slp_tree node;
2483 unsigned i, j;
2484 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2486 stmt_vec_info vinfo;
2487 vinfo = vinfo_for_stmt
2488 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2489 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2490 continue;
2491 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2492 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2493 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2494 if (! vect_store_lanes_supported (vectype, size, false)
2495 && ! known_eq (TYPE_VECTOR_SUBPARTS (vectype), 1U)
2496 && ! vect_grouped_store_supported (vectype, size))
2497 return false;
2498 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2500 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2501 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2502 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2503 size = STMT_VINFO_GROUP_SIZE (vinfo);
2504 vectype = STMT_VINFO_VECTYPE (vinfo);
2505 if (! vect_load_lanes_supported (vectype, size, false)
2506 && ! vect_grouped_load_supported (vectype, single_element_p,
2507 size))
2508 return false;
2512 if (dump_enabled_p ())
2513 dump_printf_loc (MSG_NOTE, vect_location,
2514 "re-trying with SLP disabled\n");
2516 /* Roll back state appropriately. No SLP this time. */
2517 slp = false;
2518 /* Restore the vectorization factor as it was without SLP. */
2519 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2520 /* Free the SLP instances. */
2521 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2522 vect_free_slp_instance (instance);
2523 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2524 /* Reset SLP type to loop_vect on all stmts. */
2525 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2527 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2528 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2529 !gsi_end_p (si); gsi_next (&si))
2531 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2532 STMT_SLP_TYPE (stmt_info) = loop_vect;
2534 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2535 !gsi_end_p (si); gsi_next (&si))
2537 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2538 STMT_SLP_TYPE (stmt_info) = loop_vect;
2539 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2541 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2542 STMT_SLP_TYPE (stmt_info) = loop_vect;
2543 for (gimple_stmt_iterator pi
2544 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2545 !gsi_end_p (pi); gsi_next (&pi))
2547 gimple *pstmt = gsi_stmt (pi);
2548 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2553 /* Free optimized alias test DDRS. */
2554 LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).truncate (0);
2555 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2556 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2557 /* Reset target cost data. */
2558 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2559 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2560 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2561 /* Reset accumulated rgroup information. */
2562 release_vec_loop_masks (&LOOP_VINFO_MASKS (loop_vinfo));
2563 /* Reset assorted flags. */
2564 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2565 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2566 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2567 LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo) = 0;
2568 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = saved_can_fully_mask_p;
2570 goto start_over;
2573 /* Function vect_analyze_loop.
2575 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2576 for it. The different analyses will record information in the
2577 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, LOOP is the
2578 epilogue of the loop described by ORIG_LOOP_VINFO and must be vectorized. */
2579 loop_vec_info
2580 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2582 loop_vec_info loop_vinfo;
2583 auto_vector_sizes vector_sizes;
2585 /* Autodetect first vector size we try. */
2586 current_vector_size = 0;
2587 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
2588 unsigned int next_size = 0;
2590 if (dump_enabled_p ())
2591 dump_printf_loc (MSG_NOTE, vect_location,
2592 "===== analyze_loop_nest =====\n");
2594 if (loop_outer (loop)
2595 && loop_vec_info_for_loop (loop_outer (loop))
2596 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2598 if (dump_enabled_p ())
2599 dump_printf_loc (MSG_NOTE, vect_location,
2600 "outer-loop already vectorized.\n");
2601 return NULL;
2604 poly_uint64 autodetected_vector_size = 0;
2605 while (1)
2607 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2608 loop_vinfo = vect_analyze_loop_form (loop);
2609 if (!loop_vinfo)
2611 if (dump_enabled_p ())
2612 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2613 "bad loop form.\n");
2614 return NULL;
2617 bool fatal = false;
2619 if (orig_loop_vinfo)
2620 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2622 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2624 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2626 return loop_vinfo;
2629 delete loop_vinfo;
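/* Remember the vector size autodetected on the first attempt and skip
   it when walking the target's list of candidate sizes below, so that
   each size is analyzed at most once.  */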
2631 if (next_size == 0)
2632 autodetected_vector_size = current_vector_size;
2634 if (next_size < vector_sizes.length ()
2635 && known_eq (vector_sizes[next_size], autodetected_vector_size))
2636 next_size += 1;
2638 if (fatal
2639 || next_size == vector_sizes.length ()
2640 || known_eq (current_vector_size, 0U))
2641 return NULL;
2643 /* Try the next biggest vector size. */
2644 current_vector_size = vector_sizes[next_size++];
2645 if (dump_enabled_p ())
2647 dump_printf_loc (MSG_NOTE, vect_location,
2648 "***** Re-trying analysis with "
2649 "vector size ");
2650 dump_dec (MSG_NOTE, current_vector_size);
2651 dump_printf (MSG_NOTE, "\n");
2656 /* Return true if there is an in-order reduction function for CODE, storing
2657 it in *REDUC_FN if so. */
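/* An in-order (fold-left) reduction accumulates the elements strictly in
   their original order, e.g. ((((init + x0) + x1) + x2) + ...), which is
   what IFN_FOLD_LEFT_PLUS provides; this matters for floating-point
   additions, where reassociation may change the rounded result.  */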
2659 static bool
2660 fold_left_reduction_fn (tree_code code, internal_fn *reduc_fn)
2662 switch (code)
2664 case PLUS_EXPR:
2665 *reduc_fn = IFN_FOLD_LEFT_PLUS;
2666 return true;
2668 default:
2669 return false;
2673 /* Function reduction_fn_for_scalar_code
2675 Input:
2676 CODE - tree_code of a reduction operation.
2678 Output:
2679 REDUC_FN - the corresponding internal function to be used to reduce the
2680 vector of partial results into a single scalar result, or IFN_LAST
2681 if the operation is a supported reduction operation, but does not have
2682 such an internal function.
2684 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2686 static bool
2687 reduction_fn_for_scalar_code (enum tree_code code, internal_fn *reduc_fn)
2689 switch (code)
2691 case MAX_EXPR:
2692 *reduc_fn = IFN_REDUC_MAX;
2693 return true;
2695 case MIN_EXPR:
2696 *reduc_fn = IFN_REDUC_MIN;
2697 return true;
2699 case PLUS_EXPR:
2700 *reduc_fn = IFN_REDUC_PLUS;
2701 return true;
2703 case BIT_AND_EXPR:
2704 *reduc_fn = IFN_REDUC_AND;
2705 return true;
2707 case BIT_IOR_EXPR:
2708 *reduc_fn = IFN_REDUC_IOR;
2709 return true;
2711 case BIT_XOR_EXPR:
2712 *reduc_fn = IFN_REDUC_XOR;
2713 return true;
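/* MULT_EXPR and MINUS_EXPR are supported reductions, but there is no
   single internal function that reduces the vector of partial results;
   IFN_LAST tells the caller that the final reduction has to be generated
   some other way (presumably open-coded in the reduction epilogue).  */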
2715 case MULT_EXPR:
2716 case MINUS_EXPR:
2717 *reduc_fn = IFN_LAST;
2718 return true;
2720 default:
2721 return false;
2725 /* If there is a neutral value X such that SLP reduction NODE would not
2726 be affected by the introduction of additional X elements, return that X,
2727 otherwise return null. CODE is the code of the reduction. REDUC_CHAIN
2728 is true if the SLP statements perform a single reduction, false if each
2729 statement performs an independent reduction. */
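/* For example, padding a PLUS_EXPR reduction group with zeros, or a
   MULT_EXPR group with ones, leaves the final reduction result
   unchanged.  */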
2731 static tree
2732 neutral_op_for_slp_reduction (slp_tree slp_node, tree_code code,
2733 bool reduc_chain)
2735 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
2736 gimple *stmt = stmts[0];
2737 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
2738 tree vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
2739 tree scalar_type = TREE_TYPE (vector_type);
2740 struct loop *loop = gimple_bb (stmt)->loop_father;
2741 gcc_assert (loop);
2743 switch (code)
2745 case WIDEN_SUM_EXPR:
2746 case DOT_PROD_EXPR:
2747 case SAD_EXPR:
2748 case PLUS_EXPR:
2749 case MINUS_EXPR:
2750 case BIT_IOR_EXPR:
2751 case BIT_XOR_EXPR:
2752 return build_zero_cst (scalar_type);
2754 case MULT_EXPR:
2755 return build_one_cst (scalar_type);
2757 case BIT_AND_EXPR:
2758 return build_all_ones_cst (scalar_type);
2760 case MAX_EXPR:
2761 case MIN_EXPR:
2762 /* For MIN/MAX the initial values are neutral. A reduction chain
2763 has only a single initial value, so that value is neutral for
2764 all statements. */
2765 if (reduc_chain)
2766 return PHI_ARG_DEF_FROM_EDGE (stmt, loop_preheader_edge (loop));
2767 return NULL_TREE;
2769 default:
2770 return NULL_TREE;
2774 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2775 STMT is printed with a message MSG. */
2777 static void
2778 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2780 dump_printf_loc (msg_type, vect_location, "%s", msg);
2781 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2785 /* Detect SLP reduction of the form:
2787 #a1 = phi <a5, a0>
2788 a2 = operation (a1)
2789 a3 = operation (a2)
2790 a4 = operation (a3)
2791 a5 = operation (a4)
2793 #a = phi <a5>
2795 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2796 FIRST_STMT is the first reduction stmt in the chain
2797 (a2 = operation (a1)).
2799 Return TRUE if a reduction chain was detected. */
2801 static bool
2802 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2803 gimple *first_stmt)
2805 struct loop *loop = (gimple_bb (phi))->loop_father;
2806 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2807 enum tree_code code;
2808 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2809 stmt_vec_info use_stmt_info, current_stmt_info;
2810 tree lhs;
2811 imm_use_iterator imm_iter;
2812 use_operand_p use_p;
2813 int nloop_uses, size = 0, n_out_of_loop_uses;
2814 bool found = false;
2816 if (loop != vect_loop)
2817 return false;
2819 lhs = PHI_RESULT (phi);
2820 code = gimple_assign_rhs_code (first_stmt);
2821 while (1)
2823 nloop_uses = 0;
2824 n_out_of_loop_uses = 0;
2825 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2827 gimple *use_stmt = USE_STMT (use_p);
2828 if (is_gimple_debug (use_stmt))
2829 continue;
2831 /* Check if we got back to the reduction phi. */
2832 if (use_stmt == phi)
2834 loop_use_stmt = use_stmt;
2835 found = true;
2836 break;
2839 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2841 loop_use_stmt = use_stmt;
2842 nloop_uses++;
2844 else
2845 n_out_of_loop_uses++;
2847 /* There can be either a single use in the loop or two uses in
2848 phi nodes. */
2849 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2850 return false;
2853 if (found)
2854 break;
2856 /* We reached a statement with no loop uses. */
2857 if (nloop_uses == 0)
2858 return false;
2860 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2861 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2862 return false;
2864 if (!is_gimple_assign (loop_use_stmt)
2865 || code != gimple_assign_rhs_code (loop_use_stmt)
2866 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2867 return false;
2869 /* Insert USE_STMT into reduction chain. */
2870 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2871 if (current_stmt)
2873 current_stmt_info = vinfo_for_stmt (current_stmt);
2874 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2875 GROUP_FIRST_ELEMENT (use_stmt_info)
2876 = GROUP_FIRST_ELEMENT (current_stmt_info);
2878 else
2879 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2881 lhs = gimple_assign_lhs (loop_use_stmt);
2882 current_stmt = loop_use_stmt;
2883 size++;
2886 if (!found || loop_use_stmt != phi || size < 2)
2887 return false;
2889 /* Swap the operands, if needed, to make the reduction operand be the second
2890 operand. */
2891 lhs = PHI_RESULT (phi);
2892 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2893 while (next_stmt)
2895 if (gimple_assign_rhs2 (next_stmt) == lhs)
2897 tree op = gimple_assign_rhs1 (next_stmt);
2898 gimple *def_stmt = NULL;
2900 if (TREE_CODE (op) == SSA_NAME)
2901 def_stmt = SSA_NAME_DEF_STMT (op);
2903 /* Check that the other def is either defined in the loop
2904 ("vect_internal_def"), or it's an induction (defined by a
2905 loop-header phi-node). */
2906 if (def_stmt
2907 && gimple_bb (def_stmt)
2908 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2909 && (is_gimple_assign (def_stmt)
2910 || is_gimple_call (def_stmt)
2911 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2912 == vect_induction_def
2913 || (gimple_code (def_stmt) == GIMPLE_PHI
2914 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2915 == vect_internal_def
2916 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2918 lhs = gimple_assign_lhs (next_stmt);
2919 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2920 continue;
2923 return false;
2925 else
2927 tree op = gimple_assign_rhs2 (next_stmt);
2928 gimple *def_stmt = NULL;
2930 if (TREE_CODE (op) == SSA_NAME)
2931 def_stmt = SSA_NAME_DEF_STMT (op);
2933 /* Check that the other def is either defined in the loop
2934 ("vect_internal_def"), or it's an induction (defined by a
2935 loop-header phi-node). */
2936 if (def_stmt
2937 && gimple_bb (def_stmt)
2938 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2939 && (is_gimple_assign (def_stmt)
2940 || is_gimple_call (def_stmt)
2941 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2942 == vect_induction_def
2943 || (gimple_code (def_stmt) == GIMPLE_PHI
2944 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2945 == vect_internal_def
2946 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2948 if (dump_enabled_p ())
2950 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2951 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2954 swap_ssa_operands (next_stmt,
2955 gimple_assign_rhs1_ptr (next_stmt),
2956 gimple_assign_rhs2_ptr (next_stmt));
2957 update_stmt (next_stmt);
2959 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2960 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2962 else
2963 return false;
2966 lhs = gimple_assign_lhs (next_stmt);
2967 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2970 /* Save the chain for further analysis in SLP detection. */
2971 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2972 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2973 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2975 return true;
2978 /* Return true if we need an in-order reduction for operation CODE
2979 on type TYPE. NEED_WRAPPING_INTEGRAL_OVERFLOW is true if integer
2980 overflow must wrap. */
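/* For example, a float PLUS_EXPR reduction must be carried out in the
   original order unless -fassociative-math is in effect, because
   reassociating the additions can change how the result is rounded.  */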
2982 static bool
2983 needs_fold_left_reduction_p (tree type, tree_code code,
2984 bool need_wrapping_integral_overflow)
2986 /* CHECKME: check for !flag_finite_math_only too? */
2987 if (SCALAR_FLOAT_TYPE_P (type))
2988 switch (code)
2990 case MIN_EXPR:
2991 case MAX_EXPR:
2992 return false;
2994 default:
2995 return !flag_associative_math;
2998 if (INTEGRAL_TYPE_P (type))
3000 if (!operation_no_trapping_overflow (type, code))
3001 return true;
3002 if (need_wrapping_integral_overflow
3003 && !TYPE_OVERFLOW_WRAPS (type)
3004 && operation_can_overflow (code))
3005 return true;
3006 return false;
3009 if (SAT_FIXED_POINT_TYPE_P (type))
3010 return true;
3012 return false;
3015 /* Return true if the reduction PHI in LOOP with latch arg LOOP_ARG and
3016 reduction operation CODE has a handled computation expression. */
3018 bool
3019 check_reduction_path (location_t loc, loop_p loop, gphi *phi, tree loop_arg,
3020 enum tree_code code)
3022 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3023 auto_bitmap visited;
3024 tree lookfor = PHI_RESULT (phi);
3025 ssa_op_iter curri;
3026 use_operand_p curr = op_iter_init_phiuse (&curri, phi, SSA_OP_USE);
3027 while (USE_FROM_PTR (curr) != loop_arg)
3028 curr = op_iter_next_use (&curri);
3029 curri.i = curri.numops;
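/* Mark the PHI's operand iterator as exhausted so that, if the walk
   backtracks all the way to this initial frame, it terminates instead of
   also exploring the PHI's other arguments; only the path starting at
   LOOP_ARG is of interest.  */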
3032 path.safe_push (std::make_pair (curri, curr));
3033 tree use = USE_FROM_PTR (curr);
3034 if (use == lookfor)
3035 break;
3036 gimple *def = SSA_NAME_DEF_STMT (use);
3037 if (gimple_nop_p (def)
3038 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3040 pop:
3043 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3044 curri = x.first;
3045 curr = x.second;
3047 curr = op_iter_next_use (&curri);
3048 /* Skip already visited or non-SSA operands (from iterating
3049 over PHI args). */
3050 while (curr != NULL_USE_OPERAND_P
3051 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3052 || ! bitmap_set_bit (visited,
3053 SSA_NAME_VERSION
3054 (USE_FROM_PTR (curr)))));
3056 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3057 if (curr == NULL_USE_OPERAND_P)
3058 break;
3060 else
3062 if (gimple_code (def) == GIMPLE_PHI)
3063 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3064 else
3065 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3066 while (curr != NULL_USE_OPERAND_P
3067 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3068 || ! bitmap_set_bit (visited,
3069 SSA_NAME_VERSION
3070 (USE_FROM_PTR (curr)))))
3071 curr = op_iter_next_use (&curri);
3072 if (curr == NULL_USE_OPERAND_P)
3073 goto pop;
3076 while (1);
3077 if (dump_file && (dump_flags & TDF_DETAILS))
3079 dump_printf_loc (MSG_NOTE, loc, "reduction path: ");
3080 unsigned i;
3081 std::pair<ssa_op_iter, use_operand_p> *x;
3082 FOR_EACH_VEC_ELT (path, i, x)
3084 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3085 dump_printf (MSG_NOTE, " ");
3087 dump_printf (MSG_NOTE, "\n");
3090 /* Check whether the reduction path detected is valid. */
3091 bool fail = path.length () == 0;
3092 bool neg = false;
3093 for (unsigned i = 1; i < path.length (); ++i)
3095 gimple *use_stmt = USE_STMT (path[i].second);
3096 tree op = USE_FROM_PTR (path[i].second);
3097 if (! has_single_use (op)
3098 || ! is_gimple_assign (use_stmt))
3100 fail = true;
3101 break;
3103 if (gimple_assign_rhs_code (use_stmt) != code)
3105 if (code == PLUS_EXPR
3106 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3108 /* Track whether we negate the reduction value each iteration. */
3109 if (gimple_assign_rhs2 (use_stmt) == op)
3110 neg = ! neg;
3112 else
3114 fail = true;
3115 break;
3119 return ! fail && ! neg;
3123 /* Function vect_is_simple_reduction
3125 (1) Detect a cross-iteration def-use cycle that represents a simple
3126 reduction computation. We look for the following pattern:
3128 loop_header:
3129 a1 = phi < a0, a2 >
3130 a3 = ...
3131 a2 = operation (a3, a1)
3135 a3 = ...
3136 loop_header:
3137 a1 = phi < a0, a2 >
3138 a2 = operation (a3, a1)
3140 such that:
3141 1. operation is commutative and associative and it is safe to
3142 change the order of the computation
3143 2. no uses for a2 in the loop (a2 is used out of the loop)
3144 3. no uses of a1 in the loop besides the reduction operation
3145 4. no uses of a1 outside the loop.
3147 Conditions 1,4 are tested here.
3148 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
3150 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
3151 nested cycles.
3153 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
3154 reductions:
3156 a1 = phi < a0, a2 >
3157 inner loop (def of a3)
3158 a2 = phi < a3 >
3160 (4) Detect condition expressions, ie:
3161 for (int i = 0; i < N; i++)
3162 if (a[i] < val)
3163 ret_val = a[i];
3167 static gimple *
3168 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
3169 bool *double_reduc,
3170 bool need_wrapping_integral_overflow,
3171 enum vect_reduction_type *v_reduc_type)
3173 struct loop *loop = (gimple_bb (phi))->loop_father;
3174 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
3175 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
3176 enum tree_code orig_code, code;
3177 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
3178 tree type;
3179 int nloop_uses;
3180 tree name;
3181 imm_use_iterator imm_iter;
3182 use_operand_p use_p;
3183 bool phi_def;
3185 *double_reduc = false;
3186 *v_reduc_type = TREE_CODE_REDUCTION;
3188 tree phi_name = PHI_RESULT (phi);
3189 /* ??? If there are no uses of the PHI result the inner loop reduction
3190 won't be detected as possibly double-reduction by vectorizable_reduction
3191 because that tries to walk the PHI arg from the preheader edge which
3192 can be constant. See PR60382. */
3193 if (has_zero_uses (phi_name))
3194 return NULL;
3195 nloop_uses = 0;
3196 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
3198 gimple *use_stmt = USE_STMT (use_p);
3199 if (is_gimple_debug (use_stmt))
3200 continue;
3202 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3204 if (dump_enabled_p ())
3205 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3206 "intermediate value used outside loop.\n");
3208 return NULL;
3211 nloop_uses++;
3212 if (nloop_uses > 1)
3214 if (dump_enabled_p ())
3215 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3216 "reduction value used in loop.\n");
3217 return NULL;
3220 phi_use_stmt = use_stmt;
3223 edge latch_e = loop_latch_edge (loop);
3224 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
3225 if (TREE_CODE (loop_arg) != SSA_NAME)
3227 if (dump_enabled_p ())
3229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3230 "reduction: not ssa_name: ");
3231 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
3232 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3234 return NULL;
3237 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
3238 if (is_gimple_assign (def_stmt))
3240 name = gimple_assign_lhs (def_stmt);
3241 phi_def = false;
3243 else if (gimple_code (def_stmt) == GIMPLE_PHI)
3245 name = PHI_RESULT (def_stmt);
3246 phi_def = true;
3248 else
3250 if (dump_enabled_p ())
3252 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3253 "reduction: unhandled reduction operation: ");
3254 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
3256 return NULL;
3259 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
3260 return NULL;
3262 nloop_uses = 0;
3263 auto_vec<gphi *, 3> lcphis;
3264 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
3266 gimple *use_stmt = USE_STMT (use_p);
3267 if (is_gimple_debug (use_stmt))
3268 continue;
3269 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
3270 nloop_uses++;
3271 else
3272 /* We can have more than one loop-closed PHI. */
3273 lcphis.safe_push (as_a <gphi *> (use_stmt));
3274 if (nloop_uses > 1)
3276 if (dump_enabled_p ())
3277 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3278 "reduction used in loop.\n");
3279 return NULL;
3283 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
3284 defined in the inner loop. */
3285 if (phi_def)
3287 op1 = PHI_ARG_DEF (def_stmt, 0);
3289 if (gimple_phi_num_args (def_stmt) != 1
3290 || TREE_CODE (op1) != SSA_NAME)
3292 if (dump_enabled_p ())
3293 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3294 "unsupported phi node definition.\n");
3296 return NULL;
3299 def1 = SSA_NAME_DEF_STMT (op1);
3300 if (gimple_bb (def1)
3301 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3302 && loop->inner
3303 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
3304 && is_gimple_assign (def1)
3305 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
3307 if (dump_enabled_p ())
3308 report_vect_op (MSG_NOTE, def_stmt,
3309 "detected double reduction: ");
3311 *double_reduc = true;
3312 return def_stmt;
3315 return NULL;
3318 /* If we are vectorizing an inner reduction, we execute it in the
3319 original order only if we are not dealing with a double
3320 reduction. */
3321 bool check_reduction = true;
3322 if (flow_loop_nested_p (vect_loop, loop))
3324 gphi *lcphi;
3325 unsigned i;
3326 check_reduction = false;
3327 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
3328 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
3330 gimple *use_stmt = USE_STMT (use_p);
3331 if (is_gimple_debug (use_stmt))
3332 continue;
3333 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
3334 check_reduction = true;
3338 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
3339 code = orig_code = gimple_assign_rhs_code (def_stmt);
3341 /* We can handle "res -= x[i]", which is non-associative, by
3342 simply rewriting it into "res += -x[i]". Avoid changing the
3343 gimple instruction for the first simple tests and only do this
3344 if we're allowed to change code at all. */
3345 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
3346 code = PLUS_EXPR;
3348 if (code == COND_EXPR)
3350 if (! nested_in_vect_loop)
3351 *v_reduc_type = COND_REDUCTION;
3353 op3 = gimple_assign_rhs1 (def_stmt);
3354 if (COMPARISON_CLASS_P (op3))
3356 op4 = TREE_OPERAND (op3, 1);
3357 op3 = TREE_OPERAND (op3, 0);
3359 if (op3 == phi_name || op4 == phi_name)
3361 if (dump_enabled_p ())
3362 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3363 "reduction: condition depends on previous"
3364 " iteration: ");
3365 return NULL;
3368 op1 = gimple_assign_rhs2 (def_stmt);
3369 op2 = gimple_assign_rhs3 (def_stmt);
3371 else if (!commutative_tree_code (code) || !associative_tree_code (code))
3373 if (dump_enabled_p ())
3374 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3375 "reduction: not commutative/associative: ");
3376 return NULL;
3378 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
3380 op1 = gimple_assign_rhs1 (def_stmt);
3381 op2 = gimple_assign_rhs2 (def_stmt);
3383 else
3385 if (dump_enabled_p ())
3386 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3387 "reduction: not handled operation: ");
3388 return NULL;
3391 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
3393 if (dump_enabled_p ())
3394 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3395 "reduction: both uses not ssa_names: ");
3397 return NULL;
3400 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
3401 if ((TREE_CODE (op1) == SSA_NAME
3402 && !types_compatible_p (type,TREE_TYPE (op1)))
3403 || (TREE_CODE (op2) == SSA_NAME
3404 && !types_compatible_p (type, TREE_TYPE (op2)))
3405 || (op3 && TREE_CODE (op3) == SSA_NAME
3406 && !types_compatible_p (type, TREE_TYPE (op3)))
3407 || (op4 && TREE_CODE (op4) == SSA_NAME
3408 && !types_compatible_p (type, TREE_TYPE (op4))))
3410 if (dump_enabled_p ())
3412 dump_printf_loc (MSG_NOTE, vect_location,
3413 "reduction: multiple types: operation type: ");
3414 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
3415 dump_printf (MSG_NOTE, ", operands types: ");
3416 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3417 TREE_TYPE (op1));
3418 dump_printf (MSG_NOTE, ",");
3419 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3420 TREE_TYPE (op2));
3421 if (op3)
3423 dump_printf (MSG_NOTE, ",");
3424 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3425 TREE_TYPE (op3));
3428 if (op4)
3430 dump_printf (MSG_NOTE, ",");
3431 dump_generic_expr (MSG_NOTE, TDF_SLIM,
3432 TREE_TYPE (op4));
3434 dump_printf (MSG_NOTE, "\n");
3437 return NULL;
3440 /* Check whether it's ok to change the order of the computation.
3441 Generally, when vectorizing a reduction we change the order of the
3442 computation. This may change the behavior of the program in some
3443 cases, so we need to check that this is ok. One exception is when
3444 vectorizing an outer-loop: the inner-loop is executed sequentially,
3445 and therefore vectorizing reductions in the inner-loop during
3446 outer-loop vectorization is safe. */
3447 if (check_reduction
3448 && *v_reduc_type == TREE_CODE_REDUCTION
3449 && needs_fold_left_reduction_p (type, code,
3450 need_wrapping_integral_overflow))
3451 *v_reduc_type = FOLD_LEFT_REDUCTION;
3453 /* Reduction is safe. We're dealing with one of the following:
3454 1) integer arithmetic and no trapv
3455 2) floating point arithmetic, and special flags permit this optimization
3456 3) nested cycle (i.e., outer loop vectorization). */
3457 if (TREE_CODE (op1) == SSA_NAME)
3458 def1 = SSA_NAME_DEF_STMT (op1);
3460 if (TREE_CODE (op2) == SSA_NAME)
3461 def2 = SSA_NAME_DEF_STMT (op2);
3463 if (code != COND_EXPR
3464 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3466 if (dump_enabled_p ())
3467 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3468 return NULL;
3471 /* Check that one def is the reduction def, defined by PHI,
3472 the other def is either defined in the loop ("vect_internal_def"),
3473 or it's an induction (defined by a loop-header phi-node). */
3475 if (def2 && def2 == phi
3476 && (code == COND_EXPR
3477 || !def1 || gimple_nop_p (def1)
3478 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3479 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3480 && (is_gimple_assign (def1)
3481 || is_gimple_call (def1)
3482 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3483 == vect_induction_def
3484 || (gimple_code (def1) == GIMPLE_PHI
3485 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3486 == vect_internal_def
3487 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3489 if (dump_enabled_p ())
3490 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3491 return def_stmt;
3494 if (def1 && def1 == phi
3495 && (code == COND_EXPR
3496 || !def2 || gimple_nop_p (def2)
3497 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3498 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3499 && (is_gimple_assign (def2)
3500 || is_gimple_call (def2)
3501 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3502 == vect_induction_def
3503 || (gimple_code (def2) == GIMPLE_PHI
3504 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3505 == vect_internal_def
3506 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3508 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3510 /* Check if we can swap operands (just for simplicity - so that
3511 the rest of the code can assume that the reduction variable
3512 is always the last (second) argument). */
3513 if (code == COND_EXPR)
3515 /* Swap cond_expr by inverting the condition. */
3516 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3517 enum tree_code invert_code = ERROR_MARK;
3518 enum tree_code cond_code = TREE_CODE (cond_expr);
3520 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3522 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3523 invert_code = invert_tree_comparison (cond_code, honor_nans);
3525 if (invert_code != ERROR_MARK)
3527 TREE_SET_CODE (cond_expr, invert_code);
3528 swap_ssa_operands (def_stmt,
3529 gimple_assign_rhs2_ptr (def_stmt),
3530 gimple_assign_rhs3_ptr (def_stmt));
3532 else
3534 if (dump_enabled_p ())
3535 report_vect_op (MSG_NOTE, def_stmt,
3536 "detected reduction: cannot swap operands "
3537 "for cond_expr");
3538 return NULL;
3541 else
3542 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3543 gimple_assign_rhs2_ptr (def_stmt));
3545 if (dump_enabled_p ())
3546 report_vect_op (MSG_NOTE, def_stmt,
3547 "detected reduction: need to swap operands: ");
3549 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3550 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3552 else
3554 if (dump_enabled_p ())
3555 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3558 return def_stmt;
3561 /* Try to find SLP reduction chain. */
3562 if (! nested_in_vect_loop
3563 && code != COND_EXPR
3564 && orig_code != MINUS_EXPR
3565 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3567 if (dump_enabled_p ())
3568 report_vect_op (MSG_NOTE, def_stmt,
3569 "reduction: detected reduction chain: ");
3571 return def_stmt;
3574 /* Dissolve any group possibly left half-built by vect_is_slp_reduction. */
3575 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3576 while (first)
3578 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3579 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3580 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3581 first = next;
3584 /* Look for the expression computing loop_arg from loop PHI result. */
3585 if (check_reduction_path (vect_location, loop, as_a <gphi *> (phi), loop_arg,
3586 code))
3587 return def_stmt;
3589 if (dump_enabled_p ())
3591 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3592 "reduction: unknown pattern: ");
3595 return NULL;
3598 /* Wrapper around vect_is_simple_reduction, which will modify code
3599 in-place if it enables detection of more reductions. Arguments
3600 as there. */
3602 gimple *
3603 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3604 bool *double_reduc,
3605 bool need_wrapping_integral_overflow)
3607 enum vect_reduction_type v_reduc_type;
3608 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3609 need_wrapping_integral_overflow,
3610 &v_reduc_type);
3611 if (def)
3613 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3614 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3615 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3616 reduc_def_info = vinfo_for_stmt (def);
3617 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3618 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3620 return def;
3623 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3625 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3626 int *peel_iters_epilogue,
3627 stmt_vector_for_cost *scalar_cost_vec,
3628 stmt_vector_for_cost *prologue_cost_vec,
3629 stmt_vector_for_cost *epilogue_cost_vec)
3631 int retval = 0;
3632 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3634 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3636 *peel_iters_epilogue = assumed_vf / 2;
3637 if (dump_enabled_p ())
3638 dump_printf_loc (MSG_NOTE, vect_location,
3639 "cost model: epilogue peel iters set to vf/2 "
3640 "because loop iterations are unknown .\n");
3642 /* If peeled iterations are known but the number of scalar loop
3643 iterations is unknown, count a taken branch per peeled loop. */
3644 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3645 NULL, 0, vect_prologue);
3646 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3647 NULL, 0, vect_epilogue);
3649 else
3651 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3652 peel_iters_prologue = niters < peel_iters_prologue ?
3653 niters : peel_iters_prologue;
3654 *peel_iters_epilogue = (niters - peel_iters_prologue) % assumed_vf;
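/* For example (illustrative numbers only), with niters == 102,
   peel_iters_prologue == 3 and an assumed VF of 8, the epilogue peels
   (102 - 3) % 8 == 3 iterations.  */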
3655 /* If we need to peel for gaps but no epilogue peeling would otherwise
3656 be required, we have to peel VF iterations. */
3657 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3658 *peel_iters_epilogue = assumed_vf;
3661 stmt_info_for_cost *si;
3662 int j;
3663 if (peel_iters_prologue)
3664 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3666 stmt_vec_info stmt_info
3667 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3668 retval += record_stmt_cost (prologue_cost_vec,
3669 si->count * peel_iters_prologue,
3670 si->kind, stmt_info, si->misalign,
3671 vect_prologue);
3673 if (*peel_iters_epilogue)
3674 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3676 stmt_vec_info stmt_info
3677 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3678 retval += record_stmt_cost (epilogue_cost_vec,
3679 si->count * *peel_iters_epilogue,
3680 si->kind, stmt_info, si->misalign,
3681 vect_epilogue);
3684 return retval;
3687 /* Function vect_estimate_min_profitable_iters
3689 Return the number of iterations required for the vector version of the
3690 loop to be profitable relative to the cost of the scalar version of the
3691 loop.
3693 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3694 of iterations for vectorization. A value of -1 means loop
3695 vectorization is not profitable. This returned value may be used
3696 for a dynamic profitability check.
3698 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3699 for static check against estimated number of iterations. */
3701 static void
3702 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3703 int *ret_min_profitable_niters,
3704 int *ret_min_profitable_estimate)
3706 int min_profitable_iters;
3707 int min_profitable_estimate;
3708 int peel_iters_prologue;
3709 int peel_iters_epilogue;
3710 unsigned vec_inside_cost = 0;
3711 int vec_outside_cost = 0;
3712 unsigned vec_prologue_cost = 0;
3713 unsigned vec_epilogue_cost = 0;
3714 int scalar_single_iter_cost = 0;
3715 int scalar_outside_cost = 0;
3716 int assumed_vf = vect_vf_for_cost (loop_vinfo);
3717 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3718 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3720 /* Cost model disabled. */
3721 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3723 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3724 *ret_min_profitable_niters = 0;
3725 *ret_min_profitable_estimate = 0;
3726 return;
3729 /* Requires loop versioning tests to handle misalignment. */
3730 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3732 /* FIXME: Make cost depend on complexity of individual check. */
3733 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3734 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3735 vect_prologue);
3736 dump_printf (MSG_NOTE,
3737 "cost model: Adding cost of checks for loop "
3738 "versioning to treat misalignment.\n");
3741 /* Requires loop versioning with alias checks. */
3742 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3744 /* FIXME: Make cost depend on complexity of individual check. */
3745 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3746 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3747 vect_prologue);
3748 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3749 if (len)
3750 /* Count LEN - 1 ANDs and LEN comparisons. */
3751 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3752 NULL, 0, vect_prologue);
3753 len = LOOP_VINFO_LOWER_BOUNDS (loop_vinfo).length ();
3754 if (len)
3756 /* Count LEN - 1 ANDs and LEN comparisons. */
3757 unsigned int nstmts = len * 2 - 1;
3758 /* +1 for each bias that needs adding. */
3759 for (unsigned int i = 0; i < len; ++i)
3760 if (!LOOP_VINFO_LOWER_BOUNDS (loop_vinfo)[i].unsigned_p)
3761 nstmts += 1;
3762 (void) add_stmt_cost (target_cost_data, nstmts, scalar_stmt,
3763 NULL, 0, vect_prologue);
3765 dump_printf (MSG_NOTE,
3766 "cost model: Adding cost of checks for loop "
3767 "versioning aliasing.\n");
3770 /* Requires loop versioning with niter checks. */
3771 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3773 /* FIXME: Make cost depend on complexity of individual check. */
3774 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3775 vect_prologue);
3776 dump_printf (MSG_NOTE,
3777 "cost model: Adding cost of checks for loop "
3778 "versioning niters.\n");
3781 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3782 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3783 vect_prologue);
3785 /* Count statements in scalar loop. Using this as scalar cost for a single
3786 iteration for now.
3788 TODO: Add outer loop support.
3790 TODO: Consider assigning different costs to different scalar
3791 statements. */
3793 scalar_single_iter_cost
3794 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3796 /* Add additional cost for the peeled instructions in prologue and epilogue
3797 loop. (For fully-masked loops there will be no peeling.)
3799 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3800 at compile-time - we assume it's vf/2 (the worst would be vf-1).
3802 TODO: Build an expression that represents peel_iters for prologue and
3803 epilogue to be used in a run-time test. */
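/* Illustrative example with made-up numbers: for an assumed VF of 8 and
   unknown misalignment (npeel < 0), both peel_iters_prologue and
   peel_iters_epilogue are costed below as 8 / 2 = 4 iterations, plus one
   taken and one not-taken branch for each of the two peel-count guards.  */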
3805 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
3807 peel_iters_prologue = 0;
3808 peel_iters_epilogue = 0;
3810 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
3812 /* We need to peel exactly one iteration. */
3813 peel_iters_epilogue += 1;
3814 stmt_info_for_cost *si;
3815 int j;
3816 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
3817 j, si)
3819 struct _stmt_vec_info *stmt_info
3820 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3821 (void) add_stmt_cost (target_cost_data, si->count,
3822 si->kind, stmt_info, si->misalign,
3823 vect_epilogue);
3827 else if (npeel < 0)
3829 peel_iters_prologue = assumed_vf / 2;
3830 dump_printf (MSG_NOTE, "cost model: "
3831 "prologue peel iters set to vf/2.\n");
3833 /* If peeling for alignment is unknown, the loop bound of the main loop
3834 becomes unknown. */
3835 peel_iters_epilogue = assumed_vf / 2;
3836 dump_printf (MSG_NOTE, "cost model: "
3837 "epilogue peel iters set to vf/2 because "
3838 "peeling for alignment is unknown.\n");
3840 /* If peeled iterations are unknown, count a taken branch and a not taken
3841 branch per peeled loop. Even if scalar loop iterations are known,
3842 vector iterations are not known since peeled prologue iterations are
3843 not known. Hence guards remain the same. */
3844 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3845 NULL, 0, vect_prologue);
3846 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3847 NULL, 0, vect_prologue);
3848 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3849 NULL, 0, vect_epilogue);
3850 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3851 NULL, 0, vect_epilogue);
3852 stmt_info_for_cost *si;
3853 int j;
3854 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3856 struct _stmt_vec_info *stmt_info
3857 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3858 (void) add_stmt_cost (target_cost_data,
3859 si->count * peel_iters_prologue,
3860 si->kind, stmt_info, si->misalign,
3861 vect_prologue);
3862 (void) add_stmt_cost (target_cost_data,
3863 si->count * peel_iters_epilogue,
3864 si->kind, stmt_info, si->misalign,
3865 vect_epilogue);
3868 else
3870 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3871 stmt_info_for_cost *si;
3872 int j;
3873 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3875 prologue_cost_vec.create (2);
3876 epilogue_cost_vec.create (2);
3877 peel_iters_prologue = npeel;
3879 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3880 &peel_iters_epilogue,
3881 &LOOP_VINFO_SCALAR_ITERATION_COST
3882 (loop_vinfo),
3883 &prologue_cost_vec,
3884 &epilogue_cost_vec);
3886 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3888 struct _stmt_vec_info *stmt_info
3889 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3890 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3891 si->misalign, vect_prologue);
3894 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3896 struct _stmt_vec_info *stmt_info
3897 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3898 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3899 si->misalign, vect_epilogue);
3902 prologue_cost_vec.release ();
3903 epilogue_cost_vec.release ();
3906 /* FORNOW: The scalar outside cost is incremented in one of the
3907 following ways:
3909 1. The vectorizer checks for alignment and aliasing and generates
3910 a condition that allows dynamic vectorization. A cost model
3911 check is ANDED with the versioning condition. Hence scalar code
3912 path now has the added cost of the versioning check.
3914 if (cost > th & versioning_check)
3915 jmp to vector code
3917 Hence run-time scalar is incremented by not-taken branch cost.
3919 2. The vectorizer then checks if a prologue is required. If the
3920 cost model check was not done before during versioning, it has to
3921 be done before the prologue check.
3923 if (cost <= th)
3924 prologue = scalar_iters
3925 if (prologue == 0)
3926 jmp to vector code
3927 else
3928 execute prologue
3929 if (prologue == num_iters)
3930 go to exit
3932 Hence the run-time scalar cost is incremented by a taken branch,
3933 plus a not-taken branch, plus a taken branch cost.
3935 3. The vectorizer then checks if an epilogue is required. If the
3936 cost model check was not done before during prologue check, it
3937 has to be done with the epilogue check.
3939 if (prologue == 0)
3940 jmp to vector code
3941 else
3942 execute prologue
3943 if (prologue == num_iters)
3944 go to exit
3945 vector code:
3946 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3947 jmp to epilogue
3949 Hence the run-time scalar cost should be incremented by 2 taken
3950 branches.
3952 TODO: The back end may reorder the BBs differently and reverse
3953 conditions/branch directions. Change the estimates below to
3954 something more reasonable. */
3956 /* If the number of iterations is known and we do not do versioning, we can
3957 decide whether to vectorize at compile time. Hence the scalar version
3958 does not carry cost model guard costs. */
3959 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3960 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3962 /* Cost model check occurs at versioning. */
3963 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3964 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3965 else
3967 /* Cost model check occurs at prologue generation. */
3968 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3969 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3970 + vect_get_stmt_cost (cond_branch_not_taken);
3971 /* Cost model check occurs at epilogue generation. */
3972 else
3973 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3977 /* Complete the target-specific cost calculations. */
3978 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3979 &vec_inside_cost, &vec_epilogue_cost);
3981 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3983 if (dump_enabled_p ())
3985 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3986 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3987 vec_inside_cost);
3988 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3989 vec_prologue_cost);
3990 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3991 vec_epilogue_cost);
3992 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3993 scalar_single_iter_cost);
3994 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3995 scalar_outside_cost);
3996 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3997 vec_outside_cost);
3998 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3999 peel_iters_prologue);
4000 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
4001 peel_iters_epilogue);
4004 /* Calculate number of iterations required to make the vector version
4005 profitable, relative to the loop bodies only. The following condition
4006 must hold true:
4007 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
4008 where
4009 SIC = scalar iteration cost, VIC = vector iteration cost,
4010 VOC = vector outside cost, VF = vectorization factor,
4011 PL_ITERS = prologue iterations, EP_ITERS = epilogue iterations,
4012 SOC = scalar outside cost for run time cost model check. */
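/* Worked example with made-up costs (illustrative only): take SIC = 4,
   VIC = 6, VOC = 20, SOC = 2, VF = 4 and no peeling.  The code below
   computes
     ((VOC - SOC) * VF) / (SIC * VF - VIC) = (18 * 4) / (16 - 6) = 7
   using integer division, then bumps the result to 8 because at 7
   iterations 4 * 4 * 7 = 112 <= 6 * 7 + 18 * 4 = 114 still holds; the
   loop therefore needs at least 8 scalar iterations before the vector
   version wins.  */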
4014 if ((scalar_single_iter_cost * assumed_vf) > (int) vec_inside_cost)
4016 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost)
4017 * assumed_vf
4018 - vec_inside_cost * peel_iters_prologue
4019 - vec_inside_cost * peel_iters_epilogue);
4020 if (min_profitable_iters <= 0)
4021 min_profitable_iters = 0;
4022 else
4024 min_profitable_iters /= ((scalar_single_iter_cost * assumed_vf)
4025 - vec_inside_cost);
4027 if ((scalar_single_iter_cost * assumed_vf * min_profitable_iters)
4028 <= (((int) vec_inside_cost * min_profitable_iters)
4029 + (((int) vec_outside_cost - scalar_outside_cost)
4030 * assumed_vf)))
4031 min_profitable_iters++;
4034 /* vector version will never be profitable. */
4035 else
4037 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
4038 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
4039 "did not happen for a simd loop");
4041 if (dump_enabled_p ())
4042 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4043 "cost model: the vector iteration cost = %d "
4044 "divided by the scalar iteration cost = %d "
4045 "is greater or equal to the vectorization factor = %d"
4046 ".\n",
4047 vec_inside_cost, scalar_single_iter_cost, assumed_vf);
4048 *ret_min_profitable_niters = -1;
4049 *ret_min_profitable_estimate = -1;
4050 return;
4053 dump_printf (MSG_NOTE,
4054 " Calculated minimum iters for profitability: %d\n",
4055 min_profitable_iters);
4057 if (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
4058 && min_profitable_iters < (assumed_vf + peel_iters_prologue))
4059 /* We want the vectorized loop to execute at least once. */
4060 min_profitable_iters = assumed_vf + peel_iters_prologue;
4062 if (dump_enabled_p ())
4063 dump_printf_loc (MSG_NOTE, vect_location,
4064 " Runtime profitability threshold = %d\n",
4065 min_profitable_iters);
4067 *ret_min_profitable_niters = min_profitable_iters;
4069 /* Calculate number of iterations required to make the vector version
4070 profitable, relative to the loop bodies only.
4072 Non-vectorized variant is SIC * niters and it must win over vector
4073 variant on the expected loop trip count. The following condition must hold true:
4074 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
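/* Continuing the made-up numbers from the example above: the static
   estimate adds SOC instead of subtracting it, giving
   ((20 + 2) * 4) / (4 * 4 - 6) = 88 / 10 = 8 with integer division,
   which the MAX below then raises to at least the runtime threshold.  */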
4076 if (vec_outside_cost <= 0)
4077 min_profitable_estimate = 0;
4078 else
4080 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost)
4081 * assumed_vf
4082 - vec_inside_cost * peel_iters_prologue
4083 - vec_inside_cost * peel_iters_epilogue)
4084 / ((scalar_single_iter_cost * assumed_vf)
4085 - vec_inside_cost);
4087 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
4088 if (dump_enabled_p ())
4089 dump_printf_loc (MSG_NOTE, vect_location,
4090 " Static estimate profitability threshold = %d\n",
4091 min_profitable_estimate);
4093 *ret_min_profitable_estimate = min_profitable_estimate;
4096 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
4097 vector elements (not bits) for a vector with NELT elements. */
4098 static void
4099 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
4100 vec_perm_builder *sel)
4102 /* The encoding is a single stepped pattern. Any wrap-around is handled
4103 by vec_perm_indices. */
4104 sel->new_vector (nelt, 1, 3);
4105 for (unsigned int i = 0; i < 3; i++)
4106 sel->quick_push (i + offset);
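/* Illustrative example: for OFFSET = 2 and NELT = 8 the three pushes above
   encode the series {2, 3, 4}, which vec_perm_indices extends with step 1
   to the full mask {2, 3, 4, 5, 6, 7, 8, 9}; indices 8 and 9 select from
   the second vector input, so the permutation acts as a whole-vector shift
   by two elements.  */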
4109 /* Checks whether the target supports whole-vector shifts for vectors of mode
4110 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
4111 it supports vec_perm_const with masks for all necessary shift amounts. */
4112 static bool
4113 have_whole_vector_shift (machine_mode mode)
4115 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4116 return true;
4118 /* Variable-length vectors should be handled via the optab. */
4119 unsigned int nelt;
4120 if (!GET_MODE_NUNITS (mode).is_constant (&nelt))
4121 return false;
4123 vec_perm_builder sel;
4124 vec_perm_indices indices;
4125 for (unsigned int i = nelt / 2; i >= 1; i /= 2)
4127 calc_vec_perm_mask_for_shift (i, nelt, &sel);
4128 indices.new_vector (sel, 2, nelt);
4129 if (!can_vec_perm_const_p (mode, indices, false))
4130 return false;
4132 return true;
4135 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
4136 functions. Design better to avoid maintenance issues. */
4138 /* Function vect_model_reduction_cost.
4140 Models cost for a reduction operation, including the vector ops
4141 generated within the strip-mine loop, the initial definition before
4142 the loop, and the epilogue code that must be generated. */
4144 static void
4145 vect_model_reduction_cost (stmt_vec_info stmt_info, internal_fn reduc_fn,
4146 int ncopies)
4148 int prologue_cost = 0, epilogue_cost = 0, inside_cost;
4149 enum tree_code code;
4150 optab optab;
4151 tree vectype;
4152 gimple *orig_stmt;
4153 machine_mode mode;
4154 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4155 struct loop *loop = NULL;
4156 void *target_cost_data;
4158 if (loop_vinfo)
4160 loop = LOOP_VINFO_LOOP (loop_vinfo);
4161 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4163 else
4164 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
4166 /* Condition reductions generate two reductions in the loop. */
4167 vect_reduction_type reduction_type
4168 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
4169 if (reduction_type == COND_REDUCTION)
4170 ncopies *= 2;
4172 vectype = STMT_VINFO_VECTYPE (stmt_info);
4173 mode = TYPE_MODE (vectype);
4174 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4176 if (!orig_stmt)
4177 orig_stmt = STMT_VINFO_STMT (stmt_info);
4179 code = gimple_assign_rhs_code (orig_stmt);
4181 if (reduction_type == EXTRACT_LAST_REDUCTION
4182 || reduction_type == FOLD_LEFT_REDUCTION)
4184 /* No extra instructions needed in the prologue. */
4185 prologue_cost = 0;
4187 if (reduction_type == EXTRACT_LAST_REDUCTION || reduc_fn != IFN_LAST)
4188 /* Count one reduction-like operation per vector. */
4189 inside_cost = add_stmt_cost (target_cost_data, ncopies, vec_to_scalar,
4190 stmt_info, 0, vect_body);
4191 else
4193 /* Use NELEMENTS extracts and NELEMENTS scalar ops. */
4194 unsigned int nelements = ncopies * vect_nunits_for_cost (vectype);
4195 inside_cost = add_stmt_cost (target_cost_data, nelements,
4196 vec_to_scalar, stmt_info, 0,
4197 vect_body);
4198 inside_cost += add_stmt_cost (target_cost_data, nelements,
4199 scalar_stmt, stmt_info, 0,
4200 vect_body);
4203 else
4205 /* Add in cost for initial definition.
4206 For cond reduction we have four vectors: initial index, step,
4207 initial result of the data reduction, initial value of the index
4208 reduction. */
4209 int prologue_stmts = reduction_type == COND_REDUCTION ? 4 : 1;
4210 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
4211 scalar_to_vec, stmt_info, 0,
4212 vect_prologue);
4214 /* Cost of reduction op inside loop. */
4215 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4216 stmt_info, 0, vect_body);
4219 /* Determine cost of epilogue code.
4221 We have a reduction operator that will reduce the vector in one statement.
4222 Also requires scalar extract. */
4224 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
4226 if (reduc_fn != IFN_LAST)
4228 if (reduction_type == COND_REDUCTION)
4230 /* An EQ stmt and a COND_EXPR stmt. */
4231 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4232 vector_stmt, stmt_info, 0,
4233 vect_epilogue);
4234 /* Reduction of the max index and a reduction of the found
4235 values. */
4236 epilogue_cost += add_stmt_cost (target_cost_data, 2,
4237 vec_to_scalar, stmt_info, 0,
4238 vect_epilogue);
4239 /* A broadcast of the max value. */
4240 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4241 scalar_to_vec, stmt_info, 0,
4242 vect_epilogue);
4244 else
4246 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
4247 stmt_info, 0, vect_epilogue);
4248 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4249 vec_to_scalar, stmt_info, 0,
4250 vect_epilogue);
4253 else if (reduction_type == COND_REDUCTION)
4255 unsigned estimated_nunits = vect_nunits_for_cost (vectype);
4256 /* Extraction of scalar elements. */
4257 epilogue_cost += add_stmt_cost (target_cost_data,
4258 2 * estimated_nunits,
4259 vec_to_scalar, stmt_info, 0,
4260 vect_epilogue);
4261 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
4262 epilogue_cost += add_stmt_cost (target_cost_data,
4263 2 * estimated_nunits - 3,
4264 scalar_stmt, stmt_info, 0,
4265 vect_epilogue);
4267 else if (reduction_type == EXTRACT_LAST_REDUCTION
4268 || reduction_type == FOLD_LEFT_REDUCTION)
4269 /* No extra instructions needed in the epilogue. */
4271 else
4273 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4274 tree bitsize =
4275 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
4276 int element_bitsize = tree_to_uhwi (bitsize);
4277 int nelements = vec_size_in_bits / element_bitsize;
4279 if (code == COND_EXPR)
4280 code = MAX_EXPR;
4282 optab = optab_for_tree_code (code, vectype, optab_default);
4284 /* We have a whole vector shift available. */
4285 if (optab != unknown_optab
4286 && VECTOR_MODE_P (mode)
4287 && optab_handler (optab, mode) != CODE_FOR_nothing
4288 && have_whole_vector_shift (mode))
4290 /* Final reduction via vector shifts and the reduction operator.
4291 Also requires scalar extract. */
4292 epilogue_cost += add_stmt_cost (target_cost_data,
4293 exact_log2 (nelements) * 2,
4294 vector_stmt, stmt_info, 0,
4295 vect_epilogue);
4296 epilogue_cost += add_stmt_cost (target_cost_data, 1,
4297 vec_to_scalar, stmt_info, 0,
4298 vect_epilogue);
4300 else
4301 /* Use extracts and reduction op for final reduction. For N
4302 elements, we have N extracts and N-1 reduction ops. */
4303 epilogue_cost += add_stmt_cost (target_cost_data,
4304 nelements + nelements - 1,
4305 vector_stmt, stmt_info, 0,
4306 vect_epilogue);
4310 if (dump_enabled_p ())
4311 dump_printf (MSG_NOTE,
4312 "vect_model_reduction_cost: inside_cost = %d, "
4313 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
4314 prologue_cost, epilogue_cost);
4318 /* Function vect_model_induction_cost.
4320 Models cost for induction operations. */
4322 static void
4323 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
4325 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4326 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
4327 unsigned inside_cost, prologue_cost;
4329 if (PURE_SLP_STMT (stmt_info))
4330 return;
4332 /* loop cost for vec_loop. */
4333 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
4334 stmt_info, 0, vect_body);
4336 /* prologue cost for vec_init and vec_step. */
4337 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
4338 stmt_info, 0, vect_prologue);
4340 if (dump_enabled_p ())
4341 dump_printf_loc (MSG_NOTE, vect_location,
4342 "vect_model_induction_cost: inside_cost = %d, "
4343 "prologue_cost = %d .\n", inside_cost, prologue_cost);
4348 /* Function get_initial_def_for_reduction
4350 Input:
4351 STMT - a stmt that performs a reduction operation in the loop.
4352 INIT_VAL - the initial value of the reduction variable
4354 Output:
4355 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4356 of the reduction (used for adjusting the epilog - see below).
4357 Return a vector variable, initialized according to the operation that STMT
4358 performs. This vector will be used as the initial value of the
4359 vector of partial results.
4361 Option1 (adjust in epilog): Initialize the vector as follows:
4362 add/bit or/xor: [0,0,...,0,0]
4363 mult/bit and: [1,1,...,1,1]
4364 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4365 and when necessary (e.g. add/mult case) let the caller know
4366 that it needs to adjust the result by init_val.
4368 Option2: Initialize the vector as follows:
4369 add/bit or/xor: [init_val,0,0,...,0]
4370 mult/bit and: [init_val,1,1,...,1]
4371 min/max/cond_expr: [init_val,init_val,...,init_val]
4372 and no adjustments are needed.
4374 For example, for the following code:
4376 s = init_val;
4377 for (i=0;i<n;i++)
4378 s = s + a[i];
4380 STMT is 's = s + a[i]', and the reduction variable is 's'.
4381 For a vector of 4 units, we want to return either [0,0,0,init_val],
4382 or [0,0,0,0] and let the caller know that it needs to adjust
4383 the result at the end by 'init_val'.
4385 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
4386 is not NULL, because this way the initialization vector is simpler (same
4387 element in all entries); otherwise we use Option2.
4389 A cost model should help decide between these two schemes. */
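/* Concrete example (illustrative): for a product reduction s *= a[i] with
   init_val = 5 and four lanes, Option1 returns {1, 1, 1, 1} and sets
   *ADJUSTMENT_DEF to 5 so that the caller multiplies the final result by 5,
   whereas Option2 would return {5, 1, 1, 1} with no adjustment needed.  */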
4391 tree
4392 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4393 tree *adjustment_def)
4395 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4396 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4397 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4398 tree scalar_type = TREE_TYPE (init_val);
4399 tree vectype = get_vectype_for_scalar_type (scalar_type);
4400 enum tree_code code = gimple_assign_rhs_code (stmt);
4401 tree def_for_init;
4402 tree init_def;
4403 bool nested_in_vect_loop = false;
4404 REAL_VALUE_TYPE real_init_val = dconst0;
4405 int int_init_val = 0;
4406 gimple *def_stmt = NULL;
4407 gimple_seq stmts = NULL;
4409 gcc_assert (vectype);
4411 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4412 || SCALAR_FLOAT_TYPE_P (scalar_type));
4414 if (nested_in_vect_loop_p (loop, stmt))
4415 nested_in_vect_loop = true;
4416 else
4417 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4419 /* In case of double reduction we only create a vector variable to be put
4420 in the reduction phi node. The actual statement creation is done in
4421 vect_create_epilog_for_reduction. */
4422 if (adjustment_def && nested_in_vect_loop
4423 && TREE_CODE (init_val) == SSA_NAME
4424 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4425 && gimple_code (def_stmt) == GIMPLE_PHI
4426 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4427 && vinfo_for_stmt (def_stmt)
4428 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4429 == vect_double_reduction_def)
4431 *adjustment_def = NULL;
4432 return vect_create_destination_var (init_val, vectype);
4435 vect_reduction_type reduction_type
4436 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo);
4438 /* In case of a nested reduction do not use an adjustment def, as
4439 that case is not handled correctly by the epilogue generation
4440 when ncopies is not one. */
4441 if (adjustment_def && nested_in_vect_loop)
4443 *adjustment_def = NULL;
4444 return vect_get_vec_def_for_operand (init_val, stmt);
4447 switch (code)
4449 case WIDEN_SUM_EXPR:
4450 case DOT_PROD_EXPR:
4451 case SAD_EXPR:
4452 case PLUS_EXPR:
4453 case MINUS_EXPR:
4454 case BIT_IOR_EXPR:
4455 case BIT_XOR_EXPR:
4456 case MULT_EXPR:
4457 case BIT_AND_EXPR:
4459 /* ADJUSTMENT_DEF is NULL when called from
4460 vect_create_epilog_for_reduction to vectorize double reduction. */
4461 if (adjustment_def)
4462 *adjustment_def = init_val;
4464 if (code == MULT_EXPR)
4466 real_init_val = dconst1;
4467 int_init_val = 1;
4470 if (code == BIT_AND_EXPR)
4471 int_init_val = -1;
4473 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4474 def_for_init = build_real (scalar_type, real_init_val);
4475 else
4476 def_for_init = build_int_cst (scalar_type, int_init_val);
4478 if (adjustment_def)
4479 /* Option1: the first element is '0' or '1' as well. */
4480 init_def = gimple_build_vector_from_val (&stmts, vectype,
4481 def_for_init);
4482 else if (!TYPE_VECTOR_SUBPARTS (vectype).is_constant ())
4484 /* Option2 (variable length): the first element is INIT_VAL. */
4485 init_def = build_vector_from_val (vectype, def_for_init);
4486 gcall *call = gimple_build_call_internal (IFN_VEC_SHL_INSERT,
4487 2, init_def, init_val);
4488 init_def = make_ssa_name (vectype);
4489 gimple_call_set_lhs (call, init_def);
4490 gimple_seq_add_stmt (&stmts, call);
4492 else
4494 /* Option2: the first element is INIT_VAL. */
4495 tree_vector_builder elts (vectype, 1, 2);
4496 elts.quick_push (init_val);
4497 elts.quick_push (def_for_init);
4498 init_def = gimple_build_vector (&stmts, &elts);
4501 break;
4503 case MIN_EXPR:
4504 case MAX_EXPR:
4505 case COND_EXPR:
4507 if (adjustment_def)
4509 *adjustment_def = NULL_TREE;
4510 if (reduction_type != COND_REDUCTION
4511 && reduction_type != EXTRACT_LAST_REDUCTION)
4513 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4514 break;
4517 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4518 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4520 break;
4522 default:
4523 gcc_unreachable ();
4526 if (stmts)
4527 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4528 return init_def;
4531 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4532 NUMBER_OF_VECTORS is the number of vector defs to create.
4533 If NEUTRAL_OP is nonnull, introducing extra elements of that
4534 value will not change the result. */
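/* For instance, NEUTRAL_OP would be 0 for a PLUS_EXPR reduction and 1 for
   a MULT_EXPR reduction (illustrative values; the caller obtains the actual
   value from neutral_op_for_slp_reduction).  */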
4536 static void
4537 get_initial_defs_for_reduction (slp_tree slp_node,
4538 vec<tree> *vec_oprnds,
4539 unsigned int number_of_vectors,
4540 bool reduc_chain, tree neutral_op)
4542 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4543 gimple *stmt = stmts[0];
4544 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4545 unsigned HOST_WIDE_INT nunits;
4546 unsigned j, number_of_places_left_in_vector;
4547 tree vector_type;
4548 tree vop;
4549 int group_size = stmts.length ();
4550 unsigned int vec_num, i;
4551 unsigned number_of_copies = 1;
4552 vec<tree> voprnds;
4553 voprnds.create (number_of_vectors);
4554 struct loop *loop;
4555 auto_vec<tree, 16> permute_results;
4557 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4559 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4561 loop = (gimple_bb (stmt))->loop_father;
4562 gcc_assert (loop);
4563 edge pe = loop_preheader_edge (loop);
4565 gcc_assert (!reduc_chain || neutral_op);
4567 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4568 created vectors. It is greater than 1 if unrolling is performed.
4570 For example, we have two scalar operands, s1 and s2 (e.g., group of
4571 strided accesses of size two), while NUNITS is four (i.e., four scalars
4572 of this type can be packed in a vector). The output vector will contain
4573 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4574 will be 2).
4576 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4577 containing the operands.
4579 For example, NUNITS is four as before, and the group size is 8
4580 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4581 {s5, s6, s7, s8}. */
4583 if (!TYPE_VECTOR_SUBPARTS (vector_type).is_constant (&nunits))
4584 nunits = group_size;
4586 number_of_copies = nunits * number_of_vectors / group_size;
4588 number_of_places_left_in_vector = nunits;
4589 bool constant_p = true;
4590 tree_vector_builder elts (vector_type, nunits, 1);
4591 elts.quick_grow (nunits);
4592 for (j = 0; j < number_of_copies; j++)
4594 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4596 tree op;
4597 /* Get the def before the loop. In reduction chain we have only
4598 one initial value. */
4599 if ((j != (number_of_copies - 1)
4600 || (reduc_chain && i != 0))
4601 && neutral_op)
4602 op = neutral_op;
4603 else
4604 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4606 /* Create 'vect_ = {op0,op1,...,opn}'. */
4607 number_of_places_left_in_vector--;
4608 elts[number_of_places_left_in_vector] = op;
4609 if (!CONSTANT_CLASS_P (op))
4610 constant_p = false;
4612 if (number_of_places_left_in_vector == 0)
4614 gimple_seq ctor_seq = NULL;
4615 tree init;
4616 if (constant_p && !neutral_op
4617 ? multiple_p (TYPE_VECTOR_SUBPARTS (vector_type), nunits)
4618 : known_eq (TYPE_VECTOR_SUBPARTS (vector_type), nunits))
4619 /* Build the vector directly from ELTS. */
4620 init = gimple_build_vector (&ctor_seq, &elts);
4621 else if (neutral_op)
4623 /* Build a vector of the neutral value and shift the
4624 other elements into place. */
4625 init = gimple_build_vector_from_val (&ctor_seq, vector_type,
4626 neutral_op);
4627 int k = nunits;
4628 while (k > 0 && elts[k - 1] == neutral_op)
4629 k -= 1;
4630 while (k > 0)
4632 k -= 1;
4633 gcall *call = gimple_build_call_internal
4634 (IFN_VEC_SHL_INSERT, 2, init, elts[k]);
4635 init = make_ssa_name (vector_type);
4636 gimple_call_set_lhs (call, init);
4637 gimple_seq_add_stmt (&ctor_seq, call);
4640 else
4642 /* First time round, duplicate ELTS to fill the
4643 required number of vectors, then cherry pick the
4644 appropriate result for each iteration. */
4645 if (vec_oprnds->is_empty ())
4646 duplicate_and_interleave (&ctor_seq, vector_type, elts,
4647 number_of_vectors,
4648 permute_results);
4649 init = permute_results[number_of_vectors - j - 1];
4651 if (ctor_seq != NULL)
4652 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4653 voprnds.quick_push (init);
4655 number_of_places_left_in_vector = nunits;
4656 elts.new_vector (vector_type, nunits, 1);
4657 elts.quick_grow (nunits);
4658 constant_p = true;
4663 /* Since the vectors are created in the reverse order, we should invert
4664 them. */
4665 vec_num = voprnds.length ();
4666 for (j = vec_num; j != 0; j--)
4668 vop = voprnds[j - 1];
4669 vec_oprnds->quick_push (vop);
4672 voprnds.release ();
4674 /* In case that VF is greater than the unrolling factor needed for the SLP
4675 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4676 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4677 to replicate the vectors. */
4678 tree neutral_vec = NULL;
4679 while (number_of_vectors > vec_oprnds->length ())
4681 if (neutral_op)
4683 if (!neutral_vec)
4685 gimple_seq ctor_seq = NULL;
4686 neutral_vec = gimple_build_vector_from_val
4687 (&ctor_seq, vector_type, neutral_op);
4688 if (ctor_seq != NULL)
4689 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4691 vec_oprnds->quick_push (neutral_vec);
4693 else
4695 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4696 vec_oprnds->quick_push (vop);
4702 /* Function vect_create_epilog_for_reduction
4704 Create code at the loop-epilog to finalize the result of a reduction
4705 computation.
4707 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4708 reduction statements.
4709 STMT is the scalar reduction stmt that is being vectorized.
4710 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4711 number of elements that we can fit in a vectype (nunits). In this case
4712 we have to generate more than one vector stmt - i.e - we need to "unroll"
4713 the vector stmt by a factor VF/nunits. For more details see documentation
4714 in vectorizable_operation.
4715 REDUC_FN is the internal function for the epilog reduction.
4716 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4717 computation.
4718 REDUC_INDEX is the index of the operand in the right hand side of the
4719 statement that is defined by REDUCTION_PHI.
4720 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4721 SLP_NODE is an SLP node containing a group of reduction statements. The
4722 first one in this group is STMT.
4723 INDUC_VAL is for INTEGER_INDUC_COND_REDUCTION the value to use for the case
4724 when the COND_EXPR is never true in the loop. For MAX_EXPR, it needs to
4725 be smaller than any value of the IV in the loop, for MIN_EXPR larger than
4726 any value of the IV in the loop.
4727 INDUC_CODE is the code for epilog reduction if INTEGER_INDUC_COND_REDUCTION.
4728 NEUTRAL_OP is the value given by neutral_op_for_slp_reduction; it is
4729 null if this is not an SLP reduction
4731 This function:
4732 1. Creates the reduction def-use cycles: sets the arguments for
4733 REDUCTION_PHIS:
4734 The loop-entry argument is the vectorized initial-value of the reduction.
4735 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4736 sums.
4737 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4738 by calling the function specified by REDUC_FN if available, or by
4739 other means (whole-vector shifts or a scalar loop).
4740 The function also creates a new phi node at the loop exit to preserve
4741 loop-closed form, as illustrated below.
4743 The flow at the entry to this function:
4745 loop:
4746 vec_def = phi <null, null> # REDUCTION_PHI
4747 VECT_DEF = vector_stmt # vectorized form of STMT
4748 s_loop = scalar_stmt # (scalar) STMT
4749 loop_exit:
4750 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4751 use <s_out0>
4752 use <s_out0>
4754 The above is transformed by this function into:
4756 loop:
4757 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4758 VECT_DEF = vector_stmt # vectorized form of STMT
4759 s_loop = scalar_stmt # (scalar) STMT
4760 loop_exit:
4761 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4762 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4763 v_out2 = reduce <v_out1>
4764 s_out3 = extract_field <v_out2, 0>
4765 s_out4 = adjust_result <s_out3>
4766 use <s_out4>
4767 use <s_out4>
4770 static void
4771 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4772 gimple *reduc_def_stmt,
4773 int ncopies, internal_fn reduc_fn,
4774 vec<gimple *> reduction_phis,
4775 bool double_reduc,
4776 slp_tree slp_node,
4777 slp_instance slp_node_instance,
4778 tree induc_val, enum tree_code induc_code,
4779 tree neutral_op)
4781 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4782 stmt_vec_info prev_phi_info;
4783 tree vectype;
4784 machine_mode mode;
4785 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4786 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4787 basic_block exit_bb;
4788 tree scalar_dest;
4789 tree scalar_type;
4790 gimple *new_phi = NULL, *phi;
4791 gimple_stmt_iterator exit_gsi;
4792 tree vec_dest;
4793 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4794 gimple *epilog_stmt = NULL;
4795 enum tree_code code = gimple_assign_rhs_code (stmt);
4796 gimple *exit_phi;
4797 tree bitsize;
4798 tree adjustment_def = NULL;
4799 tree vec_initial_def = NULL;
4800 tree expr, def, initial_def = NULL;
4801 tree orig_name, scalar_result;
4802 imm_use_iterator imm_iter, phi_imm_iter;
4803 use_operand_p use_p, phi_use_p;
4804 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4805 bool nested_in_vect_loop = false;
4806 auto_vec<gimple *> new_phis;
4807 auto_vec<gimple *> inner_phis;
4808 enum vect_def_type dt = vect_unknown_def_type;
4809 int j, i;
4810 auto_vec<tree> scalar_results;
4811 unsigned int group_size = 1, k, ratio;
4812 auto_vec<tree> vec_initial_defs;
4813 auto_vec<gimple *> phis;
4814 bool slp_reduc = false;
4815 bool direct_slp_reduc;
4816 tree new_phi_result;
4817 gimple *inner_phi = NULL;
4818 tree induction_index = NULL_TREE;
4820 if (slp_node)
4821 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4823 if (nested_in_vect_loop_p (loop, stmt))
4825 outer_loop = loop;
4826 loop = loop->inner;
4827 nested_in_vect_loop = true;
4828 gcc_assert (!slp_node);
4831 vectype = STMT_VINFO_VECTYPE (stmt_info);
4832 gcc_assert (vectype);
4833 mode = TYPE_MODE (vectype);
4835 /* 1. Create the reduction def-use cycle:
4836 Set the arguments of REDUCTION_PHIS, i.e., transform
4838 loop:
4839 vec_def = phi <null, null> # REDUCTION_PHI
4840 VECT_DEF = vector_stmt # vectorized form of STMT
4843 into:
4845 loop:
4846 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4847 VECT_DEF = vector_stmt # vectorized form of STMT
4850 (in case of SLP, do it for all the phis). */
4852 /* Get the loop-entry arguments. */
4853 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4854 if (slp_node)
4856 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4857 vec_initial_defs.reserve (vec_num);
4858 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4859 &vec_initial_defs, vec_num,
4860 GROUP_FIRST_ELEMENT (stmt_info),
4861 neutral_op);
4863 else
4865 /* Get at the scalar def before the loop, that defines the initial value
4866 of the reduction variable. */
4867 gimple *def_stmt;
4868 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4869 loop_preheader_edge (loop));
4870 /* Optimize: if initial_def is for REDUC_MAX smaller than the base
4871 and we can't use zero for induc_val, use initial_def. Similarly
4872 for REDUC_MIN and initial_def larger than the base. */
4873 if (TREE_CODE (initial_def) == INTEGER_CST
4874 && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4875 == INTEGER_INDUC_COND_REDUCTION)
4876 && !integer_zerop (induc_val)
4877 && ((induc_code == MAX_EXPR
4878 && tree_int_cst_lt (initial_def, induc_val))
4879 || (induc_code == MIN_EXPR
4880 && tree_int_cst_lt (induc_val, initial_def))))
4881 induc_val = initial_def;
4882 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4883 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4884 &adjustment_def);
4885 vec_initial_defs.create (1);
4886 vec_initial_defs.quick_push (vec_initial_def);
4889 /* Set phi nodes arguments. */
4890 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4892 tree vec_init_def = vec_initial_defs[i];
4893 tree def = vect_defs[i];
4894 for (j = 0; j < ncopies; j++)
4896 if (j != 0)
4898 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4899 if (nested_in_vect_loop)
4900 vec_init_def
4901 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4902 vec_init_def);
4905 /* Set the loop-entry arg of the reduction-phi. */
4907 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4908 == INTEGER_INDUC_COND_REDUCTION)
4910 /* Initialise the reduction phi to zero. This prevents non-zero
4911 initial values from interfering with the reduction op. */
4912 gcc_assert (ncopies == 1);
4913 gcc_assert (i == 0);
4915 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4916 tree induc_val_vec
4917 = build_vector_from_val (vec_init_def_type, induc_val);
4919 add_phi_arg (as_a <gphi *> (phi), induc_val_vec,
4920 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4922 else
4923 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4924 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4926 /* Set the loop-latch arg for the reduction-phi. */
4927 if (j > 0)
4928 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4930 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4931 UNKNOWN_LOCATION);
4933 if (dump_enabled_p ())
4935 dump_printf_loc (MSG_NOTE, vect_location,
4936 "transform reduction: created def-use cycle: ");
4937 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4938 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4943 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4944 which is updated with the current index of the loop for every match of
4945 the original loop's cond_expr (VEC_STMT). This results in a vector
4946 containing the last time the condition passed for that vector lane.
4947 The first match will be a 1 to allow 0 to be used for non-matching
4948 indexes. If there are no matches at all then the vector will be all
4949 zeroes. */
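/* Illustrative example (made-up match pattern): with four lanes and two
   vector iterations, if lane 1 matches in the first iteration (index 2) and
   lane 3 matches in the second (index 8), the final index vector is
   {0, 2, 0, 8}; the epilogue then takes the maximum index, 8, to locate the
   last match.  */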
4950 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4952 tree indx_before_incr, indx_after_incr;
4953 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4955 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4956 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4958 int scalar_precision
4959 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4960 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4961 tree cr_index_vector_type = build_vector_type
4962 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4964 /* First we create a simple vector induction variable which starts
4965 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4966 vector size (STEP). */
4968 /* Create a {1,2,3,...} vector. */
4969 tree series_vect = build_index_vector (cr_index_vector_type, 1, 1);
4971 /* Create a vector of the step value. */
4972 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4973 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4975 /* Create an induction variable. */
4976 gimple_stmt_iterator incr_gsi;
4977 bool insert_after;
4978 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4979 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4980 insert_after, &indx_before_incr, &indx_after_incr);
4982 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4983 filled with zeros (VEC_ZERO). */
4985 /* Create a vector of 0s. */
4986 tree zero = build_zero_cst (cr_index_scalar_type);
4987 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4989 /* Create a vector phi node. */
4990 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4991 new_phi = create_phi_node (new_phi_tree, loop->header);
4992 set_vinfo_for_stmt (new_phi,
4993 new_stmt_vec_info (new_phi, loop_vinfo));
4994 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4995 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4997 /* Now take the condition from the loop's original cond_expr
4998 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4999 every match uses values from the induction variable
5000 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
5001 (NEW_PHI_TREE).
5002 Finally, we update the phi (NEW_PHI_TREE) to take the value of
5003 the new cond_expr (INDEX_COND_EXPR). */
5005 /* Duplicate the condition from vec_stmt. */
5006 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
5008 /* Create a conditional, where the condition is taken from vec_stmt
5009 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
5010 else is the phi (NEW_PHI_TREE). */
5011 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
5012 ccompare, indx_before_incr,
5013 new_phi_tree);
5014 induction_index = make_ssa_name (cr_index_vector_type);
5015 gimple *index_condition = gimple_build_assign (induction_index,
5016 index_cond_expr);
5017 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
5018 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
5019 loop_vinfo);
5020 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
5021 set_vinfo_for_stmt (index_condition, index_vec_info);
5023 /* Update the phi with the vec cond. */
5024 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
5025 loop_latch_edge (loop), UNKNOWN_LOCATION);
5028 /* 2. Create epilog code.
5029 The reduction epilog code operates across the elements of the vector
5030 of partial results computed by the vectorized loop.
5031 The reduction epilog code consists of:
5033 step 1: compute the scalar result in a vector (v_out2)
5034 step 2: extract the scalar result (s_out3) from the vector (v_out2)
5035 step 3: adjust the scalar result (s_out3) if needed.
5037 Step 1 can be accomplished using one the following three schemes:
5038 (scheme 1) using reduc_fn, if available.
5039 (scheme 2) using whole-vector shifts, if available.
5040 (scheme 3) using a scalar loop. In this case steps 1+2 above are
5041 combined.
5043 The overall epilog code looks like this:
5045 s_out0 = phi <s_loop> # original EXIT_PHI
5046 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5047 v_out2 = reduce <v_out1> # step 1
5048 s_out3 = extract_field <v_out2, 0> # step 2
5049 s_out4 = adjust_result <s_out3> # step 3
5051 (step 3 is optional, and steps 1 and 2 may be combined).
5052 Lastly, the uses of s_out0 are replaced by s_out4. */
5055 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
5056 v_out1 = phi <VECT_DEF>
5057 Store them in NEW_PHIS. */
5059 exit_bb = single_exit (loop)->dest;
5060 prev_phi_info = NULL;
5061 new_phis.create (vect_defs.length ());
5062 FOR_EACH_VEC_ELT (vect_defs, i, def)
5064 for (j = 0; j < ncopies; j++)
5066 tree new_def = copy_ssa_name (def);
5067 phi = create_phi_node (new_def, exit_bb);
5068 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
5069 if (j == 0)
5070 new_phis.quick_push (phi);
5071 else
5073 def = vect_get_vec_def_for_stmt_copy (dt, def);
5074 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
5077 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
5078 prev_phi_info = vinfo_for_stmt (phi);
5082 /* The epilogue is created for the outer-loop, i.e., for the loop being
5083 vectorized. Create exit phis for the outer loop. */
5084 if (double_reduc)
5086 loop = outer_loop;
5087 exit_bb = single_exit (loop)->dest;
5088 inner_phis.create (vect_defs.length ());
5089 FOR_EACH_VEC_ELT (new_phis, i, phi)
5091 tree new_result = copy_ssa_name (PHI_RESULT (phi));
5092 gphi *outer_phi = create_phi_node (new_result, exit_bb);
5093 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5094 PHI_RESULT (phi));
5095 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5096 loop_vinfo));
5097 inner_phis.quick_push (phi);
5098 new_phis[i] = outer_phi;
5099 prev_phi_info = vinfo_for_stmt (outer_phi);
5100 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
5102 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
5103 new_result = copy_ssa_name (PHI_RESULT (phi));
5104 outer_phi = create_phi_node (new_result, exit_bb);
5105 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
5106 PHI_RESULT (phi));
5107 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
5108 loop_vinfo));
5109 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
5110 prev_phi_info = vinfo_for_stmt (outer_phi);
5115 exit_gsi = gsi_after_labels (exit_bb);
5117 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
5118 (i.e. when reduc_fn is not available) and in the final adjustment
5119 code (if needed). Also get the original scalar reduction variable as
5120 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
5121 represents a reduction pattern), the tree-code and scalar-def are
5122 taken from the original stmt that the pattern-stmt (STMT) replaces.
5123 Otherwise (it is a regular reduction) - the tree-code and scalar-def
5124 are taken from STMT. */
5126 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
5127 if (!orig_stmt)
5129 /* Regular reduction */
5130 orig_stmt = stmt;
5132 else
5134 /* Reduction pattern */
5135 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
5136 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
5137 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
5140 code = gimple_assign_rhs_code (orig_stmt);
5141 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
5142 partial results are added and not subtracted. */
5143 if (code == MINUS_EXPR)
5144 code = PLUS_EXPR;
5146 scalar_dest = gimple_assign_lhs (orig_stmt);
5147 scalar_type = TREE_TYPE (scalar_dest);
5148 scalar_results.create (group_size);
5149 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
5150 bitsize = TYPE_SIZE (scalar_type);
5152 /* In case this is a reduction in an inner-loop while vectorizing an outer
5153 loop - we don't need to extract a single scalar result at the end of the
5154 inner-loop (unless it is double reduction, i.e., the use of reduction is
5155 outside the outer-loop). The final vector of partial results will be used
5156 in the vectorized outer-loop, or reduced to a scalar result at the end of
5157 the outer-loop. */
5158 if (nested_in_vect_loop && !double_reduc)
5159 goto vect_finalize_reduction;
5161 /* SLP reduction without reduction chain, e.g.,
5162 # a1 = phi <a2, a0>
5163 # b1 = phi <b2, b0>
5164 a2 = operation (a1)
5165 b2 = operation (b1) */
5166 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
5168 /* True if we should implement SLP_REDUC using native reduction operations
5169 instead of scalar operations. */
5170 direct_slp_reduc = (reduc_fn != IFN_LAST
5171 && slp_reduc
5172 && !TYPE_VECTOR_SUBPARTS (vectype).is_constant ());
5174 /* In case of reduction chain, e.g.,
5175 # a1 = phi <a3, a0>
5176 a2 = operation (a1)
5177 a3 = operation (a2),
5179 we may end up with more than one vector result. Here we reduce them to
5180 one vector. */
5181 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)) || direct_slp_reduc)
5183 tree first_vect = PHI_RESULT (new_phis[0]);
5184 gassign *new_vec_stmt = NULL;
5185 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5186 for (k = 1; k < new_phis.length (); k++)
5188 gimple *next_phi = new_phis[k];
5189 tree second_vect = PHI_RESULT (next_phi);
5190 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5191 new_vec_stmt = gimple_build_assign (tem, code,
5192 first_vect, second_vect);
5193 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5194 first_vect = tem;
5197 new_phi_result = first_vect;
5198 if (new_vec_stmt)
5200 new_phis.truncate (0);
5201 new_phis.safe_push (new_vec_stmt);
5204 /* Likewise if we couldn't use a single def-use cycle. */
5205 else if (ncopies > 1)
5207 gcc_assert (new_phis.length () == 1);
5208 tree first_vect = PHI_RESULT (new_phis[0]);
5209 gassign *new_vec_stmt = NULL;
5210 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5211 gimple *next_phi = new_phis[0];
5212 for (int k = 1; k < ncopies; ++k)
5214 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
5215 tree second_vect = PHI_RESULT (next_phi);
5216 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
5217 new_vec_stmt = gimple_build_assign (tem, code,
5218 first_vect, second_vect);
5219 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
5220 first_vect = tem;
5222 new_phi_result = first_vect;
5223 new_phis.truncate (0);
5224 new_phis.safe_push (new_vec_stmt);
5226 else
5227 new_phi_result = PHI_RESULT (new_phis[0]);
5229 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5230 && reduc_fn != IFN_LAST)
5232 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
5233 various data values where the condition matched and another vector
5234 (INDUCTION_INDEX) containing all the indexes of those matches. We
5235 need to extract the last matching index (which will be the index with
5236 highest value) and use this to index into the data vector.
5237 For the case where there were no matches, the data vector will contain
5238 all default values and the index vector will be all zeros. */
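/* Carrying on the made-up example above: with index vector {0, 2, 0, 8}
   and data vector {d0, d1, d2, d3}, the statements below compute
   max_index = 8, compare it against the index vector to get a mask that is
   true only in lane 3, select {0, 0, 0, d3} from the data vector, and the
   final unsigned MAX reduction therefore yields d3.  */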
5240 /* Get various versions of the type of the vector of indexes. */
5241 tree index_vec_type = TREE_TYPE (induction_index);
5242 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
5243 tree index_scalar_type = TREE_TYPE (index_vec_type);
5244 tree index_vec_cmp_type = build_same_sized_truth_vector_type
5245 (index_vec_type);
5247 /* Get an unsigned integer version of the type of the data vector. */
5248 int scalar_precision
5249 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
5250 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
5251 tree vectype_unsigned = build_vector_type
5252 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
5254 /* First we need to create a vector (ZERO_VEC) of zeros and another
5255 vector (MAX_INDEX_VEC) filled with the last matching index, which we
5256 can create using a MAX reduction and then expanding.
5257 In the case where the loop never made any matches, the max index will
5258 be zero. */
5260 /* Vector of {0, 0, 0,...}. */
5261 tree zero_vec = make_ssa_name (vectype);
5262 tree zero_vec_rhs = build_zero_cst (vectype);
5263 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
5264 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
5266 /* Find maximum value from the vector of found indexes. */
5267 tree max_index = make_ssa_name (index_scalar_type);
5268 gcall *max_index_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5269 1, induction_index);
5270 gimple_call_set_lhs (max_index_stmt, max_index);
5271 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
5273 /* Vector of {max_index, max_index, max_index,...}. */
5274 tree max_index_vec = make_ssa_name (index_vec_type);
5275 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
5276 max_index);
5277 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
5278 max_index_vec_rhs);
5279 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
5281 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
5282 with the vector (INDUCTION_INDEX) of found indexes, choosing values
5283 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
5284 otherwise. Only one value should match, resulting in a vector
5285 (VEC_COND) with one data value and the rest zeros.
5286 In the case where the loop never made any matches, every index will
5287 match, resulting in a vector with all data values (which will all be
5288 the default value). */
5290 /* Compare the max index vector to the vector of found indexes to find
5291 the position of the max value. */
5292 tree vec_compare = make_ssa_name (index_vec_cmp_type);
5293 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
5294 induction_index,
5295 max_index_vec);
5296 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
5298 /* Use the compare to choose either values from the data vector or
5299 zero. */
5300 tree vec_cond = make_ssa_name (vectype);
5301 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
5302 vec_compare, new_phi_result,
5303 zero_vec);
5304 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
5306 /* Finally we need to extract the data value from the vector (VEC_COND)
5307 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
5308 reduction, but because this doesn't exist, we can use a MAX reduction
5309 instead. The data value might be signed or a float so we need to cast
5310 it first.
5311 In the case where the loop never made any matches, the data values are
5312 all identical, and so will reduce down correctly. */
5314 /* Make the matched data values unsigned. */
5315 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
5316 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
5317 vec_cond);
5318 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
5319 VIEW_CONVERT_EXPR,
5320 vec_cond_cast_rhs);
5321 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
5323 /* Reduce down to a scalar value. */
5324 tree data_reduc = make_ssa_name (scalar_type_unsigned);
5325 gcall *data_reduc_stmt = gimple_build_call_internal (IFN_REDUC_MAX,
5326 1, vec_cond_cast);
5327 gimple_call_set_lhs (data_reduc_stmt, data_reduc);
5328 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
5330 /* Convert the reduced value back to the result type and set as the
5331 result. */
5332 gimple_seq stmts = NULL;
5333 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
5334 data_reduc);
5335 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5336 scalar_results.safe_push (new_temp);
5338 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5339 && reduc_fn == IFN_LAST)
5341 /* Condition reduction without supported IFN_REDUC_MAX. Generate
5342 idx = 0;
5343 idx_val = induction_index[0];
5344 val = data_reduc[0];
5345 for (idx = 0, val = init, i = 0; i < nelts; ++i)
5346 if (induction_index[i] > idx_val)
5347 val = data_reduc[i], idx_val = induction_index[i];
5348 return val; */
5350 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
5351 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
5352 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
5353 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
5354 /* Enforced by vectorizable_reduction, which ensures we have target
5355 support before allowing a conditional reduction on variable-length
5356 vectors. */
5357 unsigned HOST_WIDE_INT v_size = el_size * nunits.to_constant ();
5358 tree idx_val = NULL_TREE, val = NULL_TREE;
5359 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
5361 tree old_idx_val = idx_val;
5362 tree old_val = val;
5363 idx_val = make_ssa_name (idx_eltype);
5364 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
5365 build3 (BIT_FIELD_REF, idx_eltype,
5366 induction_index,
5367 bitsize_int (el_size),
5368 bitsize_int (off)));
5369 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5370 val = make_ssa_name (data_eltype);
5371 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
5372 build3 (BIT_FIELD_REF,
5373 data_eltype,
5374 new_phi_result,
5375 bitsize_int (el_size),
5376 bitsize_int (off)));
5377 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5378 if (off != 0)
5380 tree new_idx_val = idx_val;
5381 tree new_val = val;
5382 if (off != v_size - el_size)
5384 new_idx_val = make_ssa_name (idx_eltype);
5385 epilog_stmt = gimple_build_assign (new_idx_val,
5386 MAX_EXPR, idx_val,
5387 old_idx_val);
5388 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5390 new_val = make_ssa_name (data_eltype);
5391 epilog_stmt = gimple_build_assign (new_val,
5392 COND_EXPR,
5393 build2 (GT_EXPR,
5394 boolean_type_node,
5395 idx_val,
5396 old_idx_val),
5397 val, old_val);
5398 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5399 idx_val = new_idx_val;
5400 val = new_val;
5403 /* Convert the reduced value back to the result type and set as the
5404 result. */
5405 gimple_seq stmts = NULL;
5406 val = gimple_convert (&stmts, scalar_type, val);
5407 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
5408 scalar_results.safe_push (val);
5411 /* 2.3 Create the reduction code, using one of the three schemes described
5412 above. In SLP we simply need to extract all the elements from the
5413 vector (without reducing them), so we use scalar shifts. */
5414 else if (reduc_fn != IFN_LAST && !slp_reduc)
5416 tree tmp;
5417 tree vec_elem_type;
5419 /* Case 1: Create:
5420 v_out2 = reduc_expr <v_out1> */
5422 if (dump_enabled_p ())
5423 dump_printf_loc (MSG_NOTE, vect_location,
5424 "Reduce using direct vector reduction.\n");
5426 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5427 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5429 tree tmp_dest
5430 = vect_create_destination_var (scalar_dest, vec_elem_type);
5431 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5432 new_phi_result);
5433 gimple_set_lhs (epilog_stmt, tmp_dest);
5434 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5435 gimple_set_lhs (epilog_stmt, new_temp);
5436 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5438 epilog_stmt = gimple_build_assign (new_scalar_dest, NOP_EXPR,
5439 new_temp);
5441 else
5443 epilog_stmt = gimple_build_call_internal (reduc_fn, 1,
5444 new_phi_result);
5445 gimple_set_lhs (epilog_stmt, new_scalar_dest);
5448 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5449 gimple_set_lhs (epilog_stmt, new_temp);
5450 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5452 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5453 == INTEGER_INDUC_COND_REDUCTION)
5454 && !operand_equal_p (initial_def, induc_val, 0))
5456 /* Earlier we set the initial value to be a vector of induc_val
5457 values. Check the result and if it is induc_val then replace
5458 with the original initial value, unless induc_val is
5459 the same as initial_def already. */
5460 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5461 induc_val);
5463 tmp = make_ssa_name (new_scalar_dest);
5464 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5465 initial_def, new_temp);
5466 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5467 new_temp = tmp;
5470 scalar_results.safe_push (new_temp);
5472 else if (direct_slp_reduc)
5474 /* Here we create one vector for each of the GROUP_SIZE results,
5475 with the elements for other SLP statements replaced with the
5476 neutral value. We can then do a normal reduction on each vector. */
5478 /* Enforced by vectorizable_reduction. */
5479 gcc_assert (new_phis.length () == 1);
5480 gcc_assert (pow2p_hwi (group_size));
5482 slp_tree orig_phis_slp_node = slp_node_instance->reduc_phis;
5483 vec<gimple *> orig_phis = SLP_TREE_SCALAR_STMTS (orig_phis_slp_node);
5484 gimple_seq seq = NULL;
5486 /* Build a vector {0, 1, 2, ...}, with the same number of elements
5487 and the same element size as VECTYPE. */
5488 tree index = build_index_vector (vectype, 0, 1);
5489 tree index_type = TREE_TYPE (index);
5490 tree index_elt_type = TREE_TYPE (index_type);
5491 tree mask_type = build_same_sized_truth_vector_type (index_type);
5493 /* Create a vector that, for each element, identifies which of
5494 the GROUP_SIZE results should use it. */
5495 tree index_mask = build_int_cst (index_elt_type, group_size - 1);
5496 index = gimple_build (&seq, BIT_AND_EXPR, index_type, index,
5497 build_vector_from_val (index_type, index_mask));
5499 /* Get a neutral vector value. This is simply a splat of the neutral
5500 scalar value if we have one, otherwise the initial scalar value
5501 is itself a neutral value. */
5502 tree vector_identity = NULL_TREE;
5503 if (neutral_op)
5504 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5505 neutral_op);
5506 for (unsigned int i = 0; i < group_size; ++i)
5508 /* If there's no universal neutral value, we can use the
5509 initial scalar value from the original PHI. This is used
5510 for MIN and MAX reduction, for example. */
5511 if (!neutral_op)
5513 tree scalar_value
5514 = PHI_ARG_DEF_FROM_EDGE (orig_phis[i],
5515 loop_preheader_edge (loop));
5516 vector_identity = gimple_build_vector_from_val (&seq, vectype,
5517 scalar_value);
5520 /* Calculate the equivalent of:
5522 sel[j] = (index[j] == i);
5524 which selects the elements of NEW_PHI_RESULT that should
5525 be included in the result. */
5526 tree compare_val = build_int_cst (index_elt_type, i);
5527 compare_val = build_vector_from_val (index_type, compare_val);
5528 tree sel = gimple_build (&seq, EQ_EXPR, mask_type,
5529 index, compare_val);
5531 /* Calculate the equivalent of:
5533 vec = sel ? new_phi_result : vector_identity;
5535 VEC is now suitable for a full vector reduction. */
5536 tree vec = gimple_build (&seq, VEC_COND_EXPR, vectype,
5537 sel, new_phi_result, vector_identity);
5539 /* Do the reduction and convert it to the appropriate type. */
5540 gcall *call = gimple_build_call_internal (reduc_fn, 1, vec);
5541 tree scalar = make_ssa_name (TREE_TYPE (vectype));
5542 gimple_call_set_lhs (call, scalar);
5543 gimple_seq_add_stmt (&seq, call);
5544 scalar = gimple_convert (&seq, scalar_type, scalar);
5545 scalar_results.safe_push (scalar);
5547 gsi_insert_seq_before (&exit_gsi, seq, GSI_SAME_STMT);
5549 else
5551 bool reduce_with_shift;
5552 tree vec_temp;
5554 /* COND reductions all do the final reduction with MAX_EXPR
5555 or MIN_EXPR. */
5556 if (code == COND_EXPR)
5558 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5559 == INTEGER_INDUC_COND_REDUCTION)
5560 code = induc_code;
5561 else
5562 code = MAX_EXPR;
5565 /* See if the target wants to do the final (shift) reduction
5566 in a vector mode of smaller size and first reduce upper/lower
5567 halves against each other. */
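      /* Illustrative sketch only (the 256-bit/128-bit split is an
         assumption, not a requirement): if the target splits a V4DI
         accumulator ACC into V2DI halves, the loop below builds
           lo  = lowpart  <V2DI> (acc)    -- BIT_FIELD_REF at bit 0
           hi  = highpart <V2DI> (acc)    -- BIT_FIELD_REF at bit 128
           acc = lo CODE hi
         so that the shift-based or scalar reduction that follows only has
         to handle the narrower vector.  */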
5568 enum machine_mode mode1 = mode;
5569 tree vectype1 = vectype;
5570 unsigned sz = tree_to_uhwi (TYPE_SIZE_UNIT (vectype));
5571 unsigned sz1 = sz;
5572 if (!slp_reduc
5573 && (mode1 = targetm.vectorize.split_reduction (mode)) != mode)
5574 sz1 = GET_MODE_SIZE (mode1).to_constant ();
5576 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz1);
5577 reduce_with_shift = have_whole_vector_shift (mode1);
5578 if (!VECTOR_MODE_P (mode1))
5579 reduce_with_shift = false;
5580 else
5582 optab optab = optab_for_tree_code (code, vectype1, optab_default);
5583 if (optab_handler (optab, mode1) == CODE_FOR_nothing)
5584 reduce_with_shift = false;
5587 /* First reduce the vector to the size on which we should do the
5588 shift reduction, by combining upper and lower halves. */
5589 new_temp = new_phi_result;
5590 while (sz > sz1)
5592 gcc_assert (!slp_reduc);
5593 sz /= 2;
5594 vectype1 = get_vectype_for_scalar_type_and_size (scalar_type, sz);
5596 /* The target has to make sure we support lowpart/highpart
5597 extraction, either via direct vector extract or through
5598 integer mode punning. */
5599 tree dst1, dst2;
5600 if (convert_optab_handler (vec_extract_optab,
5601 TYPE_MODE (TREE_TYPE (new_temp)),
5602 TYPE_MODE (vectype1))
5603 != CODE_FOR_nothing)
5605 /* Extract sub-vectors directly once vec_extract becomes
5606 a conversion optab. */
5607 dst1 = make_ssa_name (vectype1);
5608 epilog_stmt
5609 = gimple_build_assign (dst1, BIT_FIELD_REF,
5610 build3 (BIT_FIELD_REF, vectype1,
5611 new_temp, TYPE_SIZE (vectype1),
5612 bitsize_int (0)));
5613 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5614 dst2 = make_ssa_name (vectype1);
5615 epilog_stmt
5616 = gimple_build_assign (dst2, BIT_FIELD_REF,
5617 build3 (BIT_FIELD_REF, vectype1,
5618 new_temp, TYPE_SIZE (vectype1),
5619 bitsize_int (sz * BITS_PER_UNIT)));
5620 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5622 else
5624 /* Extract via punning to appropriately sized integer mode
5625 vector. */
5626 tree eltype = build_nonstandard_integer_type (sz * BITS_PER_UNIT,
5627 1);
5628 tree etype = build_vector_type (eltype, 2);
5629 gcc_assert (convert_optab_handler (vec_extract_optab,
5630 TYPE_MODE (etype),
5631 TYPE_MODE (eltype))
5632 != CODE_FOR_nothing);
5633 tree tem = make_ssa_name (etype);
5634 epilog_stmt = gimple_build_assign (tem, VIEW_CONVERT_EXPR,
5635 build1 (VIEW_CONVERT_EXPR,
5636 etype, new_temp));
5637 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5638 new_temp = tem;
5639 tem = make_ssa_name (eltype);
5640 epilog_stmt
5641 = gimple_build_assign (tem, BIT_FIELD_REF,
5642 build3 (BIT_FIELD_REF, eltype,
5643 new_temp, TYPE_SIZE (eltype),
5644 bitsize_int (0)));
5645 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5646 dst1 = make_ssa_name (vectype1);
5647 epilog_stmt = gimple_build_assign (dst1, VIEW_CONVERT_EXPR,
5648 build1 (VIEW_CONVERT_EXPR,
5649 vectype1, tem));
5650 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5651 tem = make_ssa_name (eltype);
5652 epilog_stmt
5653 = gimple_build_assign (tem, BIT_FIELD_REF,
5654 build3 (BIT_FIELD_REF, eltype,
5655 new_temp, TYPE_SIZE (eltype),
5656 bitsize_int (sz * BITS_PER_UNIT)));
5657 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5658 dst2 = make_ssa_name (vectype1);
5659 epilog_stmt = gimple_build_assign (dst2, VIEW_CONVERT_EXPR,
5660 build1 (VIEW_CONVERT_EXPR,
5661 vectype1, tem));
5662 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5665 new_temp = make_ssa_name (vectype1);
5666 epilog_stmt = gimple_build_assign (new_temp, code, dst1, dst2);
5667 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5670 if (reduce_with_shift && !slp_reduc)
5672 int element_bitsize = tree_to_uhwi (bitsize);
5673 /* Enforced by vectorizable_reduction, which disallows SLP reductions
5674 for variable-length vectors and also requires direct target support
5675 for loop reductions. */
5676 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5677 int nelements = vec_size_in_bits / element_bitsize;
5678 vec_perm_builder sel;
5679 vec_perm_indices indices;
5681 int elt_offset;
5683 tree zero_vec = build_zero_cst (vectype1);
5684 /* Case 2: Create:
5685 for (offset = nelements/2; offset >= 1; offset/=2)
5687 Create: va' = vec_shift <va, offset>
5688 Create: va = vop <va, va'>
5689 } */
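              /* Worked illustration (assumes four elements and PLUS_EXPR;
                 "x" marks don't-care lanes):
                   va  = { a0, a1, a2, a3 }
                   va' = vec_shift <va, 2> = { a2, a3, 0, 0 }
                   va  = va + va'          = { a0+a2, a1+a3, x, x }
                   va' = vec_shift <va, 1> = { a1+a3, x, x, 0 }
                   va  = va + va'          = { a0+a1+a2+a3, x, x, x }
                 so element 0 holds the reduction result, which step 2.4
                 below extracts.  */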
5691 tree rhs;
5693 if (dump_enabled_p ())
5694 dump_printf_loc (MSG_NOTE, vect_location,
5695 "Reduce using vector shifts\n");
5697 mode1 = TYPE_MODE (vectype1);
5698 vec_dest = vect_create_destination_var (scalar_dest, vectype1);
5699 for (elt_offset = nelements / 2;
5700 elt_offset >= 1;
5701 elt_offset /= 2)
5703 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5704 indices.new_vector (sel, 2, nelements);
5705 tree mask = vect_gen_perm_mask_any (vectype1, indices);
5706 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5707 new_temp, zero_vec, mask);
5708 new_name = make_ssa_name (vec_dest, epilog_stmt);
5709 gimple_assign_set_lhs (epilog_stmt, new_name);
5710 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5712 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5713 new_temp);
5714 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5715 gimple_assign_set_lhs (epilog_stmt, new_temp);
5716 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5719 /* 2.4 Extract the final scalar result. Create:
5720 s_out3 = extract_field <v_out2, bitpos> */
5722 if (dump_enabled_p ())
5723 dump_printf_loc (MSG_NOTE, vect_location,
5724 "extract scalar result\n");
5726 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5727 bitsize, bitsize_zero_node);
5728 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5729 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5730 gimple_assign_set_lhs (epilog_stmt, new_temp);
5731 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5732 scalar_results.safe_push (new_temp);
5734 else
5736 /* Case 3: Create:
5737 s = extract_field <v_out2, 0>
5738 for (offset = element_size;
5739 offset < vector_size;
5740 offset += element_size;)
5742 Create: s' = extract_field <v_out2, offset>
5743 Create: s = op <s, s'> // For non SLP cases
5744 } */
5746 if (dump_enabled_p ())
5747 dump_printf_loc (MSG_NOTE, vect_location,
5748 "Reduce using scalar code.\n");
5750 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype1));
5751 int element_bitsize = tree_to_uhwi (bitsize);
5752 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5754 int bit_offset;
5755 if (gimple_code (new_phi) == GIMPLE_PHI)
5756 vec_temp = PHI_RESULT (new_phi);
5757 else
5758 vec_temp = gimple_assign_lhs (new_phi);
5759 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5760 bitsize_zero_node);
5761 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5762 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5763 gimple_assign_set_lhs (epilog_stmt, new_temp);
5764 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5766 /* In SLP we don't need to apply the reduction operation, so we just
5767 collect s' values in SCALAR_RESULTS. */
5768 if (slp_reduc)
5769 scalar_results.safe_push (new_temp);
5771 for (bit_offset = element_bitsize;
5772 bit_offset < vec_size_in_bits;
5773 bit_offset += element_bitsize)
5775 tree bitpos = bitsize_int (bit_offset);
5776 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5777 bitsize, bitpos);
5779 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5780 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5781 gimple_assign_set_lhs (epilog_stmt, new_name);
5782 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5784 if (slp_reduc)
5786 /* In SLP we don't need to apply the reduction operation, so
5787 we just collect s' values in SCALAR_RESULTS. */
5788 new_temp = new_name;
5789 scalar_results.safe_push (new_name);
5791 else
5793 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5794 new_name, new_temp);
5795 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5796 gimple_assign_set_lhs (epilog_stmt, new_temp);
5797 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5802 /* The only case where we need to reduce scalar results in SLP is
5803 unrolling. If the size of SCALAR_RESULTS is greater than
5804 GROUP_SIZE, we reduce them by combining elements modulo
5805 GROUP_SIZE. */
5806 if (slp_reduc)
5808 tree res, first_res, new_res;
5809 gimple *new_stmt;
5811 /* Reduce multiple scalar results in case of SLP unrolling. */
5812 for (j = group_size; scalar_results.iterate (j, &res);
5813 j++)
5815 first_res = scalar_results[j % group_size];
5816 new_stmt = gimple_build_assign (new_scalar_dest, code,
5817 first_res, res);
5818 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5819 gimple_assign_set_lhs (new_stmt, new_res);
5820 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5821 scalar_results[j % group_size] = new_res;
5824 else
5825 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5826 scalar_results.safe_push (new_temp);
5829 if ((STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5830 == INTEGER_INDUC_COND_REDUCTION)
5831 && !operand_equal_p (initial_def, induc_val, 0))
5833 /* Earlier we set the initial value to be a vector of induc_val
5834 values. Check the result and if it is induc_val then replace
5835 with the original initial value, unless induc_val is
5836 the same as initial_def already. */
5837 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp,
5838 induc_val);
5840 tree tmp = make_ssa_name (new_scalar_dest);
5841 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5842 initial_def, new_temp);
5843 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5844 scalar_results[0] = tmp;
5848 vect_finalize_reduction:
5850 if (double_reduc)
5851 loop = loop->inner;
5853 /* 2.5 Adjust the final result by the initial value of the reduction
5854 variable. (When such adjustment is not needed, then
5855 'adjustment_def' is zero). For example, if code is PLUS we create:
5856 new_temp = loop_exit_def + adjustment_def */
5858 if (adjustment_def)
5860 gcc_assert (!slp_reduc);
5861 if (nested_in_vect_loop)
5863 new_phi = new_phis[0];
5864 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5865 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5866 new_dest = vect_create_destination_var (scalar_dest, vectype);
5868 else
5870 new_temp = scalar_results[0];
5871 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5872 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5873 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5876 epilog_stmt = gimple_build_assign (new_dest, expr);
5877 new_temp = make_ssa_name (new_dest, epilog_stmt);
5878 gimple_assign_set_lhs (epilog_stmt, new_temp);
5879 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5880 if (nested_in_vect_loop)
5882 set_vinfo_for_stmt (epilog_stmt,
5883 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5884 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5885 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5887 if (!double_reduc)
5888 scalar_results.quick_push (new_temp);
5889 else
5890 scalar_results[0] = new_temp;
5892 else
5893 scalar_results[0] = new_temp;
5895 new_phis[0] = epilog_stmt;
5898 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5899 phis with new adjusted scalar results, i.e., replace use <s_out0>
5900 with use <s_out4>.
5902 Transform:
5903 loop_exit:
5904 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5905 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5906 v_out2 = reduce <v_out1>
5907 s_out3 = extract_field <v_out2, 0>
5908 s_out4 = adjust_result <s_out3>
5909 use <s_out0>
5910 use <s_out0>
5912 into:
5914 loop_exit:
5915 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5916 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5917 v_out2 = reduce <v_out1>
5918 s_out3 = extract_field <v_out2, 0>
5919 s_out4 = adjust_result <s_out3>
5920 use <s_out4>
5921 use <s_out4> */
5924 /* In an SLP reduction chain we reduce vector results into one vector if
5925 necessary; hence we set GROUP_SIZE to 1 here. SCALAR_DEST is the LHS of
5926 the last stmt in the reduction chain, since we are looking for the loop
5927 exit phi node. */
5928 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5930 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5931 /* Handle reduction patterns. */
5932 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5933 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5935 scalar_dest = gimple_assign_lhs (dest_stmt);
5936 group_size = 1;
5939 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5940 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
5941 need to match SCALAR_RESULTS with corresponding statements. The first
5942 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5943 the first vector stmt, etc.
5944 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
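  /* For instance (illustration only): with GROUP_SIZE == 4 and two vector
     statements in NEW_PHIS, RATIO is 2; scalar results 0 and 1 are matched
     with the first vector statement and results 2 and 3 with the second.  */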
5945 if (group_size > new_phis.length ())
5947 ratio = group_size / new_phis.length ();
5948 gcc_assert (!(group_size % new_phis.length ()));
5950 else
5951 ratio = 1;
5953 for (k = 0; k < group_size; k++)
5955 if (k % ratio == 0)
5957 epilog_stmt = new_phis[k / ratio];
5958 reduction_phi = reduction_phis[k / ratio];
5959 if (double_reduc)
5960 inner_phi = inner_phis[k / ratio];
5963 if (slp_reduc)
5965 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5967 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5968 /* SLP statements can't participate in patterns. */
5969 gcc_assert (!orig_stmt);
5970 scalar_dest = gimple_assign_lhs (current_stmt);
5973 phis.create (3);
5974 /* Find the loop-closed-use at the loop exit of the original scalar
5975 result. (The reduction result is expected to have two immediate uses -
5976 one at the latch block, and one at the loop exit). */
5977 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5978 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5979 && !is_gimple_debug (USE_STMT (use_p)))
5980 phis.safe_push (USE_STMT (use_p));
5982 /* While we expect to have found an exit_phi because of loop-closed-ssa
5983 form, we can end up without one if the scalar cycle is dead. */
5985 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5987 if (outer_loop)
5989 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5990 gphi *vect_phi;
5992 /* FORNOW. Currently not supporting the case that an inner-loop
5993 reduction is not used in the outer-loop (but only outside the
5994 outer-loop), unless it is a double reduction. */
5995 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5996 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5997 || double_reduc);
5999 if (double_reduc)
6000 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
6001 else
6002 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
6003 if (!double_reduc
6004 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
6005 != vect_double_reduction_def)
6006 continue;
6008 /* Handle double reduction:
6010 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
6011 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
6012 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
6013 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
6015 At that point the regular reduction (stmt2 and stmt3) is
6016 already vectorized, as well as the exit phi node, stmt4.
6017 Here we vectorize the phi node of double reduction, stmt1, and
6018 update all relevant statements. */
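            /* At the source level this corresponds (roughly; the exact
               gimple depends on earlier transforms) to outer-loop
               vectorization of a nest such as
                 for (i = 0; i < n1; i++)
                   for (j = 0; j < n2; j++)
                     l += a[j];
               where s1/s2 are the accumulator phis of the outer loop and
               s3/s4 those of the inner loop.  */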
6020 /* Go through all the uses of s2 to find double reduction phi
6021 node, i.e., stmt1 above. */
6022 orig_name = PHI_RESULT (exit_phi);
6023 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6025 stmt_vec_info use_stmt_vinfo;
6026 stmt_vec_info new_phi_vinfo;
6027 tree vect_phi_init, preheader_arg, vect_phi_res;
6028 basic_block bb = gimple_bb (use_stmt);
6029 gimple *use;
6031 /* Check that USE_STMT is really a double reduction phi
6032 node. */
6033 if (gimple_code (use_stmt) != GIMPLE_PHI
6034 || gimple_phi_num_args (use_stmt) != 2
6035 || bb->loop_father != outer_loop)
6036 continue;
6037 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
6038 if (!use_stmt_vinfo
6039 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
6040 != vect_double_reduction_def)
6041 continue;
6043 /* Create vector phi node for double reduction:
6044 vs1 = phi <vs0, vs2>
6045 vs1 was created previously in this function by a call to
6046 vect_get_vec_def_for_operand and is stored in
6047 vec_initial_def;
6048 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
6049 vs0 is created here. */
6051 /* Create vector phi node. */
6052 vect_phi = create_phi_node (vec_initial_def, bb);
6053 new_phi_vinfo = new_stmt_vec_info (vect_phi,
6054 loop_vec_info_for_loop (outer_loop));
6055 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
6057 /* Create vs0 - initial def of the double reduction phi. */
6058 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
6059 loop_preheader_edge (outer_loop));
6060 vect_phi_init = get_initial_def_for_reduction
6061 (stmt, preheader_arg, NULL);
6063 /* Update phi node arguments with vs0 and vs2. */
6064 add_phi_arg (vect_phi, vect_phi_init,
6065 loop_preheader_edge (outer_loop),
6066 UNKNOWN_LOCATION);
6067 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
6068 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
6069 if (dump_enabled_p ())
6071 dump_printf_loc (MSG_NOTE, vect_location,
6072 "created double reduction phi node: ");
6073 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
6076 vect_phi_res = PHI_RESULT (vect_phi);
6078 /* Replace the use, i.e., set the correct vs1 in the regular
6079 reduction phi node. FORNOW, NCOPIES is always 1, so the
6080 loop is redundant. */
6081 use = reduction_phi;
6082 for (j = 0; j < ncopies; j++)
6084 edge pr_edge = loop_preheader_edge (loop);
6085 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
6086 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
6092 phis.release ();
6093 if (nested_in_vect_loop)
6095 if (double_reduc)
6096 loop = outer_loop;
6097 else
6098 continue;
6101 phis.create (3);
6102 /* Find the loop-closed-use at the loop exit of the original scalar
6103 result. (The reduction result is expected to have two immediate uses,
6104 one at the latch block, and one at the loop exit). For double
6105 reductions we are looking for exit phis of the outer loop. */
6106 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
6108 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
6110 if (!is_gimple_debug (USE_STMT (use_p)))
6111 phis.safe_push (USE_STMT (use_p));
6113 else
6115 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
6117 tree phi_res = PHI_RESULT (USE_STMT (use_p));
6119 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
6121 if (!flow_bb_inside_loop_p (loop,
6122 gimple_bb (USE_STMT (phi_use_p)))
6123 && !is_gimple_debug (USE_STMT (phi_use_p)))
6124 phis.safe_push (USE_STMT (phi_use_p));
6130 FOR_EACH_VEC_ELT (phis, i, exit_phi)
6132 /* Replace the uses: */
6133 orig_name = PHI_RESULT (exit_phi);
6134 scalar_result = scalar_results[k];
6135 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
6136 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
6137 SET_USE (use_p, scalar_result);
6140 phis.release ();
6144 /* Return a vector of type VECTYPE that is equal to the vector select
6145 operation "MASK ? VEC : IDENTITY". Insert the select statements
6146 before GSI. */
6148 static tree
6149 merge_with_identity (gimple_stmt_iterator *gsi, tree mask, tree vectype,
6150 tree vec, tree identity)
6152 tree cond = make_temp_ssa_name (vectype, NULL, "cond");
6153 gimple *new_stmt = gimple_build_assign (cond, VEC_COND_EXPR,
6154 mask, vec, identity);
6155 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6156 return cond;
6159 /* Successively apply CODE to each element of VECTOR_RHS, in left-to-right
6160 order, starting with LHS. Insert the extraction statements before GSI and
6161 associate the new scalar SSA names with variable SCALAR_DEST.
6162 Return the SSA name for the result. */
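/* For example (illustrative only): with a four-element VECTOR_RHS and
   CODE == PLUS_EXPR the emitted chain is
     s0 = LHS + VECTOR_RHS[0];
     s1 = s0  + VECTOR_RHS[1];
     s2 = s1  + VECTOR_RHS[2];
     s3 = s2  + VECTOR_RHS[3];
   and s3 is returned, preserving the strict left-to-right evaluation order
   that an in-order (fold-left) reduction requires.  */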
6164 static tree
6165 vect_expand_fold_left (gimple_stmt_iterator *gsi, tree scalar_dest,
6166 tree_code code, tree lhs, tree vector_rhs)
6168 tree vectype = TREE_TYPE (vector_rhs);
6169 tree scalar_type = TREE_TYPE (vectype);
6170 tree bitsize = TYPE_SIZE (scalar_type);
6171 unsigned HOST_WIDE_INT vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
6172 unsigned HOST_WIDE_INT element_bitsize = tree_to_uhwi (bitsize);
6174 for (unsigned HOST_WIDE_INT bit_offset = 0;
6175 bit_offset < vec_size_in_bits;
6176 bit_offset += element_bitsize)
6178 tree bitpos = bitsize_int (bit_offset);
6179 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vector_rhs,
6180 bitsize, bitpos);
6182 gassign *stmt = gimple_build_assign (scalar_dest, rhs);
6183 rhs = make_ssa_name (scalar_dest, stmt);
6184 gimple_assign_set_lhs (stmt, rhs);
6185 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6187 stmt = gimple_build_assign (scalar_dest, code, lhs, rhs);
6188 tree new_name = make_ssa_name (scalar_dest, stmt);
6189 gimple_assign_set_lhs (stmt, new_name);
6190 gsi_insert_before (gsi, stmt, GSI_SAME_STMT);
6191 lhs = new_name;
6193 return lhs;
6196 /* Perform an in-order reduction (FOLD_LEFT_REDUCTION). STMT is the
6197 statement that sets the live-out value. REDUC_DEF_STMT is the phi
6198 statement. CODE is the operation performed by STMT and OPS are
6199 its scalar operands. REDUC_INDEX is the index of the operand in
6200 OPS that is set by REDUC_DEF_STMT. REDUC_FN is the function that
6201 implements in-order reduction, or IFN_LAST if we should open-code it.
6202 VECTYPE_IN is the type of the vector input. MASKS specifies the masks
6203 that should be used to control the operation in a fully-masked loop. */
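/* Illustrative motivation (hedged; the decision to use FOLD_LEFT_REDUCTION
   is made during analysis, not here): a loop such as
     double res = init;
     for (int i = 0; i < n; i++)
       res += a[i];
   compiled without permission to reassociate must keep the additions in
   source order, so instead of a tree-shaped reduction we either use a
   target in-order reduction function (REDUC_FN) or expand the chain
   element by element via vect_expand_fold_left.  */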
6205 static bool
6206 vectorize_fold_left_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6207 gimple **vec_stmt, slp_tree slp_node,
6208 gimple *reduc_def_stmt,
6209 tree_code code, internal_fn reduc_fn,
6210 tree ops[3], tree vectype_in,
6211 int reduc_index, vec_loop_masks *masks)
6213 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6214 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6215 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6216 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6217 gimple *new_stmt = NULL;
6219 int ncopies;
6220 if (slp_node)
6221 ncopies = 1;
6222 else
6223 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6225 gcc_assert (!nested_in_vect_loop_p (loop, stmt));
6226 gcc_assert (ncopies == 1);
6227 gcc_assert (TREE_CODE_LENGTH (code) == binary_op);
6228 gcc_assert (reduc_index == (code == MINUS_EXPR ? 0 : 1));
6229 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6230 == FOLD_LEFT_REDUCTION);
6232 if (slp_node)
6233 gcc_assert (known_eq (TYPE_VECTOR_SUBPARTS (vectype_out),
6234 TYPE_VECTOR_SUBPARTS (vectype_in)));
6236 tree op0 = ops[1 - reduc_index];
6238 int group_size = 1;
6239 gimple *scalar_dest_def;
6240 auto_vec<tree> vec_oprnds0;
6241 if (slp_node)
6243 vect_get_vec_defs (op0, NULL_TREE, stmt, &vec_oprnds0, NULL, slp_node);
6244 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6245 scalar_dest_def = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
6247 else
6249 tree loop_vec_def0 = vect_get_vec_def_for_operand (op0, stmt);
6250 vec_oprnds0.create (1);
6251 vec_oprnds0.quick_push (loop_vec_def0);
6252 scalar_dest_def = stmt;
6255 tree scalar_dest = gimple_assign_lhs (scalar_dest_def);
6256 tree scalar_type = TREE_TYPE (scalar_dest);
6257 tree reduc_var = gimple_phi_result (reduc_def_stmt);
6259 int vec_num = vec_oprnds0.length ();
6260 gcc_assert (vec_num == 1 || slp_node);
6261 tree vec_elem_type = TREE_TYPE (vectype_out);
6262 gcc_checking_assert (useless_type_conversion_p (scalar_type, vec_elem_type));
6264 tree vector_identity = NULL_TREE;
6265 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6266 vector_identity = build_zero_cst (vectype_out);
6268 tree scalar_dest_var = vect_create_destination_var (scalar_dest, NULL);
6269 int i;
6270 tree def0;
6271 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6273 tree mask = NULL_TREE;
6274 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
6275 mask = vect_get_loop_mask (gsi, masks, vec_num, vectype_in, i);
6277 /* Handle MINUS by adding the negative. */
6278 if (reduc_fn != IFN_LAST && code == MINUS_EXPR)
6280 tree negated = make_ssa_name (vectype_out);
6281 new_stmt = gimple_build_assign (negated, NEGATE_EXPR, def0);
6282 gsi_insert_before (gsi, new_stmt, GSI_SAME_STMT);
6283 def0 = negated;
6286 if (mask)
6287 def0 = merge_with_identity (gsi, mask, vectype_out, def0,
6288 vector_identity);
6290 /* On the first iteration the input is simply the scalar phi
6291 result, and for subsequent iterations it is the output of
6292 the preceding operation. */
6293 if (reduc_fn != IFN_LAST)
6295 new_stmt = gimple_build_call_internal (reduc_fn, 2, reduc_var, def0);
6296 /* For chained SLP reductions the output of the previous reduction
6297 operation serves as the input of the next. For the final statement
6298 the output cannot be a temporary - we reuse the original
6299 scalar destination of the last statement. */
6300 if (i != vec_num - 1)
6302 gimple_set_lhs (new_stmt, scalar_dest_var);
6303 reduc_var = make_ssa_name (scalar_dest_var, new_stmt);
6304 gimple_set_lhs (new_stmt, reduc_var);
6307 else
6309 reduc_var = vect_expand_fold_left (gsi, scalar_dest_var, code,
6310 reduc_var, def0);
6311 new_stmt = SSA_NAME_DEF_STMT (reduc_var);
6312 /* Remove the statement, so that we can use the same code paths
6313 as for statements that we've just created. */
6314 gimple_stmt_iterator tmp_gsi = gsi_for_stmt (new_stmt);
6315 gsi_remove (&tmp_gsi, false);
6318 if (i == vec_num - 1)
6320 gimple_set_lhs (new_stmt, scalar_dest);
6321 vect_finish_replace_stmt (scalar_dest_def, new_stmt);
6323 else
6324 vect_finish_stmt_generation (scalar_dest_def, new_stmt, gsi);
6326 if (slp_node)
6327 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6330 if (!slp_node)
6331 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6333 return true;
6336 /* Function is_nonwrapping_integer_induction.
6338 Check if STMT (which is part of loop LOOP) is an integer induction
6339 whose value cannot wrap (cause overflow). */
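/* For example (illustration only): with BASE 0, STEP 4 and at most NI
   iterations, the largest value the induction reaches is 0 + 4 * NI;
   the induction is accepted only if that value fits in the precision of
   the phi result type, or if the type has undefined overflow so that
   wrapping cannot validly occur.  */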
6341 static bool
6342 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
6344 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
6345 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
6346 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
6347 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
6348 widest_int ni, max_loop_value, lhs_max;
6349 bool overflow = false;
6351 /* Make sure the loop is integer based. */
6352 if (TREE_CODE (base) != INTEGER_CST
6353 || TREE_CODE (step) != INTEGER_CST)
6354 return false;
6356 /* Check that the max size of the loop will not wrap. */
6358 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
6359 return true;
6361 if (! max_stmt_executions (loop, &ni))
6362 return false;
6364 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
6365 &overflow);
6366 if (overflow)
6367 return false;
6369 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
6370 TYPE_SIGN (lhs_type), &overflow);
6371 if (overflow)
6372 return false;
6374 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
6375 <= TYPE_PRECISION (lhs_type));
6378 /* Function vectorizable_reduction.
6380 Check if STMT performs a reduction operation that can be vectorized.
6381 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
6382 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
6383 Return FALSE if not a vectorizable STMT, TRUE otherwise.
6385 This function also handles reduction idioms (patterns) that have been
6386 recognized in advance during vect_pattern_recog. In this case, STMT may be
6387 of this form:
6388 X = pattern_expr (arg0, arg1, ..., X)
6389 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
6390 sequence that had been detected and replaced by the pattern-stmt (STMT).
6392 This function also handles reduction of condition expressions, for example:
6393 for (int i = 0; i < N; i++)
6394 if (a[i] < value)
6395 last = a[i];
6396 This is handled by vectorizing the loop and creating an additional vector
6397 containing the loop indexes for which "a[i] < value" was true. In the
6398 function epilogue this is reduced to a single max value and then used to
6399 index into the vector of results.
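   For instance (an illustrative case, not taken from the sources): with
   a[] = { 3, 7, 2, 9 } and value == 5, the condition holds in iterations
   0 and 2; the epilogue takes the maximum of the recorded loop indexes
   (iteration 2) and uses it to pick a[2] == 2 out of the vector of
   results, which is the value the scalar loop would leave in "last".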
6401 In some cases of reduction patterns, the type of the reduction variable X is
6402 different from the type of the other arguments of STMT.
6403 In such cases, the vectype that is used when transforming STMT into a vector
6404 stmt is different than the vectype that is used to determine the
6405 vectorization factor, because it consists of a different number of elements
6406 than the actual number of elements that are being operated upon in parallel.
6408 For example, consider an accumulation of shorts into an int accumulator.
6409 On some targets it's possible to vectorize this pattern operating on 8
6410 shorts at a time (hence, the vectype for purposes of determining the
6411 vectorization factor should be V8HI); on the other hand, the vectype that
6412 is used to create the vector form is actually V4SI (the type of the result).
6414 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
6415 indicates the actual level of parallelism (V8HI in the example), so
6416 that the right vectorization factor is derived. This vectype
6417 corresponds to the type of arguments to the reduction stmt, and should *NOT*
6418 be used to create the vectorized stmt. The right vectype for the vectorized
6419 stmt is obtained from the type of the result X:
6420 get_vectype_for_scalar_type (TREE_TYPE (X))
6422 This means that, contrary to "regular" reductions (or "regular" stmts in
6423 general), the following equation:
6424 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
6425 does *NOT* necessarily hold for reduction patterns. */
6427 bool
6428 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
6429 gimple **vec_stmt, slp_tree slp_node,
6430 slp_instance slp_node_instance)
6432 tree vec_dest;
6433 tree scalar_dest;
6434 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6435 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
6436 tree vectype_in = NULL_TREE;
6437 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6438 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6439 enum tree_code code, orig_code;
6440 internal_fn reduc_fn;
6441 machine_mode vec_mode;
6442 int op_type;
6443 optab optab;
6444 tree new_temp = NULL_TREE;
6445 gimple *def_stmt;
6446 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
6447 gimple *cond_reduc_def_stmt = NULL;
6448 enum tree_code cond_reduc_op_code = ERROR_MARK;
6449 tree scalar_type;
6450 bool is_simple_use;
6451 gimple *orig_stmt;
6452 stmt_vec_info orig_stmt_info = NULL;
6453 int i;
6454 int ncopies;
6455 int epilog_copies;
6456 stmt_vec_info prev_stmt_info, prev_phi_info;
6457 bool single_defuse_cycle = false;
6458 gimple *new_stmt = NULL;
6459 int j;
6460 tree ops[3];
6461 enum vect_def_type dts[3];
6462 bool nested_cycle = false, found_nested_cycle_def = false;
6463 bool double_reduc = false;
6464 basic_block def_bb;
6465 struct loop * def_stmt_loop, *outer_loop = NULL;
6466 tree def_arg;
6467 gimple *def_arg_stmt;
6468 auto_vec<tree> vec_oprnds0;
6469 auto_vec<tree> vec_oprnds1;
6470 auto_vec<tree> vec_oprnds2;
6471 auto_vec<tree> vect_defs;
6472 auto_vec<gimple *> phis;
6473 int vec_num;
6474 tree def0, tem;
6475 bool first_p = true;
6476 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
6477 tree cond_reduc_val = NULL_TREE;
6479 /* Make sure it was already recognized as a reduction computation. */
6480 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
6481 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
6482 return false;
6484 if (nested_in_vect_loop_p (loop, stmt))
6486 outer_loop = loop;
6487 loop = loop->inner;
6488 nested_cycle = true;
6491 /* In case of reduction chain we switch to the first stmt in the chain, but
6492 we don't update STMT_INFO, since only the last stmt is marked as reduction
6493 and has reduction properties. */
6494 if (GROUP_FIRST_ELEMENT (stmt_info)
6495 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
6497 stmt = GROUP_FIRST_ELEMENT (stmt_info);
6498 first_p = false;
6501 if (gimple_code (stmt) == GIMPLE_PHI)
6503 /* Analysis is fully done on the reduction stmt invocation. */
6504 if (! vec_stmt)
6506 if (slp_node)
6507 slp_node_instance->reduc_phis = slp_node;
6509 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6510 return true;
6513 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6514 /* Leave the scalar phi in place. Note that checking
6515 STMT_VINFO_VEC_REDUCTION_TYPE (as below) only works
6516 for reductions involving a single statement. */
6517 return true;
6519 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6520 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
6521 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
6523 if (STMT_VINFO_VEC_REDUCTION_TYPE (vinfo_for_stmt (reduc_stmt))
6524 == EXTRACT_LAST_REDUCTION)
6525 /* Leave the scalar phi in place. */
6526 return true;
6528 gcc_assert (is_gimple_assign (reduc_stmt));
6529 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
6531 tree op = gimple_op (reduc_stmt, k);
6532 if (op == gimple_phi_result (stmt))
6533 continue;
6534 if (k == 1
6535 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
6536 continue;
6537 if (!vectype_in
6538 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6539 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (op)))))
6540 vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
6541 break;
6543 gcc_assert (vectype_in);
6545 if (slp_node)
6546 ncopies = 1;
6547 else
6548 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6550 use_operand_p use_p;
6551 gimple *use_stmt;
6552 if (ncopies > 1
6553 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
6554 <= vect_used_only_live)
6555 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
6556 && (use_stmt == reduc_stmt
6557 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
6558 == reduc_stmt)))
6559 single_defuse_cycle = true;
6561 /* Create the destination vector */
6562 scalar_dest = gimple_assign_lhs (reduc_stmt);
6563 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6565 if (slp_node)
6566 /* The size vect_schedule_slp_instance computes is off for us. */
6567 vec_num = vect_get_num_vectors
6568 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6569 * SLP_TREE_SCALAR_STMTS (slp_node).length (),
6570 vectype_in);
6571 else
6572 vec_num = 1;
6574 /* Generate the reduction PHIs upfront. */
6575 prev_phi_info = NULL;
6576 for (j = 0; j < ncopies; j++)
6578 if (j == 0 || !single_defuse_cycle)
6580 for (i = 0; i < vec_num; i++)
6582 /* Create the reduction-phi that defines the reduction
6583 operand. */
6584 gimple *new_phi = create_phi_node (vec_dest, loop->header);
6585 set_vinfo_for_stmt (new_phi,
6586 new_stmt_vec_info (new_phi, loop_vinfo));
6588 if (slp_node)
6589 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
6590 else
6592 if (j == 0)
6593 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
6594 else
6595 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6596 prev_phi_info = vinfo_for_stmt (new_phi);
6602 return true;
6605 /* 1. Is vectorizable reduction? */
6606 /* Not supportable if the reduction variable is used in the loop, unless
6607 it's a reduction chain. */
6608 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
6609 && !GROUP_FIRST_ELEMENT (stmt_info))
6610 return false;
6612 /* Reductions that are not used even in an enclosing outer-loop
6613 are expected to be "live" (used out of the loop). */
6614 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
6615 && !STMT_VINFO_LIVE_P (stmt_info))
6616 return false;
6618 /* 2. Has this been recognized as a reduction pattern?
6620 Check if STMT represents a pattern that has been recognized
6621 in earlier analysis stages. For stmts that represent a pattern,
6622 the STMT_VINFO_RELATED_STMT field records the last stmt in
6623 the original sequence that constitutes the pattern. */
6625 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
6626 if (orig_stmt)
6628 orig_stmt_info = vinfo_for_stmt (orig_stmt);
6629 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
6630 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
6633 /* 3. Check the operands of the operation. The first operands are defined
6634 inside the loop body. The last operand is the reduction variable,
6635 which is defined by the loop-header-phi. */
6637 gcc_assert (is_gimple_assign (stmt));
6639 /* Flatten RHS. */
6640 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
6642 case GIMPLE_BINARY_RHS:
6643 code = gimple_assign_rhs_code (stmt);
6644 op_type = TREE_CODE_LENGTH (code);
6645 gcc_assert (op_type == binary_op);
6646 ops[0] = gimple_assign_rhs1 (stmt);
6647 ops[1] = gimple_assign_rhs2 (stmt);
6648 break;
6650 case GIMPLE_TERNARY_RHS:
6651 code = gimple_assign_rhs_code (stmt);
6652 op_type = TREE_CODE_LENGTH (code);
6653 gcc_assert (op_type == ternary_op);
6654 ops[0] = gimple_assign_rhs1 (stmt);
6655 ops[1] = gimple_assign_rhs2 (stmt);
6656 ops[2] = gimple_assign_rhs3 (stmt);
6657 break;
6659 case GIMPLE_UNARY_RHS:
6660 return false;
6662 default:
6663 gcc_unreachable ();
6666 if (code == COND_EXPR && slp_node)
6667 return false;
6669 scalar_dest = gimple_assign_lhs (stmt);
6670 scalar_type = TREE_TYPE (scalar_dest);
6671 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
6672 && !SCALAR_FLOAT_TYPE_P (scalar_type))
6673 return false;
6675 /* Do not try to vectorize bit-precision reductions. */
6676 if (!type_has_mode_precision_p (scalar_type))
6677 return false;
6679 /* All uses but the last are expected to be defined in the loop.
6680 The last use is the reduction variable. In case of nested cycle this
6681 assumption is not true: we use reduc_index to record the index of the
6682 reduction variable. */
6683 gimple *reduc_def_stmt = NULL;
6684 int reduc_index = -1;
6685 for (i = 0; i < op_type; i++)
6687 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
6688 if (i == 0 && code == COND_EXPR)
6689 continue;
6691 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
6692 &def_stmt, &dts[i], &tem);
6693 dt = dts[i];
6694 gcc_assert (is_simple_use);
6695 if (dt == vect_reduction_def)
6697 reduc_def_stmt = def_stmt;
6698 reduc_index = i;
6699 continue;
6701 else if (tem)
6703 /* To properly compute ncopies we are interested in the widest
6704 input type in case we're looking at a widening accumulation. */
6705 if (!vectype_in
6706 || (GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (vectype_in)))
6707 < GET_MODE_SIZE (SCALAR_TYPE_MODE (TREE_TYPE (tem)))))
6708 vectype_in = tem;
6711 if (dt != vect_internal_def
6712 && dt != vect_external_def
6713 && dt != vect_constant_def
6714 && dt != vect_induction_def
6715 && !(dt == vect_nested_cycle && nested_cycle))
6716 return false;
6718 if (dt == vect_nested_cycle)
6720 found_nested_cycle_def = true;
6721 reduc_def_stmt = def_stmt;
6722 reduc_index = i;
6725 if (i == 1 && code == COND_EXPR)
6727 /* Record how value of COND_EXPR is defined. */
6728 if (dt == vect_constant_def)
6730 cond_reduc_dt = dt;
6731 cond_reduc_val = ops[i];
6733 if (dt == vect_induction_def
6734 && def_stmt != NULL
6735 && is_nonwrapping_integer_induction (def_stmt, loop))
6737 cond_reduc_dt = dt;
6738 cond_reduc_def_stmt = def_stmt;
6743 if (!vectype_in)
6744 vectype_in = vectype_out;
6746 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
6747 directly used in stmt. */
6748 if (reduc_index == -1)
6750 if (STMT_VINFO_REDUC_TYPE (stmt_info) == FOLD_LEFT_REDUCTION)
6752 if (dump_enabled_p ())
6753 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6754 "in-order reduction chain without SLP.\n");
6755 return false;
6758 if (orig_stmt)
6759 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
6760 else
6761 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
6764 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
6765 return false;
6767 if (!(reduc_index == -1
6768 || dts[reduc_index] == vect_reduction_def
6769 || dts[reduc_index] == vect_nested_cycle
6770 || ((dts[reduc_index] == vect_internal_def
6771 || dts[reduc_index] == vect_external_def
6772 || dts[reduc_index] == vect_constant_def
6773 || dts[reduc_index] == vect_induction_def)
6774 && nested_cycle && found_nested_cycle_def)))
6776 /* For pattern recognized stmts, orig_stmt might be a reduction,
6777 but some helper statements for the pattern might not, or
6778 might be COND_EXPRs with reduction uses in the condition. */
6779 gcc_assert (orig_stmt);
6780 return false;
6783 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
6784 enum vect_reduction_type v_reduc_type
6785 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
6786 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
6788 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
6789 /* If we have a condition reduction, see if we can simplify it further. */
6790 if (v_reduc_type == COND_REDUCTION)
6792 /* TODO: We can't yet handle reduction chains, since we need to treat
6793 each COND_EXPR in the chain specially, not just the last one.
6794 E.g. for:
6796 x_1 = PHI <x_3, ...>
6797 x_2 = a_2 ? ... : x_1;
6798 x_3 = a_3 ? ... : x_2;
6800 we're interested in the last element in x_3 for which a_2 || a_3
6801 is true, whereas the current reduction chain handling would
6802 vectorize x_2 as a normal VEC_COND_EXPR and only treat x_3
6803 as a reduction operation. */
6804 if (reduc_index == -1)
6806 if (dump_enabled_p ())
6807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6808 "conditional reduction chains not supported\n");
6809 return false;
6812 /* vect_is_simple_reduction ensured that operand 2 is the
6813 loop-carried operand. */
6814 gcc_assert (reduc_index == 2);
6816 /* Loop peeling modifies the initial value of the reduction PHI, which
6817 makes the reduction stmt to be transformed different from the
6818 stmt that was originally analyzed. We therefore need to record the
6819 reduction code for a CONST_COND_REDUCTION at the analysis stage, so
6820 that it can be used directly at the transform stage. */
6821 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
6822 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
6824 /* Also set the reduction type to CONST_COND_REDUCTION. */
6825 gcc_assert (cond_reduc_dt == vect_constant_def);
6826 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
6828 else if (direct_internal_fn_supported_p (IFN_FOLD_EXTRACT_LAST,
6829 vectype_in, OPTIMIZE_FOR_SPEED))
6831 if (dump_enabled_p ())
6832 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6833 "optimizing condition reduction with"
6834 " FOLD_EXTRACT_LAST.\n");
6835 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = EXTRACT_LAST_REDUCTION;
6837 else if (cond_reduc_dt == vect_induction_def)
6839 stmt_vec_info cond_stmt_vinfo = vinfo_for_stmt (cond_reduc_def_stmt);
6840 tree base
6841 = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (cond_stmt_vinfo);
6842 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (cond_stmt_vinfo);
6844 gcc_assert (TREE_CODE (base) == INTEGER_CST
6845 && TREE_CODE (step) == INTEGER_CST);
6846 cond_reduc_val = NULL_TREE;
6847 /* Find a suitable value: below BASE for MAX_EXPR and above BASE for
6848 MIN_EXPR; for now punt if BASE is the minimum value of the type for
6849 MAX_EXPR or the maximum value of the type for MIN_EXPR. */
6850 if (tree_int_cst_sgn (step) == -1)
6852 cond_reduc_op_code = MIN_EXPR;
6853 if (tree_int_cst_sgn (base) == -1)
6854 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6855 else if (tree_int_cst_lt (base,
6856 TYPE_MAX_VALUE (TREE_TYPE (base))))
6857 cond_reduc_val
6858 = int_const_binop (PLUS_EXPR, base, integer_one_node);
6860 else
6862 cond_reduc_op_code = MAX_EXPR;
6863 if (tree_int_cst_sgn (base) == 1)
6864 cond_reduc_val = build_int_cst (TREE_TYPE (base), 0);
6865 else if (tree_int_cst_lt (TYPE_MIN_VALUE (TREE_TYPE (base)),
6866 base))
6867 cond_reduc_val
6868 = int_const_binop (MINUS_EXPR, base, integer_one_node);
6870 if (cond_reduc_val)
6872 if (dump_enabled_p ())
6873 dump_printf_loc (MSG_NOTE, vect_location,
6874 "condition expression based on "
6875 "integer induction.\n");
6876 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6877 = INTEGER_INDUC_COND_REDUCTION;
6880 else if (cond_reduc_dt == vect_constant_def)
6882 enum vect_def_type cond_initial_dt;
6883 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
6884 tree cond_initial_val
6885 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6887 gcc_assert (cond_reduc_val != NULL_TREE);
6888 vect_is_simple_use (cond_initial_val, loop_vinfo,
6889 &def_stmt, &cond_initial_dt);
6890 if (cond_initial_dt == vect_constant_def
6891 && types_compatible_p (TREE_TYPE (cond_initial_val),
6892 TREE_TYPE (cond_reduc_val)))
6894 tree e = fold_binary (LE_EXPR, boolean_type_node,
6895 cond_initial_val, cond_reduc_val);
6896 if (e && (integer_onep (e) || integer_zerop (e)))
6898 if (dump_enabled_p ())
6899 dump_printf_loc (MSG_NOTE, vect_location,
6900 "condition expression based on "
6901 "compile time constant.\n");
6902 /* Record reduction code at analysis stage. */
6903 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6904 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6905 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6906 = CONST_COND_REDUCTION;
6912 if (orig_stmt)
6913 gcc_assert (tmp == orig_stmt
6914 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6915 else
6916 /* We changed STMT to be the first stmt in reduction chain, hence we
6917 check that in this case the first element in the chain is STMT. */
6918 gcc_assert (stmt == tmp
6919 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6921 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6922 return false;
6924 if (slp_node)
6925 ncopies = 1;
6926 else
6927 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
6929 gcc_assert (ncopies >= 1);
6931 vec_mode = TYPE_MODE (vectype_in);
6932 poly_uint64 nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6934 if (code == COND_EXPR)
6936 /* Only call during the analysis stage, otherwise we'll lose
6937 STMT_VINFO_TYPE. */
6938 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6939 ops[reduc_index], 0, NULL))
6941 if (dump_enabled_p ())
6942 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6943 "unsupported condition in reduction\n");
6944 return false;
6947 else
6949 /* 4. Supportable by target? */
6951 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6952 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6954 /* Shifts and rotates are only supported by vectorizable_shift,
6955 not vectorizable_reduction. */
6956 if (dump_enabled_p ())
6957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6958 "unsupported shift or rotation.\n");
6959 return false;
6962 /* 4.1. check support for the operation in the loop */
6963 optab = optab_for_tree_code (code, vectype_in, optab_default);
6964 if (!optab)
6966 if (dump_enabled_p ())
6967 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6968 "no optab.\n");
6970 return false;
6973 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6975 if (dump_enabled_p ())
6976 dump_printf (MSG_NOTE, "op not supported by target.\n");
6978 if (maybe_ne (GET_MODE_SIZE (vec_mode), UNITS_PER_WORD)
6979 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6980 return false;
6982 if (dump_enabled_p ())
6983 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6986 /* Worthwhile without SIMD support? */
6987 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6988 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6990 if (dump_enabled_p ())
6991 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6992 "not worthwhile without SIMD support.\n");
6994 return false;
6998 /* 4.2. Check support for the epilog operation.
7000 If STMT represents a reduction pattern, then the type of the
7001 reduction variable may be different from the type of the rest
7002 of the arguments. For example, consider the case of accumulation
7003 of shorts into an int accumulator. The original code:
7004 S1: int_a = (int) short_a;
7005 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
7007 was replaced with:
7008 STMT: int_acc = widen_sum <short_a, int_acc>
7010 This means that:
7011 1. The tree-code that is used to create the vector operation in the
7012 epilog code (that reduces the partial results) is not the
7013 tree-code of STMT, but is rather the tree-code of the original
7014 stmt from the pattern that STMT is replacing. I.e, in the example
7015 above we want to use 'widen_sum' in the loop, but 'plus' in the
7016 epilog.
7017 2. The type (mode) we use to check available target support
7018 for the vector operation to be created in the *epilog*, is
7019 determined by the type of the reduction variable (in the example
7020 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
7021 However the type (mode) we use to check available target support
7022 for the vector operation to be created *inside the loop*, is
7023 determined by the type of the other arguments to STMT (in the
7024 example we'd check this: optab_handler (widen_sum_optab,
7025 vect_short_mode)).
7027 This is contrary to "regular" reductions, in which the types of all
7028 the arguments are the same as the type of the reduction variable.
7029 For "regular" reductions we can therefore use the same vector type
7030 (and also the same tree-code) when generating the epilog code and
7031 when generating the code inside the loop. */
7033 vect_reduction_type reduction_type
7034 = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info);
7035 if (orig_stmt
7036 && (reduction_type == TREE_CODE_REDUCTION
7037 || reduction_type == FOLD_LEFT_REDUCTION))
7039 /* This is a reduction pattern: get the vectype from the type of the
7040 reduction variable, and get the tree-code from orig_stmt. */
7041 orig_code = gimple_assign_rhs_code (orig_stmt);
7042 gcc_assert (vectype_out);
7043 vec_mode = TYPE_MODE (vectype_out);
7045 else
7047 /* Regular reduction: the same vectype and tree-code as used for
7048 the vector code inside the loop can be used for the epilog code. */
7049 orig_code = code;
7051 if (code == MINUS_EXPR)
7052 orig_code = PLUS_EXPR;
7054 /* For simple condition reductions, replace with the actual expression
7055 we want to base our reduction around. */
7056 if (reduction_type == CONST_COND_REDUCTION)
7058 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
7059 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
7061 else if (reduction_type == INTEGER_INDUC_COND_REDUCTION)
7062 orig_code = cond_reduc_op_code;
7065 if (nested_cycle)
7067 def_bb = gimple_bb (reduc_def_stmt);
7068 def_stmt_loop = def_bb->loop_father;
7069 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
7070 loop_preheader_edge (def_stmt_loop));
7071 if (TREE_CODE (def_arg) == SSA_NAME
7072 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
7073 && gimple_code (def_arg_stmt) == GIMPLE_PHI
7074 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
7075 && vinfo_for_stmt (def_arg_stmt)
7076 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
7077 == vect_double_reduction_def)
7078 double_reduc = true;
7081 reduc_fn = IFN_LAST;
7083 if (reduction_type == TREE_CODE_REDUCTION
7084 || reduction_type == FOLD_LEFT_REDUCTION
7085 || reduction_type == INTEGER_INDUC_COND_REDUCTION
7086 || reduction_type == CONST_COND_REDUCTION)
7088 if (reduction_type == FOLD_LEFT_REDUCTION
7089 ? fold_left_reduction_fn (orig_code, &reduc_fn)
7090 : reduction_fn_for_scalar_code (orig_code, &reduc_fn))
7092 if (reduc_fn != IFN_LAST
7093 && !direct_internal_fn_supported_p (reduc_fn, vectype_out,
7094 OPTIMIZE_FOR_SPEED))
7096 if (dump_enabled_p ())
7097 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7098 "reduc op not supported by target.\n");
7100 reduc_fn = IFN_LAST;
7103 else
7105 if (!nested_cycle || double_reduc)
7107 if (dump_enabled_p ())
7108 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7109 "no reduc code for scalar code.\n");
7111 return false;
7115 else if (reduction_type == COND_REDUCTION)
7117 int scalar_precision
7118 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
7119 cr_index_scalar_type = make_unsigned_type (scalar_precision);
7120 cr_index_vector_type = build_vector_type (cr_index_scalar_type,
7121 nunits_out);
7123 if (direct_internal_fn_supported_p (IFN_REDUC_MAX, cr_index_vector_type,
7124 OPTIMIZE_FOR_SPEED))
7125 reduc_fn = IFN_REDUC_MAX;
7128 if (reduction_type != EXTRACT_LAST_REDUCTION
7129 && reduc_fn == IFN_LAST
7130 && !nunits_out.is_constant ())
7132 if (dump_enabled_p ())
7133 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7134 "missing target support for reduction on"
7135 " variable-length vectors.\n");
7136 return false;
7139 if ((double_reduc || reduction_type != TREE_CODE_REDUCTION)
7140 && ncopies > 1)
7142 if (dump_enabled_p ())
7143 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7144 "multiple types in double reduction or condition "
7145 "reduction.\n");
7146 return false;
7149 /* For SLP reductions, see if there is a neutral value we can use. */
7150 tree neutral_op = NULL_TREE;
7151 if (slp_node)
7152 neutral_op
7153 = neutral_op_for_slp_reduction (slp_node_instance->reduc_phis, code,
7154 GROUP_FIRST_ELEMENT (stmt_info) != NULL);
7156 if (double_reduc && reduction_type == FOLD_LEFT_REDUCTION)
7158 /* We can't support in-order reductions of code such as this:
7160 for (int i = 0; i < n1; ++i)
7161 for (int j = 0; j < n2; ++j)
7162 l += a[j];
7164 since GCC effectively transforms the loop when vectorizing:
7166 for (int i = 0; i < n1 / VF; ++i)
7167 for (int j = 0; j < n2; ++j)
7168 for (int k = 0; k < VF; ++k)
7169 l += a[j];
7171 which is a reassociation of the original operation. */
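/* Hedged note (example values assumed): the reassociation shown above
   changes the evaluation order, which matters for in-order FP reductions.
   For instance, with float addition evaluated left to right,

     float l = 0.0f;
     l += 1e20f;  l += 1.0f;  l += -1e20f;   // yields 0.0f

   whereas grouping 1e20f + (-1e20f) first and adding 1.0f last yields 1.0f,
   so the two orders are not interchangeable under strict FP semantics.  */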
7172 if (dump_enabled_p ())
7173 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7174 "in-order double reduction not supported.\n");
7176 return false;
7179 if (reduction_type == FOLD_LEFT_REDUCTION
7180 && slp_node
7181 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
7183 /* We cannot use in-order reductions in this case because there is
7184 an implicit reassociation of the operations involved. */
7185 if (dump_enabled_p ())
7186 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7187 "in-order unchained SLP reductions not supported.\n");
7188 return false;
7191 /* For double reductions, and for SLP reductions with a neutral value,
7192 we construct a variable-length initial vector by loading a vector
7193 full of the neutral value and then shift-and-inserting the start
7194 values into the low-numbered elements. */
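/* Hedged sketch (values invented): with a neutral value of 0 and a single
   start value S, the variable-length initial vector described above is
   built roughly as

     init = { 0, 0, ..., 0 }              // splat of the neutral value
     init = VEC_SHL_INSERT (init, S)      // conceptually { S, 0, ..., 0 }

   so only the low-numbered lane(s) carry the incoming scalar value(s).  */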
7195 if ((double_reduc || neutral_op)
7196 && !nunits_out.is_constant ()
7197 && !direct_internal_fn_supported_p (IFN_VEC_SHL_INSERT,
7198 vectype_out, OPTIMIZE_FOR_SPEED))
7200 if (dump_enabled_p ())
7201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7202 "reduction on variable-length vectors requires"
7203 " target support for a vector-shift-and-insert"
7204 " operation.\n");
7205 return false;
7208 /* Check extra constraints for variable-length unchained SLP reductions. */
7209 if (STMT_SLP_TYPE (stmt_info)
7210 && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
7211 && !nunits_out.is_constant ())
7213 /* We checked above that we could build the initial vector when
7214 there's a neutral element value. Check here for the case in
7215 which each SLP statement has its own initial value and in which
7216 that value needs to be repeated for every instance of the
7217 statement within the initial vector. */
7218 unsigned int group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7219 scalar_mode elt_mode = SCALAR_TYPE_MODE (TREE_TYPE (vectype_out));
7220 if (!neutral_op
7221 && !can_duplicate_and_interleave_p (group_size, elt_mode))
7223 if (dump_enabled_p ())
7224 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7225 "unsupported form of SLP reduction for"
7226 " variable-length vectors: cannot build"
7227 " initial vector.\n");
7228 return false;
7230 /* The epilogue code relies on the number of elements being a multiple
7231 of the group size. The duplicate-and-interleave approach to setting
7232 up the initial vector does too. */
7233 if (!multiple_p (nunits_out, group_size))
7235 if (dump_enabled_p ())
7236 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7237 "unsupported form of SLP reduction for"
7238 " variable-length vectors: the vector size"
7239 " is not a multiple of the number of results.\n");
7240 return false;
7244 /* In case of widening multiplication by a constant, we update the type
7245 of the constant to be the type of the other operand. We check that the
7246 constant fits the type in the pattern recognition pass. */
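/* Hedged example (source loop invented): a dot-product pattern with a
   constant operand, e.g.

     short a[N];
     int acc = 0;
     for (int i = 0; i < N; i++)
       acc += a[i] * 3;      // 3 starts out with a wider (int) type

   reaches this point with mismatched operand types, and the INTEGER_CST is
   folded to the type of the other operand here; whether the constant fits
   that narrower type was already checked by pattern recognition.  */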
7247 if (code == DOT_PROD_EXPR
7248 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
7250 if (TREE_CODE (ops[0]) == INTEGER_CST)
7251 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
7252 else if (TREE_CODE (ops[1]) == INTEGER_CST)
7253 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
7254 else
7256 if (dump_enabled_p ())
7257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7258 "invalid types in dot-prod\n");
7260 return false;
7264 if (reduction_type == COND_REDUCTION)
7266 widest_int ni;
7268 if (! max_loop_iterations (loop, &ni))
7270 if (dump_enabled_p ())
7271 dump_printf_loc (MSG_NOTE, vect_location,
7272 "loop count not known, cannot create cond "
7273 "reduction.\n");
7274 return false;
7276 /* Convert backedges to iterations. */
7277 ni += 1;
7279 /* The additional index will have the same type as the condition. Check
7280 that the loop iteration count fits into this type less one (because we
7281 use up the zero slot for when there are no matches). */
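/* Hedged numeric example (assumed width): if cr_index_scalar_type is an
   unsigned 8-bit type, max_index is 255; a loop of up to 254 iterations is
   acceptable (index 0 is reserved for "no match"), while ni >= 255 would
   make the induction index wrap, and the check below rejects the condition
   reduction.  */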
7282 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
7283 if (wi::geu_p (ni, wi::to_widest (max_index)))
7285 if (dump_enabled_p ())
7286 dump_printf_loc (MSG_NOTE, vect_location,
7287 "loop size is greater than data size.\n");
7288 return false;
7292 /* In case the vectorization factor (VF) is bigger than the number
7293 of elements that we can fit in a vectype (nunits), we have to generate
7294 more than one vector stmt, i.e. we need to "unroll" the
7295 vector stmt by a factor VF/nunits. For more details see documentation
7296 in vectorizable_operation. */
7298 /* If the reduction is used in an outer loop we need to generate
7299 VF intermediate results, like so (e.g. for ncopies=2):
7300 r0 = phi (init, r0)
7301 r1 = phi (init, r1)
7302 r0 = x0 + r0;
7303 r1 = x1 + r1;
7304 (i.e. we generate VF results in 2 registers).
7305 In this case we have a separate def-use cycle for each copy, and therefore
7306 for each copy we get the vector def for the reduction variable from the
7307 respective phi node created for this copy.
7309 Otherwise (the reduction is unused in the loop nest), we can combine
7310 together intermediate results, like so (e.g. for ncopies=2):
7311 r = phi (init, r)
7312 r = x0 + r;
7313 r = x1 + r;
7314 (i.e. we generate VF/2 results in a single register).
7315 In this case for each copy we get the vector def for the reduction variable
7316 from the vectorized reduction operation generated in the previous iteration.
7318 This only works when we see both the reduction PHI and its only consumer
7319 in vectorizable_reduction and there are no intermediate stmts
7320 participating. */
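/* Hedged restatement (no new facts): the test below therefore requires that
   the only in-loop use of the reduction PHI result is STMT itself (or the
   pattern stmt that replaced it).  E.g. for

     for (i = 0; i < n; i++)
       sum = sum + a[i];     // the PHI result "sum" feeds only this add

   the PHI and the add form the single def-use cycle described above.  */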
7321 use_operand_p use_p;
7322 gimple *use_stmt;
7323 if (ncopies > 1
7324 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
7325 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
7326 && (use_stmt == stmt
7327 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
7329 single_defuse_cycle = true;
7330 epilog_copies = 1;
7332 else
7333 epilog_copies = ncopies;
7335 /* If the reduction stmt is one of the patterns that have a lane
7336 reduction embedded, we cannot handle the case of ! single_defuse_cycle. */
7337 if ((ncopies > 1
7338 && ! single_defuse_cycle)
7339 && (code == DOT_PROD_EXPR
7340 || code == WIDEN_SUM_EXPR
7341 || code == SAD_EXPR))
7343 if (dump_enabled_p ())
7344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7345 "multi def-use cycle not possible for lane-reducing "
7346 "reduction operation\n");
7347 return false;
7350 if (slp_node)
7351 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7352 else
7353 vec_num = 1;
7355 internal_fn cond_fn = get_conditional_internal_fn (code);
7356 vec_loop_masks *masks = &LOOP_VINFO_MASKS (loop_vinfo);
7358 if (!vec_stmt) /* transformation not required. */
7360 if (first_p)
7361 vect_model_reduction_cost (stmt_info, reduc_fn, ncopies);
7362 if (loop_vinfo && LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
7364 if (reduction_type != FOLD_LEFT_REDUCTION
7365 && (cond_fn == IFN_LAST
7366 || !direct_internal_fn_supported_p (cond_fn, vectype_in,
7367 OPTIMIZE_FOR_SPEED)))
7369 if (dump_enabled_p ())
7370 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7371 "can't use a fully-masked loop because no"
7372 " conditional operation is available.\n");
7373 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7375 else if (reduc_index == -1)
7377 if (dump_enabled_p ())
7378 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7379 "can't use a fully-masked loop for chained"
7380 " reductions.\n");
7381 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
7383 else
7384 vect_record_loop_mask (loop_vinfo, masks, ncopies * vec_num,
7385 vectype_in);
7387 if (dump_enabled_p ()
7388 && reduction_type == FOLD_LEFT_REDUCTION)
7389 dump_printf_loc (MSG_NOTE, vect_location,
7390 "using an in-order (fold-left) reduction.\n");
7391 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
7392 return true;
7395 /* Transform. */
7397 if (dump_enabled_p ())
7398 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
7400 /* FORNOW: Multiple types are not supported for condition. */
7401 if (code == COND_EXPR)
7402 gcc_assert (ncopies == 1);
7404 bool masked_loop_p = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
7406 if (reduction_type == FOLD_LEFT_REDUCTION)
7407 return vectorize_fold_left_reduction
7408 (stmt, gsi, vec_stmt, slp_node, reduc_def_stmt, code,
7409 reduc_fn, ops, vectype_in, reduc_index, masks);
7411 if (reduction_type == EXTRACT_LAST_REDUCTION)
7413 gcc_assert (!slp_node);
7414 return vectorizable_condition (stmt, gsi, vec_stmt,
7415 NULL, reduc_index, NULL);
7418 /* Create the destination vector */
7419 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
7421 prev_stmt_info = NULL;
7422 prev_phi_info = NULL;
7423 if (!slp_node)
7425 vec_oprnds0.create (1);
7426 vec_oprnds1.create (1);
7427 if (op_type == ternary_op)
7428 vec_oprnds2.create (1);
7431 phis.create (vec_num);
7432 vect_defs.create (vec_num);
7433 if (!slp_node)
7434 vect_defs.quick_push (NULL_TREE);
7436 if (slp_node)
7437 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
7438 else
7439 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
7441 for (j = 0; j < ncopies; j++)
7443 if (code == COND_EXPR)
7445 gcc_assert (!slp_node);
7446 vectorizable_condition (stmt, gsi, vec_stmt,
7447 PHI_RESULT (phis[0]),
7448 reduc_index, NULL);
7449 /* Multiple types are not supported for condition. */
7450 break;
7453 /* Handle uses. */
7454 if (j == 0)
7456 if (slp_node)
7458 /* Get vec defs for all the operands except the reduction index,
7459 ensuring the ordering of the ops in the vector is kept. */
7460 auto_vec<tree, 3> slp_ops;
7461 auto_vec<vec<tree>, 3> vec_defs;
7463 slp_ops.quick_push (ops[0]);
7464 slp_ops.quick_push (ops[1]);
7465 if (op_type == ternary_op)
7466 slp_ops.quick_push (ops[2]);
7468 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
7470 vec_oprnds0.safe_splice (vec_defs[0]);
7471 vec_defs[0].release ();
7472 vec_oprnds1.safe_splice (vec_defs[1]);
7473 vec_defs[1].release ();
7474 if (op_type == ternary_op)
7476 vec_oprnds2.safe_splice (vec_defs[2]);
7477 vec_defs[2].release ();
7480 else
7482 vec_oprnds0.quick_push
7483 (vect_get_vec_def_for_operand (ops[0], stmt));
7484 vec_oprnds1.quick_push
7485 (vect_get_vec_def_for_operand (ops[1], stmt));
7486 if (op_type == ternary_op)
7487 vec_oprnds2.quick_push
7488 (vect_get_vec_def_for_operand (ops[2], stmt));
7491 else
7493 if (!slp_node)
7495 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
7497 if (single_defuse_cycle && reduc_index == 0)
7498 vec_oprnds0[0] = gimple_get_lhs (new_stmt);
7499 else
7500 vec_oprnds0[0]
7501 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
7502 if (single_defuse_cycle && reduc_index == 1)
7503 vec_oprnds1[0] = gimple_get_lhs (new_stmt);
7504 else
7505 vec_oprnds1[0]
7506 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
7507 if (op_type == ternary_op)
7509 if (single_defuse_cycle && reduc_index == 2)
7510 vec_oprnds2[0] = gimple_get_lhs (new_stmt);
7511 else
7512 vec_oprnds2[0]
7513 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
7518 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
7520 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
7521 if (masked_loop_p)
7523 /* Make sure that the reduction accumulator is vop[0]. */
7524 if (reduc_index == 1)
7526 gcc_assert (commutative_tree_code (code));
7527 std::swap (vop[0], vop[1]);
7529 tree mask = vect_get_loop_mask (gsi, masks, vec_num * ncopies,
7530 vectype_in, i * ncopies + j);
7531 gcall *call = gimple_build_call_internal (cond_fn, 3, mask,
7532 vop[0], vop[1]);
7533 new_temp = make_ssa_name (vec_dest, call);
7534 gimple_call_set_lhs (call, new_temp);
7535 gimple_call_set_nothrow (call, true);
7536 new_stmt = call;
7538 else
7540 if (op_type == ternary_op)
7541 vop[2] = vec_oprnds2[i];
7543 new_temp = make_ssa_name (vec_dest, new_stmt);
7544 new_stmt = gimple_build_assign (new_temp, code,
7545 vop[0], vop[1], vop[2]);
7547 vect_finish_stmt_generation (stmt, new_stmt, gsi);
7549 if (slp_node)
7551 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7552 vect_defs.quick_push (new_temp);
7554 else
7555 vect_defs[0] = new_temp;
7558 if (slp_node)
7559 continue;
7561 if (j == 0)
7562 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
7563 else
7564 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
7566 prev_stmt_info = vinfo_for_stmt (new_stmt);
7569 /* Finalize the reduction-phi (set its arguments) and create the
7570 epilog reduction code. */
7571 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
7572 vect_defs[0] = gimple_get_lhs (*vec_stmt);
7574 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
7575 epilog_copies, reduc_fn, phis,
7576 double_reduc, slp_node, slp_node_instance,
7577 cond_reduc_val, cond_reduc_op_code,
7578 neutral_op);
7580 return true;
7583 /* Function vect_min_worthwhile_factor.
7585 For a loop where we could vectorize the operation indicated by CODE,
7586 return the minimum vectorization factor that makes it worthwhile
7587 to use generic vectors. */
7588 static unsigned int
7589 vect_min_worthwhile_factor (enum tree_code code)
7591 switch (code)
7593 case PLUS_EXPR:
7594 case MINUS_EXPR:
7595 case NEGATE_EXPR:
7596 return 4;
7598 case BIT_AND_EXPR:
7599 case BIT_IOR_EXPR:
7600 case BIT_XOR_EXPR:
7601 case BIT_NOT_EXPR:
7602 return 2;
7604 default:
7605 return INT_MAX;
7609 /* Return true if VINFO indicates we are doing loop vectorization and if
7610 it is worth decomposing CODE operations into scalar operations for
7611 that loop's vectorization factor. */
7613 bool
7614 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
7616 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
7617 unsigned HOST_WIDE_INT value;
7618 return (loop_vinfo
7619 && LOOP_VINFO_VECT_FACTOR (loop_vinfo).is_constant (&value)
7620 && value >= vect_min_worthwhile_factor (code));
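/* Hedged usage sketch (factor values assumed): with a constant vectorization
   factor of 4, PLUS_EXPR is considered worthwhile without SIMD support
   (minimum factor 4), BIT_AND_EXPR would already be worthwhile at factor 2,
   and any tree code not listed above returns INT_MAX and is never
   worthwhile.  A caller might test e.g.

     if (vect_worthwhile_without_simd_p (loop_vinfo, PLUS_EXPR))
       ...   // emulate the vector add with word-mode arithmetic
*/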
7623 /* Function vectorizable_induction
7625 Check if PHI performs an induction computation that can be vectorized.
7626 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
7627 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
7628 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
7630 bool
7631 vectorizable_induction (gimple *phi,
7632 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7633 gimple **vec_stmt, slp_tree slp_node)
7635 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
7636 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7637 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7638 unsigned ncopies;
7639 bool nested_in_vect_loop = false;
7640 struct loop *iv_loop;
7641 tree vec_def;
7642 edge pe = loop_preheader_edge (loop);
7643 basic_block new_bb;
7644 tree new_vec, vec_init, vec_step, t;
7645 tree new_name;
7646 gimple *new_stmt;
7647 gphi *induction_phi;
7648 tree induc_def, vec_dest;
7649 tree init_expr, step_expr;
7650 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7651 unsigned i;
7652 tree expr;
7653 gimple_seq stmts;
7654 imm_use_iterator imm_iter;
7655 use_operand_p use_p;
7656 gimple *exit_phi;
7657 edge latch_e;
7658 tree loop_arg;
7659 gimple_stmt_iterator si;
7660 basic_block bb = gimple_bb (phi);
7662 if (gimple_code (phi) != GIMPLE_PHI)
7663 return false;
7665 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7666 return false;
7668 /* Make sure it was recognized as induction computation. */
7669 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
7670 return false;
7672 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7673 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
7675 if (slp_node)
7676 ncopies = 1;
7677 else
7678 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7679 gcc_assert (ncopies >= 1);
7681 /* FORNOW. These restrictions should be relaxed. */
7682 if (nested_in_vect_loop_p (loop, phi))
7684 imm_use_iterator imm_iter;
7685 use_operand_p use_p;
7686 gimple *exit_phi;
7687 edge latch_e;
7688 tree loop_arg;
7690 if (ncopies > 1)
7692 if (dump_enabled_p ())
7693 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7694 "multiple types in nested loop.\n");
7695 return false;
7698 /* FORNOW: outer loop induction with SLP not supported. */
7699 if (STMT_SLP_TYPE (stmt_info))
7700 return false;
7702 exit_phi = NULL;
7703 latch_e = loop_latch_edge (loop->inner);
7704 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7705 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7707 gimple *use_stmt = USE_STMT (use_p);
7708 if (is_gimple_debug (use_stmt))
7709 continue;
7711 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
7713 exit_phi = use_stmt;
7714 break;
7717 if (exit_phi)
7719 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
7720 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
7721 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
7723 if (dump_enabled_p ())
7724 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7725 "inner-loop induction only used outside "
7726 "of the outer vectorized loop.\n");
7727 return false;
7731 nested_in_vect_loop = true;
7732 iv_loop = loop->inner;
7734 else
7735 iv_loop = loop;
7736 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
7738 if (slp_node && !nunits.is_constant ())
7740 /* The current SLP code creates the initial value element-by-element. */
7741 if (dump_enabled_p ())
7742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
7743 "SLP induction not supported for variable-length"
7744 " vectors.\n");
7745 return false;
7748 if (!vec_stmt) /* transformation not required. */
7750 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
7751 if (dump_enabled_p ())
7752 dump_printf_loc (MSG_NOTE, vect_location,
7753 "=== vectorizable_induction ===\n");
7754 vect_model_induction_cost (stmt_info, ncopies);
7755 return true;
7758 /* Transform. */
7760 /* Compute a vector variable, initialized with the first VF values of
7761 the induction variable. E.g., for an iv with IV_PHI='X' and
7762 evolution S, for a vector of 4 units, we want to compute:
7763 [X, X + S, X + 2*S, X + 3*S]. */
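/* Hedged worked example (X, S and the unit count are assumed): with X = 0,
   S = 1 and 4 units, the initial vector is [0, 1, 2, 3] and the step added
   on every vector iteration is the splat [VF*S, ...] = [4, 4, 4, 4]:

     scalar i :  0  1  2  3   ->   4  5  6  7   -> ...
     vec_iv   : [0, 1, 2, 3]  ->  [4, 5, 6, 7]  -> ...
*/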
7765 if (dump_enabled_p ())
7766 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
7768 latch_e = loop_latch_edge (iv_loop);
7769 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
7771 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
7772 gcc_assert (step_expr != NULL_TREE);
7774 pe = loop_preheader_edge (iv_loop);
7775 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
7776 loop_preheader_edge (iv_loop));
7778 stmts = NULL;
7779 if (!nested_in_vect_loop)
7781 /* Convert the initial value to the desired type. */
7782 tree new_type = TREE_TYPE (vectype);
7783 init_expr = gimple_convert (&stmts, new_type, init_expr);
7785 /* If we are using the loop mask to "peel" for alignment then we need
7786 to adjust the start value here. */
7787 tree skip_niters = LOOP_VINFO_MASK_SKIP_NITERS (loop_vinfo);
7788 if (skip_niters != NULL_TREE)
7790 if (FLOAT_TYPE_P (vectype))
7791 skip_niters = gimple_build (&stmts, FLOAT_EXPR, new_type,
7792 skip_niters);
7793 else
7794 skip_niters = gimple_convert (&stmts, new_type, skip_niters);
7795 tree skip_step = gimple_build (&stmts, MULT_EXPR, new_type,
7796 skip_niters, step_expr);
7797 init_expr = gimple_build (&stmts, MINUS_EXPR, new_type,
7798 init_expr, skip_step);
7802 /* Convert the step to the desired type. */
7803 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
7805 if (stmts)
7807 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7808 gcc_assert (!new_bb);
7811 /* Find the first insertion point in the BB. */
7812 si = gsi_after_labels (bb);
7814 /* For SLP induction we have to generate several IVs. For example,
7815 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
7816 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
7817 [VF*S, VF*S, VF*S, VF*S] for all of them. */
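/* Hedged worked instance of the comment above (i = 0, S = 1 assumed): the
   three IVs start as [0, 0, 0, 1], [1, 1, 2, 2], [2, 3, 3, 3], i.e. the
   concatenation 0,0,0,1,1,1,2,2,2,3,3,3 covers four group iterations, and
   each IV is then advanced by the uniform step [4, 4, 4, 4] (VF * S with
   VF = 4).  */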
7818 if (slp_node)
7820 /* Enforced above. */
7821 unsigned int const_nunits = nunits.to_constant ();
7823 /* Generate [VF*S, VF*S, ... ]. */
7824 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7826 expr = build_int_cst (integer_type_node, vf);
7827 expr = fold_convert (TREE_TYPE (step_expr), expr);
7829 else
7830 expr = build_int_cst (TREE_TYPE (step_expr), vf);
7831 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7832 expr, step_expr);
7833 if (! CONSTANT_CLASS_P (new_name))
7834 new_name = vect_init_vector (phi, new_name,
7835 TREE_TYPE (step_expr), NULL);
7836 new_vec = build_vector_from_val (vectype, new_name);
7837 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7839 /* Now generate the IVs. */
7840 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7841 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7842 unsigned elts = const_nunits * nvects;
7843 unsigned nivs = least_common_multiple (group_size,
7844 const_nunits) / const_nunits;
7845 gcc_assert (elts % group_size == 0);
7846 tree elt = init_expr;
7847 unsigned ivn;
7848 for (ivn = 0; ivn < nivs; ++ivn)
7850 tree_vector_builder elts (vectype, const_nunits, 1);
7851 stmts = NULL;
7852 for (unsigned eltn = 0; eltn < const_nunits; ++eltn)
7854 if (ivn*const_nunits + eltn >= group_size
7855 && (ivn * const_nunits + eltn) % group_size == 0)
7856 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
7857 elt, step_expr);
7858 elts.quick_push (elt);
7860 vec_init = gimple_build_vector (&stmts, &elts);
7861 if (stmts)
7863 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
7864 gcc_assert (!new_bb);
7867 /* Create the induction-phi that defines the induction-operand. */
7868 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
7869 induction_phi = create_phi_node (vec_dest, iv_loop->header);
7870 set_vinfo_for_stmt (induction_phi,
7871 new_stmt_vec_info (induction_phi, loop_vinfo));
7872 induc_def = PHI_RESULT (induction_phi);
7874 /* Create the iv update inside the loop */
7875 vec_def = make_ssa_name (vec_dest);
7876 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
7877 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7878 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
7880 /* Set the arguments of the phi node: */
7881 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
7882 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
7883 UNKNOWN_LOCATION);
7885 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
7888 /* Re-use IVs when we can. */
7889 if (ivn < nvects)
7891 unsigned vfp
7892 = least_common_multiple (group_size, const_nunits) / group_size;
7893 /* Generate [VF'*S, VF'*S, ... ]. */
7894 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
7896 expr = build_int_cst (integer_type_node, vfp);
7897 expr = fold_convert (TREE_TYPE (step_expr), expr);
7899 else
7900 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
7901 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
7902 expr, step_expr);
7903 if (! CONSTANT_CLASS_P (new_name))
7904 new_name = vect_init_vector (phi, new_name,
7905 TREE_TYPE (step_expr), NULL);
7906 new_vec = build_vector_from_val (vectype, new_name);
7907 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7908 for (; ivn < nvects; ++ivn)
7910 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
7911 tree def;
7912 if (gimple_code (iv) == GIMPLE_PHI)
7913 def = gimple_phi_result (iv);
7914 else
7915 def = gimple_assign_lhs (iv);
7916 new_stmt = gimple_build_assign (make_ssa_name (vectype),
7917 PLUS_EXPR,
7918 def, vec_step);
7919 if (gimple_code (iv) == GIMPLE_PHI)
7920 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7921 else
7923 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
7924 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
7926 set_vinfo_for_stmt (new_stmt,
7927 new_stmt_vec_info (new_stmt, loop_vinfo));
7928 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
7932 return true;
7935 /* Create the vector that holds the initial_value of the induction. */
7936 if (nested_in_vect_loop)
7938 /* iv_loop is nested in the loop to be vectorized. init_expr had already
7939 been created during vectorization of previous stmts. We obtain it
7940 from the STMT_VINFO_VEC_STMT of the defining stmt. */
7941 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
7942 /* If the initial value is not of proper type, convert it. */
7943 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
7945 new_stmt
7946 = gimple_build_assign (vect_get_new_ssa_name (vectype,
7947 vect_simple_var,
7948 "vec_iv_"),
7949 VIEW_CONVERT_EXPR,
7950 build1 (VIEW_CONVERT_EXPR, vectype,
7951 vec_init));
7952 vec_init = gimple_assign_lhs (new_stmt);
7953 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
7954 new_stmt);
7955 gcc_assert (!new_bb);
7956 set_vinfo_for_stmt (new_stmt,
7957 new_stmt_vec_info (new_stmt, loop_vinfo));
7960 else
7962 /* iv_loop is the loop to be vectorized. Create:
7963 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
7964 stmts = NULL;
7965 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
7967 unsigned HOST_WIDE_INT const_nunits;
7968 if (nunits.is_constant (&const_nunits))
7970 tree_vector_builder elts (vectype, const_nunits, 1);
7971 elts.quick_push (new_name);
7972 for (i = 1; i < const_nunits; i++)
7974 /* Create: new_name_i = new_name + step_expr */
7975 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
7976 new_name, step_expr);
7977 elts.quick_push (new_name);
7979 /* Create a vector from [new_name_0, new_name_1, ...,
7980 new_name_nunits-1] */
7981 vec_init = gimple_build_vector (&stmts, &elts);
7983 else if (INTEGRAL_TYPE_P (TREE_TYPE (step_expr)))
7984 /* Build the initial value directly from a VEC_SERIES_EXPR. */
7985 vec_init = gimple_build (&stmts, VEC_SERIES_EXPR, vectype,
7986 new_name, step_expr);
7987 else
7989 /* Build:
7990 [base, base, base, ...]
7991 + (vectype) [0, 1, 2, ...] * [step, step, step, ...]. */
7992 gcc_assert (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)));
7993 gcc_assert (flag_associative_math);
7994 tree index = build_index_vector (vectype, 0, 1);
7995 tree base_vec = gimple_build_vector_from_val (&stmts, vectype,
7996 new_name);
7997 tree step_vec = gimple_build_vector_from_val (&stmts, vectype,
7998 step_expr);
7999 vec_init = gimple_build (&stmts, FLOAT_EXPR, vectype, index);
8000 vec_init = gimple_build (&stmts, MULT_EXPR, vectype,
8001 vec_init, step_vec);
8002 vec_init = gimple_build (&stmts, PLUS_EXPR, vectype,
8003 vec_init, base_vec);
8006 if (stmts)
8008 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
8009 gcc_assert (!new_bb);
8014 /* Create the vector that holds the step of the induction. */
8015 if (nested_in_vect_loop)
8016 /* iv_loop is nested in the loop to be vectorized. Generate:
8017 vec_step = [S, S, S, S] */
8018 new_name = step_expr;
8019 else
8021 /* iv_loop is the loop to be vectorized. Generate:
8022 vec_step = [VF*S, VF*S, VF*S, VF*S] */
8023 gimple_seq seq = NULL;
8024 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8026 expr = build_int_cst (integer_type_node, vf);
8027 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8029 else
8030 expr = build_int_cst (TREE_TYPE (step_expr), vf);
8031 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8032 expr, step_expr);
8033 if (seq)
8035 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8036 gcc_assert (!new_bb);
8040 t = unshare_expr (new_name);
8041 gcc_assert (CONSTANT_CLASS_P (new_name)
8042 || TREE_CODE (new_name) == SSA_NAME);
8043 new_vec = build_vector_from_val (vectype, t);
8044 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8047 /* Create the following def-use cycle:
8048 loop prolog:
8049 vec_init = ...
8050 vec_step = ...
8051 loop:
8052 vec_iv = PHI <vec_init, vec_loop>
8054 STMT
8056 vec_loop = vec_iv + vec_step; */
8058 /* Create the induction-phi that defines the induction-operand. */
8059 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
8060 induction_phi = create_phi_node (vec_dest, iv_loop->header);
8061 set_vinfo_for_stmt (induction_phi,
8062 new_stmt_vec_info (induction_phi, loop_vinfo));
8063 induc_def = PHI_RESULT (induction_phi);
8065 /* Create the iv update inside the loop */
8066 vec_def = make_ssa_name (vec_dest);
8067 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
8068 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8069 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
8071 /* Set the arguments of the phi node: */
8072 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
8073 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
8074 UNKNOWN_LOCATION);
8076 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
8078 /* In case the vectorization factor (VF) is bigger than the number
8079 of elements that we can fit in a vectype (nunits), we have to generate
8080 more than one vector stmt, i.e. we need to "unroll" the
8081 vector stmt by a factor VF/nunits. For more details see documentation
8082 in vectorizable_operation. */
8084 if (ncopies > 1)
8086 gimple_seq seq = NULL;
8087 stmt_vec_info prev_stmt_vinfo;
8088 /* FORNOW. This restriction should be relaxed. */
8089 gcc_assert (!nested_in_vect_loop);
8091 /* Create the vector that holds the step of the induction. */
8092 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
8094 expr = build_int_cst (integer_type_node, nunits);
8095 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
8097 else
8098 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
8099 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
8100 expr, step_expr);
8101 if (seq)
8103 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
8104 gcc_assert (!new_bb);
8107 t = unshare_expr (new_name);
8108 gcc_assert (CONSTANT_CLASS_P (new_name)
8109 || TREE_CODE (new_name) == SSA_NAME);
8110 new_vec = build_vector_from_val (vectype, t);
8111 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
8113 vec_def = induc_def;
8114 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
8115 for (i = 1; i < ncopies; i++)
8117 /* vec_i = vec_prev + vec_step */
8118 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
8119 vec_def, vec_step);
8120 vec_def = make_ssa_name (vec_dest, new_stmt);
8121 gimple_assign_set_lhs (new_stmt, vec_def);
8123 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
8124 set_vinfo_for_stmt (new_stmt,
8125 new_stmt_vec_info (new_stmt, loop_vinfo));
8126 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
8127 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
8131 if (nested_in_vect_loop)
8133 /* Find the loop-closed exit-phi of the induction, and record
8134 the final vector of induction results: */
8135 exit_phi = NULL;
8136 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
8138 gimple *use_stmt = USE_STMT (use_p);
8139 if (is_gimple_debug (use_stmt))
8140 continue;
8142 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
8144 exit_phi = use_stmt;
8145 break;
8148 if (exit_phi)
8150 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
8151 /* FORNOW. Currently not supporting the case that an inner-loop induction
8152 is not used in the outer-loop (i.e. only outside the outer-loop). */
8153 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
8154 && !STMT_VINFO_LIVE_P (stmt_vinfo));
8156 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
8157 if (dump_enabled_p ())
8159 dump_printf_loc (MSG_NOTE, vect_location,
8160 "vector of inductions after inner-loop:");
8161 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
8167 if (dump_enabled_p ())
8169 dump_printf_loc (MSG_NOTE, vect_location,
8170 "transform induction: created def-use cycle: ");
8171 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
8172 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8173 SSA_NAME_DEF_STMT (vec_def), 0);
8176 return true;
8179 /* Function vectorizable_live_operation.
8181 STMT computes a value that is used outside the loop. Check if
8182 it can be supported. */
8184 bool
8185 vectorizable_live_operation (gimple *stmt,
8186 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
8187 slp_tree slp_node, int slp_index,
8188 gimple **vec_stmt)
8190 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
8191 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
8192 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8193 imm_use_iterator imm_iter;
8194 tree lhs, lhs_type, bitsize, vec_bitsize;
8195 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
8196 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (vectype);
8197 int ncopies;
8198 gimple *use_stmt;
8199 auto_vec<tree> vec_oprnds;
8200 int vec_entry = 0;
8201 poly_uint64 vec_index = 0;
8203 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
8205 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
8206 return false;
8208 /* FORNOW. CHECKME. */
8209 if (nested_in_vect_loop_p (loop, stmt))
8210 return false;
8212 /* If STMT is not relevant and it is a simple assignment and its inputs are
8213 invariant then it can remain in place, unvectorized. The original last
8214 scalar value that it computes will be used. */
8215 if (!STMT_VINFO_RELEVANT_P (stmt_info))
8217 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
8218 if (dump_enabled_p ())
8219 dump_printf_loc (MSG_NOTE, vect_location,
8220 "statement is simple and uses invariant. Leaving in "
8221 "place.\n");
8222 return true;
8225 if (slp_node)
8226 ncopies = 1;
8227 else
8228 ncopies = vect_get_num_copies (loop_vinfo, vectype);
8230 if (slp_node)
8232 gcc_assert (slp_index >= 0);
8234 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
8235 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
8237 /* Get the last occurrence of the scalar index from the concatenation of
8238 all the slp vectors. Calculate which slp vector it is and the index
8239 within. */
8240 poly_uint64 pos = (num_vec * nunits) - num_scalar + slp_index;
8242 /* Calculate which vector contains the result, and which lane of
8243 that vector we need. */
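/* Hedged numeric example (sizes assumed): with num_vec = 2, nunits = 4 and
   num_scalar = 2, the concatenated lanes hold s0,s1,s0,s1,s0,s1,s0,s1; for
   slp_index = 0 this gives pos = 2*4 - 2 + 0 = 6, hence vec_entry = 1 and
   vec_index = 2, i.e. the last occurrence of s0 is lane 2 of the second
   vector.  */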
8244 if (!can_div_trunc_p (pos, nunits, &vec_entry, &vec_index))
8246 if (dump_enabled_p ())
8247 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8248 "Cannot determine which vector holds the"
8249 " final result.\n");
8250 return false;
8254 if (!vec_stmt)
8256 /* No transformation required. */
8257 if (LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo))
8259 if (!direct_internal_fn_supported_p (IFN_EXTRACT_LAST, vectype,
8260 OPTIMIZE_FOR_SPEED))
8262 if (dump_enabled_p ())
8263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8264 "can't use a fully-masked loop because "
8265 "the target doesn't support extract last "
8266 "reduction.\n");
8267 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8269 else if (slp_node)
8271 if (dump_enabled_p ())
8272 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8273 "can't use a fully-masked loop because an "
8274 "SLP statement is live after the loop.\n");
8275 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8277 else if (ncopies > 1)
8279 if (dump_enabled_p ())
8280 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
8281 "can't use a fully-masked loop because"
8282 " ncopies is greater than 1.\n");
8283 LOOP_VINFO_CAN_FULLY_MASK_P (loop_vinfo) = false;
8285 else
8287 gcc_assert (ncopies == 1 && !slp_node);
8288 vect_record_loop_mask (loop_vinfo,
8289 &LOOP_VINFO_MASKS (loop_vinfo),
8290 1, vectype);
8293 return true;
8296 /* If stmt has a related stmt, then use that for getting the lhs. */
8297 if (is_pattern_stmt_p (stmt_info))
8298 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
8300 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
8301 : gimple_get_lhs (stmt);
8302 lhs_type = TREE_TYPE (lhs);
8304 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
8305 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
8306 : TYPE_SIZE (TREE_TYPE (vectype)));
8307 vec_bitsize = TYPE_SIZE (vectype);
8309 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
8310 tree vec_lhs, bitstart;
8311 if (slp_node)
8313 gcc_assert (!LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8315 /* Get the correct slp vectorized stmt. */
8316 gimple *vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
8317 if (gphi *phi = dyn_cast <gphi *> (vec_stmt))
8318 vec_lhs = gimple_phi_result (phi);
8319 else
8320 vec_lhs = gimple_get_lhs (vec_stmt);
8322 /* Get entry to use. */
8323 bitstart = bitsize_int (vec_index);
8324 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
8326 else
8328 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
8329 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
8330 gcc_checking_assert (ncopies == 1
8331 || !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo));
8333 /* For multiple copies, get the last copy. */
8334 for (int i = 1; i < ncopies; ++i)
8335 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
8336 vec_lhs);
8338 /* Get the last lane in the vector. */
8339 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
8342 gimple_seq stmts = NULL;
8343 tree new_tree;
8344 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
8346 /* Emit:
8348 SCALAR_RES = EXTRACT_LAST <VEC_LHS, MASK>
8350 where VEC_LHS is the vectorized live-out result and MASK is
8351 the loop mask for the final iteration. */
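/* Hedged sketch (lane count assumed): for 4-element vectors where only the
   first three lanes are active in the final iteration, MASK = {1,1,1,0} and

     SCALAR_RES = EXTRACT_LAST <VEC_LHS, {1,1,1,0}>

   yields lane 2 of VEC_LHS, the value produced by the last scalar
   iteration.  */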
8352 gcc_assert (ncopies == 1 && !slp_node);
8353 tree scalar_type = TREE_TYPE (STMT_VINFO_VECTYPE (stmt_info));
8354 tree scalar_res = make_ssa_name (scalar_type);
8355 tree mask = vect_get_loop_mask (gsi, &LOOP_VINFO_MASKS (loop_vinfo),
8356 1, vectype, 0);
8357 gcall *new_stmt = gimple_build_call_internal (IFN_EXTRACT_LAST,
8358 2, mask, vec_lhs);
8359 gimple_call_set_lhs (new_stmt, scalar_res);
8360 gimple_seq_add_stmt (&stmts, new_stmt);
8362 /* Convert the extracted vector element to the required scalar type. */
8363 new_tree = gimple_convert (&stmts, lhs_type, scalar_res);
8365 else
8367 tree bftype = TREE_TYPE (vectype);
8368 if (VECTOR_BOOLEAN_TYPE_P (vectype))
8369 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
8370 new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
8371 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree),
8372 &stmts, true, NULL_TREE);
8375 if (stmts)
8376 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
8378 /* Replace uses of lhs with the newly computed result. If the use stmt is
8379 a single-argument PHI, just replace all uses of the PHI result. This is
8380 necessary because the lcssa PHI defining lhs may come before the newly inserted stmt. */
8381 use_operand_p use_p;
8382 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
8383 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
8384 && !is_gimple_debug (use_stmt))
8386 if (gimple_code (use_stmt) == GIMPLE_PHI
8387 && gimple_phi_num_args (use_stmt) == 1)
8389 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
8391 else
8393 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
8394 SET_USE (use_p, new_tree);
8396 update_stmt (use_stmt);
8399 return true;
8402 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
8404 static void
8405 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
8407 ssa_op_iter op_iter;
8408 imm_use_iterator imm_iter;
8409 def_operand_p def_p;
8410 gimple *ustmt;
8412 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
8414 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
8416 basic_block bb;
8418 if (!is_gimple_debug (ustmt))
8419 continue;
8421 bb = gimple_bb (ustmt);
8423 if (!flow_bb_inside_loop_p (loop, bb))
8425 if (gimple_debug_bind_p (ustmt))
8427 if (dump_enabled_p ())
8428 dump_printf_loc (MSG_NOTE, vect_location,
8429 "killing debug use\n");
8431 gimple_debug_bind_reset_value (ustmt);
8432 update_stmt (ustmt);
8434 else
8435 gcc_unreachable ();
8441 /* Given loop represented by LOOP_VINFO, return true if computation of
8442 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
8443 otherwise. */
8445 static bool
8446 loop_niters_no_overflow (loop_vec_info loop_vinfo)
8448 /* Constant case. */
8449 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8451 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
8452 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
8454 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
8455 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
8456 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
8457 return true;
8460 widest_int max;
8461 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8462 /* Check the upper bound of loop niters. */
8463 if (get_max_loop_iterations (loop, &max))
8465 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
8466 signop sgn = TYPE_SIGN (type);
8467 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
8468 if (max < type_max)
8469 return true;
8471 return false;
8474 /* Return a mask type with half the number of elements as TYPE. */
8476 tree
8477 vect_halve_mask_nunits (tree type)
8479 poly_uint64 nunits = exact_div (TYPE_VECTOR_SUBPARTS (type), 2);
8480 return build_truth_vector_type (nunits, current_vector_size);
8483 /* Return a mask type with twice as many elements as TYPE. */
8485 tree
8486 vect_double_mask_nunits (tree type)
8488 poly_uint64 nunits = TYPE_VECTOR_SUBPARTS (type) * 2;
8489 return build_truth_vector_type (nunits, current_vector_size);
8492 /* Record that a fully-masked version of LOOP_VINFO would need MASKS to
8493 contain a sequence of NVECTORS masks that each control a vector of type
8494 VECTYPE. */
8496 void
8497 vect_record_loop_mask (loop_vec_info loop_vinfo, vec_loop_masks *masks,
8498 unsigned int nvectors, tree vectype)
8500 gcc_assert (nvectors != 0);
8501 if (masks->length () < nvectors)
8502 masks->safe_grow_cleared (nvectors);
8503 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8504 /* The number of scalars per iteration and the number of vectors are
8505 both compile-time constants. */
8506 unsigned int nscalars_per_iter
8507 = exact_div (nvectors * TYPE_VECTOR_SUBPARTS (vectype),
8508 LOOP_VINFO_VECT_FACTOR (loop_vinfo)).to_constant ();
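/* Hedged worked example (shapes assumed): with a vectorization factor of 8,
   recording NVECTORS = 2 masks for a VECTYPE of 8 elements gives
   nscalars_per_iter = 2 * 8 / 8 = 2, i.e. the rgroup at index 1 controls
   two scalars per scalar iteration and its mask type has the same number of
   elements as VECTYPE.  */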
8509 if (rgm->max_nscalars_per_iter < nscalars_per_iter)
8511 rgm->max_nscalars_per_iter = nscalars_per_iter;
8512 rgm->mask_type = build_same_sized_truth_vector_type (vectype);
8516 /* Given a complete set of masks MASKS, extract mask number INDEX
8517 for an rgroup that operates on NVECTORS vectors of type VECTYPE,
8518 where 0 <= INDEX < NVECTORS. Insert any set-up statements before GSI.
8520 See the comment above vec_loop_masks for more details about the mask
8521 arrangement. */
8523 tree
8524 vect_get_loop_mask (gimple_stmt_iterator *gsi, vec_loop_masks *masks,
8525 unsigned int nvectors, tree vectype, unsigned int index)
8527 rgroup_masks *rgm = &(*masks)[nvectors - 1];
8528 tree mask_type = rgm->mask_type;
8530 /* Populate the rgroup's mask array, if this is the first time we've
8531 used it. */
8532 if (rgm->masks.is_empty ())
8534 rgm->masks.safe_grow_cleared (nvectors);
8535 for (unsigned int i = 0; i < nvectors; ++i)
8537 tree mask = make_temp_ssa_name (mask_type, NULL, "loop_mask");
8538 /* Provide a dummy definition until the real one is available. */
8539 SSA_NAME_DEF_STMT (mask) = gimple_build_nop ();
8540 rgm->masks[i] = mask;
8544 tree mask = rgm->masks[index];
8545 if (maybe_ne (TYPE_VECTOR_SUBPARTS (mask_type),
8546 TYPE_VECTOR_SUBPARTS (vectype)))
8548 /* A loop mask for data type X can be reused for data type Y
8549 if X has N times more elements than Y and if Y's elements
8550 are N times bigger than X's. In this case each sequence
8551 of N elements in the loop mask will be all-zero or all-one.
8552 We can then view-convert the mask so that each sequence of
8553 N elements is replaced by a single element. */
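/* Hedged example (element counts assumed): a mask built for 8 x HImode data
   can be reused for 4 x SImode data, since each adjacent pair of mask
   elements is guaranteed to be identical; the VIEW_CONVERT below then turns
   the 8-element boolean vector into the 4-element boolean vector matching
   VECTYPE.  */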
8554 gcc_assert (multiple_p (TYPE_VECTOR_SUBPARTS (mask_type),
8555 TYPE_VECTOR_SUBPARTS (vectype)));
8556 gimple_seq seq = NULL;
8557 mask_type = build_same_sized_truth_vector_type (vectype);
8558 mask = gimple_build (&seq, VIEW_CONVERT_EXPR, mask_type, mask);
8559 if (seq)
8560 gsi_insert_seq_before (gsi, seq, GSI_SAME_STMT);
8562 return mask;
8565 /* Scale profiling counters by estimation for LOOP which is vectorized
8566 by factor VF. */
8568 static void
8569 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
8571 edge preheader = loop_preheader_edge (loop);
8572 /* Reduce loop iterations by the vectorization factor. */
8573 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
8574 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
8576 if (freq_h.nonzero_p ())
8578 profile_probability p;
8580 /* Avoid dropping loop body profile counter to 0 because of zero count
8581 in loop's preheader. */
8582 if (!(freq_e == profile_count::zero ()))
8583 freq_e = freq_e.force_nonzero ();
8584 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
8585 scale_loop_frequencies (loop, p);
8588 edge exit_e = single_exit (loop);
8589 exit_e->probability = profile_probability::always ()
8590 .apply_scale (1, new_est_niter + 1);
8592 edge exit_l = single_pred_edge (loop->latch);
8593 profile_probability prob = exit_l->probability;
8594 exit_l->probability = exit_e->probability.invert ();
8595 if (prob.initialized_p () && exit_l->probability.initialized_p ())
8596 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
8599 /* Function vect_transform_loop.
8601 The analysis phase has determined that the loop is vectorizable.
8602 Vectorize the loop: create vectorized stmts to replace the scalar
8603 stmts in the loop, and update the loop exit condition.
8604 Returns the scalar epilogue loop, if any. */
8606 struct loop *
8607 vect_transform_loop (loop_vec_info loop_vinfo)
8609 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
8610 struct loop *epilogue = NULL;
8611 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
8612 int nbbs = loop->num_nodes;
8613 int i;
8614 tree niters_vector = NULL_TREE;
8615 tree step_vector = NULL_TREE;
8616 tree niters_vector_mult_vf = NULL_TREE;
8617 poly_uint64 vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
8618 unsigned int lowest_vf = constant_lower_bound (vf);
8619 bool grouped_store;
8620 bool slp_scheduled = false;
8621 gimple *stmt, *pattern_stmt;
8622 gimple_seq pattern_def_seq = NULL;
8623 gimple_stmt_iterator pattern_def_si = gsi_none ();
8624 bool transform_pattern_stmt = false;
8625 bool check_profitability = false;
8626 unsigned int th;
8628 if (dump_enabled_p ())
8629 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
8631 /* Use the more conservative vectorization threshold. If the number
8632 of iterations is constant, assume the cost check has been performed
8633 by our caller. If the threshold makes all loops profitable that
8634 run at least the (estimated) vectorization factor number of times,
8635 checking is pointless, too. */
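/* Hedged numeric reading of the code below (numbers assumed): with
   vect_vf_for_cost == 4 and an unknown iteration count, a threshold of,
   say, 10 leads to a runtime profitability check, whereas a threshold
   smaller than 4 is already implied by entering the vector loop and no
   check is requested here.  */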
8636 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
8637 if (th >= vect_vf_for_cost (loop_vinfo)
8638 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
8640 if (dump_enabled_p ())
8641 dump_printf_loc (MSG_NOTE, vect_location,
8642 "Profitability threshold is %d loop iterations.\n",
8643 th);
8644 check_profitability = true;
8647 /* Make sure there exists a single-predecessor exit bb. Do this before
8648 versioning. */
8649 edge e = single_exit (loop);
8650 if (! single_pred_p (e->dest))
8652 split_loop_exit_edge (e);
8653 if (dump_enabled_p ())
8654 dump_printf (MSG_NOTE, "split exit edge\n");
8657 /* Version the loop first, if required, so the profitability check
8658 comes first. */
8660 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
8662 poly_uint64 versioning_threshold
8663 = LOOP_VINFO_VERSIONING_THRESHOLD (loop_vinfo);
8664 if (check_profitability
8665 && ordered_p (poly_uint64 (th), versioning_threshold))
8667 versioning_threshold = ordered_max (poly_uint64 (th),
8668 versioning_threshold);
8669 check_profitability = false;
8671 vect_loop_versioning (loop_vinfo, th, check_profitability,
8672 versioning_threshold);
8673 check_profitability = false;
8676 /* Make sure there exists a single-predecessor exit bb also on the
8677 scalar loop copy. Do this after versioning but before peeling
8678 so the CFG structure is fine for both the scalar and the if-converted
8679 loop, and slpeel_duplicate_current_defs_from_edges sees matched
8680 loop-closed PHI nodes on the exit. */
8681 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
8683 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
8684 if (! single_pred_p (e->dest))
8686 split_loop_exit_edge (e);
8687 if (dump_enabled_p ())
8688 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
8692 tree niters = vect_build_loop_niters (loop_vinfo);
8693 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
8694 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
8695 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
8696 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector,
8697 &step_vector, &niters_vector_mult_vf, th,
8698 check_profitability, niters_no_overflow);
8700 if (niters_vector == NULL_TREE)
8702 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
8703 && !LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8704 && known_eq (lowest_vf, vf))
8706 niters_vector
8707 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
8708 LOOP_VINFO_INT_NITERS (loop_vinfo) / lowest_vf);
8709 step_vector = build_one_cst (TREE_TYPE (niters));
8711 else
8712 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
8713 &step_vector, niters_no_overflow);
8716 /* 1) Make sure the loop header has exactly two entries
8717 2) Make sure we have a preheader basic block. */
8719 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
8721 split_edge (loop_preheader_edge (loop));
8723 if (LOOP_VINFO_FULLY_MASKED_P (loop_vinfo)
8724 && vect_use_loop_mask_for_alignment_p (loop_vinfo))
8725 /* This will deal with any possible peeling. */
8726 vect_prepare_for_masked_peels (loop_vinfo);
8728 /* FORNOW: the vectorizer supports only loops whose body consists
8729 of one basic block (header + empty latch). When the vectorizer
8730 supports more involved loop forms, the order by which the BBs are
8731 traversed will need to be reconsidered. */
8733 for (i = 0; i < nbbs; i++)
8735 basic_block bb = bbs[i];
8736 stmt_vec_info stmt_info;
8738 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
8739 gsi_next (&si))
8741 gphi *phi = si.phi ();
8742 if (dump_enabled_p ())
8744 dump_printf_loc (MSG_NOTE, vect_location,
8745 "------>vectorizing phi: ");
8746 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
8748 stmt_info = vinfo_for_stmt (phi);
8749 if (!stmt_info)
8750 continue;
8752 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8753 vect_loop_kill_debug_uses (loop, phi);
8755 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8756 && !STMT_VINFO_LIVE_P (stmt_info))
8757 continue;
8759 if (STMT_VINFO_VECTYPE (stmt_info)
8760 && (maybe_ne
8761 (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info)), vf))
8762 && dump_enabled_p ())
8763 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8765 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
8766 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
8767 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
8768 && ! PURE_SLP_STMT (stmt_info))
8770 if (dump_enabled_p ())
8771 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
8772 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
8776 pattern_stmt = NULL;
8777 for (gimple_stmt_iterator si = gsi_start_bb (bb);
8778 !gsi_end_p (si) || transform_pattern_stmt;)
8780 bool is_store;
8782 if (transform_pattern_stmt)
8783 stmt = pattern_stmt;
8784 else
8786 stmt = gsi_stmt (si);
8787 /* During vectorization remove existing clobber stmts. */
8788 if (gimple_clobber_p (stmt))
8790 unlink_stmt_vdef (stmt);
8791 gsi_remove (&si, true);
8792 release_defs (stmt);
8793 continue;
8797 if (dump_enabled_p ())
8799 dump_printf_loc (MSG_NOTE, vect_location,
8800 "------>vectorizing statement: ");
8801 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
8804 stmt_info = vinfo_for_stmt (stmt);
8806 /* vector stmts created in the outer-loop during vectorization of
8807 stmts in an inner-loop may not have a stmt_info, and do not
8808 need to be vectorized. */
8809 if (!stmt_info)
8811 gsi_next (&si);
8812 continue;
8815 if (MAY_HAVE_DEBUG_BIND_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
8816 vect_loop_kill_debug_uses (loop, stmt);
8818 if (!STMT_VINFO_RELEVANT_P (stmt_info)
8819 && !STMT_VINFO_LIVE_P (stmt_info))
8821 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8822 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8823 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8824 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8826 stmt = pattern_stmt;
8827 stmt_info = vinfo_for_stmt (stmt);
8829 else
8831 gsi_next (&si);
8832 continue;
8835 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
8836 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
8837 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
8838 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
8839 transform_pattern_stmt = true;
8841 /* If pattern statement has def stmts, vectorize them too. */
8842 if (is_pattern_stmt_p (stmt_info))
8844 if (pattern_def_seq == NULL)
8846 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
8847 pattern_def_si = gsi_start (pattern_def_seq);
8849 else if (!gsi_end_p (pattern_def_si))
8850 gsi_next (&pattern_def_si);
8851 if (pattern_def_seq != NULL)
8853 gimple *pattern_def_stmt = NULL;
8854 stmt_vec_info pattern_def_stmt_info = NULL;
8856 while (!gsi_end_p (pattern_def_si))
8858 pattern_def_stmt = gsi_stmt (pattern_def_si);
8859 pattern_def_stmt_info
8860 = vinfo_for_stmt (pattern_def_stmt);
8861 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
8862 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
8863 break;
8864 gsi_next (&pattern_def_si);
8867 if (!gsi_end_p (pattern_def_si))
8869 if (dump_enabled_p ())
8871 dump_printf_loc (MSG_NOTE, vect_location,
8872 "==> vectorizing pattern def "
8873 "stmt: ");
8874 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
8875 pattern_def_stmt, 0);
8878 stmt = pattern_def_stmt;
8879 stmt_info = pattern_def_stmt_info;
8881 else
8883 pattern_def_si = gsi_none ();
8884 transform_pattern_stmt = false;
8887 else
8888 transform_pattern_stmt = false;
8891 if (STMT_VINFO_VECTYPE (stmt_info))
8893 poly_uint64 nunits
8894 = TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
8895 if (!STMT_SLP_TYPE (stmt_info)
8896 && maybe_ne (nunits, vf)
8897 && dump_enabled_p ())
8898 /* For SLP, VF is set according to the unrolling factor, not to the
8899 vector size, hence this diagnostic is not valid for SLP.  */
8900 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
8903 /* SLP. Schedule all the SLP instances when the first SLP stmt is
8904 reached. */
8905 if (STMT_SLP_TYPE (stmt_info))
8907 if (!slp_scheduled)
8909 slp_scheduled = true;
8911 if (dump_enabled_p ())
8912 dump_printf_loc (MSG_NOTE, vect_location,
8913 "=== scheduling SLP instances ===\n");
8915 vect_schedule_slp (loop_vinfo);
8918 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
8919 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
8921 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8923 pattern_def_seq = NULL;
8924 gsi_next (&si);
8926 continue;
8930 /* -------- vectorize statement ------------ */
8931 if (dump_enabled_p ())
8932 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
8934 grouped_store = false;
8935 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
8936 if (is_store)
8938 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
8940 /* Interleaving.  If IS_STORE is TRUE, the vectorization of the
8941 interleaving chain has been completed; free all the scalar stores
8942 in the chain.  */
8943 gsi_next (&si);
8944 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
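/* Illustrative example (hypothetical): for an interleaved group such as
     a[2*i] = x_1;  a[2*i+1] = y_2;
   the whole chain is typically vectorized once its last scalar store is
   reached, so at this point every scalar store of the group can be
   removed.  */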
8946 else
8948 /* Free the attached stmt_vec_info and remove the stmt. */
8949 gimple *store = gsi_stmt (si);
8950 free_stmt_vec_info (store);
8951 unlink_stmt_vdef (store);
8952 gsi_remove (&si, true);
8953 release_defs (store);
8956 /* Stores can only appear at the end of pattern statements. */
8957 gcc_assert (!transform_pattern_stmt);
8958 pattern_def_seq = NULL;
8960 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
8962 pattern_def_seq = NULL;
8963 gsi_next (&si);
8965 } /* stmts in BB */
8967 /* Stub out scalar statements that must not survive vectorization.
8968 Doing this here helps with grouped statements, or statements that
8969 are involved in patterns. */
8970 for (gimple_stmt_iterator gsi = gsi_start_bb (bb);
8971 !gsi_end_p (gsi); gsi_next (&gsi))
8973 gcall *call = dyn_cast <gcall *> (gsi_stmt (gsi));
8974 if (call && gimple_call_internal_p (call, IFN_MASK_LOAD))
8976 tree lhs = gimple_get_lhs (call);
8977 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
8979 tree zero = build_zero_cst (TREE_TYPE (lhs));
8980 gimple *new_stmt = gimple_build_assign (lhs, zero);
8981 gsi_replace (&gsi, new_stmt, true);
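/* Illustrative example (hypothetical SSA names): a left-over scalar
     _5 = MASK_LOAD (p_3, 0B, mask_7);
   whose result was never vectorized is replaced here by
     _5 = 0;
   so that no scalar IFN_MASK_LOAD call survives vectorization.  */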
8985 } /* BBs in loop */
8987 /* The vectorization factor is always > 1, so if we use an IV increment of 1,
8988 a zero NITERS becomes a nonzero NITERS_VECTOR.  */
8989 if (integer_onep (step_vector))
8990 niters_no_overflow = true;
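/* One illustrative reading (editorial, hypothetical numbers): if the
   scalar NITERS was computed in a 32-bit type and wrapped to 0 (i.e.
   2^32 iterations), then with VF == 4 and a step of 1 the vector loop
   runs NITERS_VECTOR == 2^30 times, which is nonzero, so its IV cannot
   wrap and niters_no_overflow can be set.  */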
8991 vect_set_loop_condition (loop, loop_vinfo, niters_vector, step_vector,
8992 niters_vector_mult_vf, !niters_no_overflow);
8994 unsigned int assumed_vf = vect_vf_for_cost (loop_vinfo);
8995 scale_profile_for_vect_loop (loop, assumed_vf);
8997 /* True if the final iteration might not handle a full vector's
8998 worth of scalar iterations. */
8999 bool final_iter_may_be_partial = LOOP_VINFO_FULLY_MASKED_P (loop_vinfo);
9000 /* The minimum number of iterations performed by the epilogue. This
9001 is 1 when peeling for gaps because we always need a final scalar
9002 iteration. */
9003 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
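/* Illustrative example (hypothetical): for a grouped load with a gap,
   e.g. one that uses only a[3*i] and a[3*i+1] out of each group of
   three, the vector code may touch memory beyond what the last scalar
   iteration needs, so that iteration is always left to the epilogue.  */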
9004 /* +1 to convert latch counts to loop iteration counts,
9005 -min_epilogue_iters to remove iterations that cannot be performed
9006 by the vector code. */
9007 int bias_for_lowest = 1 - min_epilogue_iters;
9008 int bias_for_assumed = bias_for_lowest;
9009 int alignment_npeels = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
9010 if (alignment_npeels && LOOP_VINFO_FULLY_MASKED_P (loop_vinfo))
9012 /* When the amount of peeling is known at compile time, the first
9013 iteration will have exactly alignment_npeels active elements.
9014 In the worst case it will have at least one. */
9015 int min_first_active = (alignment_npeels > 0 ? alignment_npeels : 1);
9016 bias_for_lowest += lowest_vf - min_first_active;
9017 bias_for_assumed += assumed_vf - min_first_active;
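/* Worked example (hypothetical numbers): with lowest_vf == assumed_vf == 4,
   no peeling for gaps (min_epilogue_iters == 0) and a known alignment
   peeling of 2, min_first_active == 2 and bias_for_lowest == 1 + 4 - 2 == 3.
   A fully-masked loop with a scalar latch bound of 9 (10 iterations) then
   covers 2 elements in its first vector iteration and 4 in each of the next
   two, i.e. 3 vector iterations, matching ceil ((9 + 3) / 4) - 1 == 2
   below.  */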
9019 /* In these calculations the "- 1" converts loop iteration counts
9020 back to latch counts. */
9021 if (loop->any_upper_bound)
9022 loop->nb_iterations_upper_bound
9023 = (final_iter_may_be_partial
9024 ? wi::udiv_ceil (loop->nb_iterations_upper_bound + bias_for_lowest,
9025 lowest_vf) - 1
9026 : wi::udiv_floor (loop->nb_iterations_upper_bound + bias_for_lowest,
9027 lowest_vf) - 1);
9028 if (loop->any_likely_upper_bound)
9029 loop->nb_iterations_likely_upper_bound
9030 = (final_iter_may_be_partial
9031 ? wi::udiv_ceil (loop->nb_iterations_likely_upper_bound
9032 + bias_for_lowest, lowest_vf) - 1
9033 : wi::udiv_floor (loop->nb_iterations_likely_upper_bound
9034 + bias_for_lowest, lowest_vf) - 1);
9035 if (loop->any_estimate)
9036 loop->nb_iterations_estimate
9037 = (final_iter_may_be_partial
9038 ? wi::udiv_ceil (loop->nb_iterations_estimate + bias_for_assumed,
9039 assumed_vf) - 1
9040 : wi::udiv_floor (loop->nb_iterations_estimate + bias_for_assumed,
9041 assumed_vf) - 1);
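/* Worked example (hypothetical numbers): without peeling for alignment or
   gaps, bias_for_lowest == 1.  A not-fully-masked loop with lowest_vf == 4
   and a scalar latch bound of 10 (11 iterations) gets a vector latch bound
   of floor ((10 + 1) / 4) - 1 == 1, i.e. 2 full vector iterations, with the
   remaining 3 scalar iterations left to the epilogue.  */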
9043 if (dump_enabled_p ())
9045 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9047 dump_printf_loc (MSG_NOTE, vect_location,
9048 "LOOP VECTORIZED\n");
9049 if (loop->inner)
9050 dump_printf_loc (MSG_NOTE, vect_location,
9051 "OUTER LOOP VECTORIZED\n");
9052 dump_printf (MSG_NOTE, "\n");
9054 else
9056 dump_printf_loc (MSG_NOTE, vect_location,
9057 "LOOP EPILOGUE VECTORIZED (VS=");
9058 dump_dec (MSG_NOTE, current_vector_size);
9059 dump_printf (MSG_NOTE, ")\n");
9063 /* Free SLP instances here because otherwise stmt reference counting
9064 won't work. */
9065 slp_instance instance;
9066 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
9067 vect_free_slp_instance (instance);
9068 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
9069 /* Clear up the safelen field, since its value is invalid after
9070 vectorization: the vectorized loop can have loop-carried dependencies.  */
9071 loop->safelen = 0;
9073 /* Don't vectorize the epilogue of an epilogue loop.  */
9074 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
9075 epilogue = NULL;
9077 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
9078 epilogue = NULL;
9080 if (epilogue)
9082 auto_vector_sizes vector_sizes;
9083 targetm.vectorize.autovectorize_vector_sizes (&vector_sizes);
9084 unsigned int next_size = 0;
9086 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
9087 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0
9088 && known_eq (vf, lowest_vf))
9090 unsigned int eiters
9091 = (LOOP_VINFO_INT_NITERS (loop_vinfo)
9092 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo));
9093 eiters = eiters % lowest_vf;
9094 epilogue->nb_iterations_upper_bound = eiters - 1;
9096 unsigned int ratio;
9097 while (next_size < vector_sizes.length ()
9098 && !(constant_multiple_p (current_vector_size,
9099 vector_sizes[next_size], &ratio)
9100 && eiters >= lowest_vf / ratio))
9101 next_size += 1;
9103 else
9104 while (next_size < vector_sizes.length ()
9105 && maybe_lt (current_vector_size, vector_sizes[next_size]))
9106 next_size += 1;
9108 if (next_size == vector_sizes.length ())
9109 epilogue = NULL;
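/* Worked example (hypothetical, assuming the target lists 64-, 32- and
   16-byte vector sizes in that order): with known NITERS == 103, 3
   iterations peeled for alignment and lowest_vf == 16, the epilogue runs
   eiters == (103 - 3) % 16 == 4 scalar iterations.  64- and 32-byte
   vectors would need at least 16 resp. 8 of them, but 16-byte vectors
   (ratio == 4, lowest_vf / ratio == 4) fit, so the epilogue is
   re-vectorized with 16-byte vectors; if no listed size fits, the
   epilogue is dropped as above.  */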
9112 if (epilogue)
9114 epilogue->force_vectorize = loop->force_vectorize;
9115 epilogue->safelen = loop->safelen;
9116 epilogue->dont_vectorize = false;
9118 /* We may need to if-convert the epilogue to vectorize it.  */
9119 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
9120 tree_if_conversion (epilogue);
9123 return epilogue;
9126 /* The code below performs a simple optimization: it reverts if-conversion
9127 for masked stores, i.e. if the mask of a store is zero, the store is not
9128 executed, and neither are the producers of the stored values, if possible.
9129 For example,
9130 for (i=0; i<n; i++)
9131 if (c[i])
9132 {
9133 p1[i] += 1;
9134 p2[i] = p3[i] + 2;
9135 }
9136 this transformation will produce the following semi-hammock:
9138 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
9139 {
9140 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
9141 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
9142 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
9143 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
9144 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
9145 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
9146 }
9147 */
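/* For comparison (editorial illustration, same hypothetical SSA names as
   above): before this optimization the if-converted, vectorized body is
   executed unconditionally, e.g.
     vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
     vect__12.22_172 = vect__11.19_170 + vect_cst__171;
     MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
     ...
   optimize_mask_stores sinks the masked stores, and where possible the
   producers of the stored values, behind the mask test shown above, so
   nothing is executed when the mask is all zeros.  */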
9149 void
9150 optimize_mask_stores (struct loop *loop)
9152 basic_block *bbs = get_loop_body (loop);
9153 unsigned nbbs = loop->num_nodes;
9154 unsigned i;
9155 basic_block bb;
9156 struct loop *bb_loop;
9157 gimple_stmt_iterator gsi;
9158 gimple *stmt;
9159 auto_vec<gimple *> worklist;
9161 vect_location = find_loop_location (loop);
9162 /* Pick up all masked stores in the loop, if any.  */
9163 for (i = 0; i < nbbs; i++)
9165 bb = bbs[i];
9166 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
9167 gsi_next (&gsi))
9169 stmt = gsi_stmt (gsi);
9170 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
9171 worklist.safe_push (stmt);
9175 free (bbs);
9176 if (worklist.is_empty ())
9177 return;
9179 /* Loop has masked stores. */
9180 while (!worklist.is_empty ())
9182 gimple *last, *last_store;
9183 edge e, efalse;
9184 tree mask;
9185 basic_block store_bb, join_bb;
9186 gimple_stmt_iterator gsi_to;
9187 tree vdef, new_vdef;
9188 gphi *phi;
9189 tree vectype;
9190 tree zero;
9192 last = worklist.pop ();
9193 mask = gimple_call_arg (last, 2);
9194 bb = gimple_bb (last);
9195 /* Create then_bb and the if-then structure in the CFG; then_bb belongs to
9196 the same loop as if_bb.  It can be different from LOOP when a two-level
9197 loop nest is vectorized and the mask_store belongs to the inner
9198 one.  */
9199 e = split_block (bb, last);
9200 bb_loop = bb->loop_father;
9201 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
9202 join_bb = e->dest;
9203 store_bb = create_empty_bb (bb);
9204 add_bb_to_loop (store_bb, bb_loop);
9205 e->flags = EDGE_TRUE_VALUE;
9206 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
9207 /* STORE_BB is entered on the false edge, i.e. when the mask is not all zeros.  */
9208 efalse->probability = profile_probability::unlikely ();
9209 store_bb->count = efalse->count ();
9210 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
9211 if (dom_info_available_p (CDI_DOMINATORS))
9212 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
9213 if (dump_enabled_p ())
9214 dump_printf_loc (MSG_NOTE, vect_location,
9215 "Create new block %d to sink mask stores.",
9216 store_bb->index);
9217 /* Create a vector comparison with a boolean result.  */
9218 vectype = TREE_TYPE (mask);
9219 zero = build_zero_cst (vectype);
9220 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
9221 gsi = gsi_last_bb (bb);
9222 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
9223 /* Create new PHI node for the vdef of the last masked store:
9224 .MEM_2 = VDEF <.MEM_1>
9225 will be converted to
9226 .MEM_3 = VDEF <.MEM_1>
9227 and a new PHI node will be created in the join bb
9228 .MEM_2 = PHI <.MEM_1, .MEM_3>
9229 */
9230 vdef = gimple_vdef (last);
9231 new_vdef = make_ssa_name (gimple_vop (cfun), last);
9232 gimple_set_vdef (last, new_vdef);
9233 phi = create_phi_node (vdef, join_bb);
9234 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
9236 /* Put all masked stores with the same mask into STORE_BB if possible.  */
9237 while (true)
9239 gimple_stmt_iterator gsi_from;
9240 gimple *stmt1 = NULL;
9242 /* Move masked store to STORE_BB. */
9243 last_store = last;
9244 gsi = gsi_for_stmt (last);
9245 gsi_from = gsi;
9246 /* Shift GSI to the previous stmt for further traversal. */
9247 gsi_prev (&gsi);
9248 gsi_to = gsi_start_bb (store_bb);
9249 gsi_move_before (&gsi_from, &gsi_to);
9250 /* Set GSI_TO to the start of the now non-empty STORE_BB.  */
9251 gsi_to = gsi_start_bb (store_bb);
9252 if (dump_enabled_p ())
9254 dump_printf_loc (MSG_NOTE, vect_location,
9255 "Move stmt to created bb\n");
9256 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
9258 /* Move all stored value producers if possible. */
9259 while (!gsi_end_p (gsi))
9261 tree lhs;
9262 imm_use_iterator imm_iter;
9263 use_operand_p use_p;
9264 bool res;
9266 /* Skip debug statements. */
9267 if (is_gimple_debug (gsi_stmt (gsi)))
9269 gsi_prev (&gsi);
9270 continue;
9272 stmt1 = gsi_stmt (gsi);
9273 /* Do not consider statements that write to memory or have a
9274 volatile operand.  */
9275 if (gimple_vdef (stmt1)
9276 || gimple_has_volatile_ops (stmt1))
9277 break;
9278 gsi_from = gsi;
9279 gsi_prev (&gsi);
9280 lhs = gimple_get_lhs (stmt1);
9281 if (!lhs)
9282 break;
9284 /* LHS of vectorized stmt must be SSA_NAME. */
9285 if (TREE_CODE (lhs) != SSA_NAME)
9286 break;
9288 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
9290 /* Remove dead scalar statement. */
9291 if (has_zero_uses (lhs))
9293 gsi_remove (&gsi_from, true);
9294 continue;
9298 /* Check that LHS does not have uses outside of STORE_BB. */
9299 res = true;
9300 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
9302 gimple *use_stmt;
9303 use_stmt = USE_STMT (use_p);
9304 if (is_gimple_debug (use_stmt))
9305 continue;
9306 if (gimple_bb (use_stmt) != store_bb)
9308 res = false;
9309 break;
9312 if (!res)
9313 break;
9315 if (gimple_vuse (stmt1)
9316 && gimple_vuse (stmt1) != gimple_vuse (last_store))
9317 break;
9319 /* Can move STMT1 to STORE_BB. */
9320 if (dump_enabled_p ())
9322 dump_printf_loc (MSG_NOTE, vect_location,
9323 "Move stmt to created bb\n");
9324 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
9326 gsi_move_before (&gsi_from, &gsi_to);
9327 /* Shift GSI_TO for further insertion. */
9328 gsi_prev (&gsi_to);
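/* E.g. (using the hypothetical SSA names from the comment before this
   function): once MASK_STORE (vectp_p1.23_175, ...) has been moved into
   STORE_BB, the feeding statements vect__12.22_172 = vect__11.19_170 +
   vect_cst__171 and the MASK_LOAD defining vect__11.19_170 can be moved
   as well: they write no memory, their remaining non-debug uses are in
   STORE_BB, and any VUSE they have matches that of the moved store.  */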
9330 /* Put other masked stores with the same mask into STORE_BB.  */
9331 if (worklist.is_empty ()
9332 || gimple_call_arg (worklist.last (), 2) != mask
9333 || worklist.last () != stmt1)
9334 break;
9335 last = worklist.pop ();
9337 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);