gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53
  54 /* Loop Vectorization Pass.
  55
  56    This pass tries to vectorize loops.
  57
  58    For example, the vectorizer transforms the following simple loop:
  59
  60         short a[N]; short b[N]; short c[N]; int i;
  61
  62         for (i=0; i<N; i++){
  63           a[i] = b[i] + c[i];
  64         }
  65
  66    as if it was manually vectorized by rewriting the source code into:
  67
  68         typedef int __attribute__((mode(V8HI))) v8hi;
  69         short a[N];  short b[N]; short c[N];   int i;
  70         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  71         v8hi va, vb, vc;
  72
  73         for (i=0; i<N/8; i++){
  74           vb = pb[i];
  75           vc = pc[i];
  76           va = vb + vc;
  77           pa[i] = va;
  78         }
  79
  80         The main entry to this pass is vectorize_loops(), in which
  81    the vectorizer applies a set of analyses on a given set of loops,
  82    followed by the actual vectorization transformation for the loops that
  83    had successfully passed the analysis phase.
  84         Throughout this pass we make a distinction between two types of
  85    data: scalars (which are represented by SSA_NAMES), and memory references
  86    ("data-refs").  These two types of data require different handling both
  87    during analysis and transformation. The types of data-refs that the
  88    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  89    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  90    accesses are required to have a simple (consecutive) access pattern.
  91
  92    Analysis phase:
  93    ===============
  94         The driver for the analysis phase is vect_analyze_loop().
  95    It applies a set of analyses, some of which rely on the scalar evolution
  96    analyzer (scev) developed by Sebastian Pop.
  97
  98         During the analysis phase the vectorizer records some information
  99    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 100    loop, as well as general information about the loop as a whole, which is
 101    recorded in a "loop_vec_info" struct attached to each loop.
 102
 103    Transformation phase:
 104    =====================
 105         The loop transformation phase scans all the stmts in the loop, and
 106    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 107    the loop that needs to be vectorized.  It inserts the vector code sequence
 108    just before the scalar stmt S, and records a pointer to the vector code
 109    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 110    attached to S).  This pointer will be used for the vectorization of following
 111    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 112    otherwise, we rely on dead code elimination for removing it.
 113
 114         For example, say stmt S1 was vectorized into stmt VS1:
 115
 116    VS1: vb = px[i];
 117    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 118    S2:  a = b;
 119
 120    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 121    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 122    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 123    resulting sequence would be:
 124
 125    VS1: vb = px[i];
 126    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 127    VS2: va = vb;
 128    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 129
 130         Operands that are not SSA_NAMEs, are data-refs that appear in
 131    load/store operations (like 'x[i]' in S1), and are handled differently.
 132
 133    Target modeling:
 134    =================
 135         Currently the only target specific information that is used is the
 136    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 137    Targets that can support different sizes of vectors, for now will need
 138    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 139    flexibility will be added in the future.
 140
 141         Since we only vectorize operations which vector form can be
 142    expressed using existing tree codes, to verify that an operation is
 143    supported, the vectorizer checks the relevant optab at the relevant
 144    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 145    the value found is CODE_FOR_nothing, then there's no target support, and
 146    we can't vectorize the stmt.
 147
 148    For additional information on this project see:
 149    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 150 */
 151
 152 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 153
 154 /* Function vect_determine_vectorization_factor
 155
 156    Determine the vectorization factor (VF).  VF is the number of data elements
 157    that are operated upon in parallel in a single iteration of the vectorized
 158    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 159    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 160    elements can fit in a single vector register.
 161
 162    We currently support vectorization of loops in which all types operated upon
 163    are of the same size.  Therefore this function currently sets VF according to
 164    the size of the types operated upon, and fails if there are multiple sizes
 165    in the loop.
 166
 167    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 168    original loop:
 169         for (i=0; i<N; i++){
 170           a[i] = b[i] + c[i];
 171         }
 172
 173    vectorized loop:
 174         for (i=0; i<N; i+=VF){
 175           a[i:VF] = b[i:VF] + c[i:VF];
 176         }
 177 */
 178
 179 static bool
 180 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 181 {
 182   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 183   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 184   unsigned nbbs = loop->num_nodes;
 185   unsigned int vectorization_factor = 0;
 186   tree scalar_type = NULL_TREE;
 187   gphi *phi;
 188   tree vectype;
 189   unsigned int nunits;
 190   stmt_vec_info stmt_info;
 191   unsigned i;
 192   HOST_WIDE_INT dummy;
 193   gimple *stmt, *pattern_stmt = NULL;
 194   gimple_seq pattern_def_seq = NULL;
 195   gimple_stmt_iterator pattern_def_si = gsi_none ();
 196   bool analyze_pattern_stmt = false;
 197   bool bool_result;
 198   auto_vec<stmt_vec_info> mask_producers;
 199
 200   if (dump_enabled_p ())
 201     dump_printf_loc (MSG_NOTE, vect_location,
 202                      "=== vect_determine_vectorization_factor ===\n");
 203
 204   for (i = 0; i < nbbs; i++)
 205     {
 206       basic_block bb = bbs[i];
 207
 208       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 209            gsi_next (&si))
 210         {
 211           phi = si.phi ();
 212           stmt_info = vinfo_for_stmt (phi);
 213           if (dump_enabled_p ())
 214             {
 215               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 216               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 217             }
 218
 219           gcc_assert (stmt_info);
 220
 221           if (STMT_VINFO_RELEVANT_P (stmt_info)
 222               || STMT_VINFO_LIVE_P (stmt_info))
 223             {
 224               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 225               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 226
 227               if (dump_enabled_p ())
 228                 {
 229                   dump_printf_loc (MSG_NOTE, vect_location,
 230                                    "get vectype for scalar type:  ");
 231                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 232                   dump_printf (MSG_NOTE, "\n");
 233                 }
 234
 235               vectype = get_vectype_for_scalar_type (scalar_type);
 236               if (!vectype)
 237                 {
 238                   if (dump_enabled_p ())
 239                     {
 240                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 241                                        "not vectorized: unsupported "
 242                                        "data-type ");
 243                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 244                                          scalar_type);
 245                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 246                     }
 247                   return false;
 248                 }
 249               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 250
 251               if (dump_enabled_p ())
 252                 {
 253                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 254                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 255                   dump_printf (MSG_NOTE, "\n");
 256                 }
 257
 258               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 259               if (dump_enabled_p ())
 260                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 261                                  nunits);
 262
 263               if (!vectorization_factor
 264                   || (nunits > vectorization_factor))
 265                 vectorization_factor = nunits;
 266             }
 267         }
 268
 269       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 270            !gsi_end_p (si) || analyze_pattern_stmt;)
 271         {
 272           tree vf_vectype;
 273
 274           if (analyze_pattern_stmt)
 275             stmt = pattern_stmt;
 276           else
 277             stmt = gsi_stmt (si);
 278
 279           stmt_info = vinfo_for_stmt (stmt);
 280
 281           if (dump_enabled_p ())
 282             {
 283               dump_printf_loc (MSG_NOTE, vect_location,
 284                                "==> examining statement: ");
 285               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 286             }
 287
 288           gcc_assert (stmt_info);
 289
 290           /* Skip stmts which do not need to be vectorized.  */
 291           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 292                && !STMT_VINFO_LIVE_P (stmt_info))
 293               || gimple_clobber_p (stmt))
 294             {
 295               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 296                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 297                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 298                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 299                 {
 300                   stmt = pattern_stmt;
 301                   stmt_info = vinfo_for_stmt (pattern_stmt);
 302                   if (dump_enabled_p ())
 303                     {
 304                       dump_printf_loc (MSG_NOTE, vect_location,
 305                                        "==> examining pattern statement: ");
 306                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 307                     }
 308                 }
 309               else
 310                 {
 311                   if (dump_enabled_p ())
 312                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 313                   gsi_next (&si);
 314                   continue;
 315                 }
 316             }
 317           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 318                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 319                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 320                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 321             analyze_pattern_stmt = true;
 322
 323           /* If a pattern statement has def stmts, analyze them too.  */
 324           if (is_pattern_stmt_p (stmt_info))
 325             {
 326               if (pattern_def_seq == NULL)
 327                 {
 328                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 329                   pattern_def_si = gsi_start (pattern_def_seq);
 330                 }
 331               else if (!gsi_end_p (pattern_def_si))
 332                 gsi_next (&pattern_def_si);
 333               if (pattern_def_seq != NULL)
 334                 {
 335                   gimple *pattern_def_stmt = NULL;
 336                   stmt_vec_info pattern_def_stmt_info = NULL;
 337
 338                   while (!gsi_end_p (pattern_def_si))
 339                     {
 340                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 341                       pattern_def_stmt_info
 342                         = vinfo_for_stmt (pattern_def_stmt);
 343                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 344                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 345                         break;
 346                       gsi_next (&pattern_def_si);
 347                     }
 348
 349                   if (!gsi_end_p (pattern_def_si))
 350                     {
 351                       if (dump_enabled_p ())
 352                         {
 353                           dump_printf_loc (MSG_NOTE, vect_location,
 354                                            "==> examining pattern def stmt: ");
 355                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 356                                             pattern_def_stmt, 0);
 357                         }
 358
 359                       stmt = pattern_def_stmt;
 360                       stmt_info = pattern_def_stmt_info;
 361                     }
 362                   else
 363                     {
 364                       pattern_def_si = gsi_none ();
 365                       analyze_pattern_stmt = false;
 366                     }
 367                 }
 368               else
 369                 analyze_pattern_stmt = false;
 370             }
 371
 372           if (gimple_get_lhs (stmt) == NULL_TREE
 373               /* MASK_STORE has no lhs, but is ok.  */
 374               && (!is_gimple_call (stmt)
 375                   || !gimple_call_internal_p (stmt)
 376                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 377             {
 378               if (is_gimple_call (stmt))
 379                 {
 380                   /* Ignore calls with no lhs.  These must be calls to
 381                      #pragma omp simd functions, and what vectorization factor
 382                      it really needs can't be determined until
 383                      vectorizable_simd_clone_call.  */
 384                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 385                     {
 386                       pattern_def_seq = NULL;
 387                       gsi_next (&si);
 388                     }
 389                   continue;
 390                 }
 391               if (dump_enabled_p ())
 392                 {
 393                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 394                                    "not vectorized: irregular stmt.");
 395                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 396                                     0);
 397                 }
 398               return false;
 399             }
 400
 401           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 402             {
 403               if (dump_enabled_p ())
 404                 {
 405                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 406                                    "not vectorized: vector stmt in loop:");
 407                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 408                 }
 409               return false;
 410             }
 411
 412           bool_result = false;
 413
 414           if (STMT_VINFO_VECTYPE (stmt_info))
 415             {
 416               /* The only case when a vectype had been already set is for stmts
 417                  that contain a dataref, or for "pattern-stmts" (stmts
 418                  generated by the vectorizer to represent/replace a certain
 419                  idiom).  */
 420               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 421                           || is_pattern_stmt_p (stmt_info)
 422                           || !gsi_end_p (pattern_def_si));
 423               vectype = STMT_VINFO_VECTYPE (stmt_info);
 424             }
 425           else
 426             {
 427               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 428               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 429                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 430               else
 431                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 432
 433               /* Bool ops don't participate in vectorization factor
 434                  computation.  For comparison use compared types to
 435                  compute a factor.  */
 436               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
 437                   && is_gimple_assign (stmt)
 438                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 439                 {
 440                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 441                       || STMT_VINFO_LIVE_P (stmt_info))
 442                     mask_producers.safe_push (stmt_info);
 443                   bool_result = true;
 444
 445                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 446                       == tcc_comparison
 447                       && !VECT_SCALAR_BOOLEAN_TYPE_P
 448                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 449                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 450                   else
 451                     {
 452                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 453                         {
 454                           pattern_def_seq = NULL;
 455                           gsi_next (&si);
 456                         }
 457                       continue;
 458                     }
 459                 }
 460
 461               if (dump_enabled_p ())
 462                 {
 463                   dump_printf_loc (MSG_NOTE, vect_location,
 464                                    "get vectype for scalar type:  ");
 465                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 466                   dump_printf (MSG_NOTE, "\n");
 467                 }
 468               vectype = get_vectype_for_scalar_type (scalar_type);
 469               if (!vectype)
 470                 {
 471                   if (dump_enabled_p ())
 472                     {
 473                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 474                                        "not vectorized: unsupported "
 475                                        "data-type ");
 476                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 477                                          scalar_type);
 478                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 479                     }
 480                   return false;
 481                 }
 482
 483               if (!bool_result)
 484                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 485
 486               if (dump_enabled_p ())
 487                 {
 488                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 489                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 490                   dump_printf (MSG_NOTE, "\n");
 491                 }
 492             }
 493
 494           /* Don't try to compute VF out scalar types if we stmt
 495              produces boolean vector.  Use result vectype instead.  */
 496           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 497             vf_vectype = vectype;
 498           else
 499             {
 500               /* The vectorization factor is according to the smallest
 501                  scalar type (or the largest vector size, but we only
 502                  support one vector size per loop).  */
 503               if (!bool_result)
 504                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 505                                                              &dummy);
 506               if (dump_enabled_p ())
 507                 {
 508                   dump_printf_loc (MSG_NOTE, vect_location,
 509                                    "get vectype for scalar type:  ");
 510                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 511                   dump_printf (MSG_NOTE, "\n");
 512                 }
 513               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 514             }
 515           if (!vf_vectype)
 516             {
 517               if (dump_enabled_p ())
 518                 {
 519                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 520                                    "not vectorized: unsupported data-type ");
 521                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 522                                      scalar_type);
 523                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 524                 }
 525               return false;
 526             }
 527
 528           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 529                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 530             {
 531               if (dump_enabled_p ())
 532                 {
 533                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 534                                    "not vectorized: different sized vector "
 535                                    "types in statement, ");
 536                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 537                                      vectype);
 538                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 539                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 540                                      vf_vectype);
 541                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 542                 }
 543               return false;
 544             }
 545
 546           if (dump_enabled_p ())
 547             {
 548               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 549               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 550               dump_printf (MSG_NOTE, "\n");
 551             }
 552
 553           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 554           if (dump_enabled_p ())
 555             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 556           if (!vectorization_factor
 557               || (nunits > vectorization_factor))
 558             vectorization_factor = nunits;
 559
 560           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 561             {
 562               pattern_def_seq = NULL;
 563               gsi_next (&si);
 564             }
 565         }
 566     }
 567
 568   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 569   if (dump_enabled_p ())
 570     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 571                      vectorization_factor);
 572   if (vectorization_factor <= 1)
 573     {
 574       if (dump_enabled_p ())
 575         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 576                          "not vectorized: unsupported data-type\n");
 577       return false;
 578     }
 579   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 580
 581   for (i = 0; i < mask_producers.length (); i++)
 582     {
 583       tree mask_type = NULL;
 584
 585       stmt = STMT_VINFO_STMT (mask_producers[i]);
 586
 587       if (is_gimple_assign (stmt)
 588           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 589           && !VECT_SCALAR_BOOLEAN_TYPE_P
 590                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 591         {
 592           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 593           mask_type = get_mask_type_for_scalar_type (scalar_type);
 594
 595           if (!mask_type)
 596             {
 597               if (dump_enabled_p ())
 598                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 599                                  "not vectorized: unsupported mask\n");
 600               return false;
 601             }
 602         }
 603       else
 604         {
 605           tree rhs;
 606           ssa_op_iter iter;
 607           gimple *def_stmt;
 608           enum vect_def_type dt;
 609
 610           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 611             {
 612               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 613                                        &def_stmt, &dt, &vectype))
 614                 {
 615                   if (dump_enabled_p ())
 616                     {
 617                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 618                                        "not vectorized: can't compute mask type "
 619                                        "for statement, ");
 620                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 621                                         0);
 622                     }
 623                   return false;
 624                 }
 625
 626               /* No vectype probably means external definition.
 627                  Allow it in case there is another operand which
 628                  allows to determine mask type.  */
 629               if (!vectype)
 630                 continue;
 631
 632               if (!mask_type)
 633                 mask_type = vectype;
 634               else if (TYPE_VECTOR_SUBPARTS (mask_type)
 635                        != TYPE_VECTOR_SUBPARTS (vectype))
 636                 {
 637                   if (dump_enabled_p ())
 638                     {
 639                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 640                                        "not vectorized: different sized masks "
 641                                        "types in statement, ");
 642                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 643                                          mask_type);
 644                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 645                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 646                                          vectype);
 647                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 648                     }
 649                   return false;
 650                 }
 651               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 652                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 653                 {
 654                   if (dump_enabled_p ())
 655                     {
 656                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 657                                        "not vectorized: mixed mask and "
 658                                        "nonmask vector types in statement, ");
 659                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 660                                          mask_type);
 661                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 662                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 663                                          vectype);
 664                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 665                     }
 666                   return false;
 667                 }
 668             }
 669
 670           /* We may compare boolean value loaded as vector of integers.
 671              Fix mask_type in such case.  */
 672           if (mask_type
 673               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 674               && gimple_code (stmt) == GIMPLE_ASSIGN
 675               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 676             mask_type = build_same_sized_truth_vector_type (mask_type);
 677         }
 678
 679       /* No mask_type should mean loop invariant predicate.
 680          This is probably a subject for optimization in
 681          if-conversion.  */
 682       if (!mask_type)
 683         {
 684           if (dump_enabled_p ())
 685             {
 686               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 687                                "not vectorized: can't compute mask type "
 688                                "for statement, ");
 689               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 690                                 0);
 691             }
 692           return false;
 693         }
 694
 695       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 696     }
 697
 698   return true;
 699 }
 700
 701
 702 /* Function vect_is_simple_iv_evolution.
 703
 704    FORNOW: A simple evolution of an induction variables in the loop is
 705    considered a polynomial evolution.  */
 706
 707 static bool
 708 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 709                              tree * step)
 710 {
 711   tree init_expr;
 712   tree step_expr;
 713   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 714   basic_block bb;
 715
 716   /* When there is no evolution in this loop, the evolution function
 717      is not "simple".  */
 718   if (evolution_part == NULL_TREE)
 719     return false;
 720
 721   /* When the evolution is a polynomial of degree >= 2
 722      the evolution function is not "simple".  */
 723   if (tree_is_chrec (evolution_part))
 724     return false;
 725
 726   step_expr = evolution_part;
 727   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 728
 729   if (dump_enabled_p ())
 730     {
 731       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 732       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 733       dump_printf (MSG_NOTE, ",  init: ");
 734       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 735       dump_printf (MSG_NOTE, "\n");
 736     }
 737
 738   *init = init_expr;
 739   *step = step_expr;
 740
 741   if (TREE_CODE (step_expr) != INTEGER_CST
 742       && (TREE_CODE (step_expr) != SSA_NAME
 743           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 744               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 745           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 746               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 747                   || !flag_associative_math)))
 748       && (TREE_CODE (step_expr) != REAL_CST
 749           || !flag_associative_math))
 750     {
 751       if (dump_enabled_p ())
 752         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 753                          "step unknown.\n");
 754       return false;
 755     }
 756
 757   return true;
 758 }
 759
 760 /* Function vect_analyze_scalar_cycles_1.
 761
 762    Examine the cross iteration def-use cycles of scalar variables
 763    in LOOP.  LOOP_VINFO represents the loop that is now being
 764    considered for vectorization (can be LOOP, or an outer-loop
 765    enclosing LOOP).  */
 766
 767 static void
 768 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 769 {
 770   basic_block bb = loop->header;
 771   tree init, step;
 772   auto_vec<gimple *, 64> worklist;
 773   gphi_iterator gsi;
 774   bool double_reduc;
 775
 776   if (dump_enabled_p ())
 777     dump_printf_loc (MSG_NOTE, vect_location,
 778                      "=== vect_analyze_scalar_cycles ===\n");
 779
 780   /* First - identify all inductions.  Reduction detection assumes that all the
 781      inductions have been identified, therefore, this order must not be
 782      changed.  */
 783   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 784     {
 785       gphi *phi = gsi.phi ();
 786       tree access_fn = NULL;
 787       tree def = PHI_RESULT (phi);
 788       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 789
 790       if (dump_enabled_p ())
 791         {
 792           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 793           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 794         }
 795
 796       /* Skip virtual phi's.  The data dependences that are associated with
 797          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 798       if (virtual_operand_p (def))
 799         continue;
 800
 801       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 802
 803       /* Analyze the evolution function.  */
 804       access_fn = analyze_scalar_evolution (loop, def);
 805       if (access_fn)
 806         {
 807           STRIP_NOPS (access_fn);
 808           if (dump_enabled_p ())
 809             {
 810               dump_printf_loc (MSG_NOTE, vect_location,
 811                                "Access function of PHI: ");
 812               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 813               dump_printf (MSG_NOTE, "\n");
 814             }
 815           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 816             = initial_condition_in_loop_num (access_fn, loop->num);
 817           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 818             = evolution_part_in_loop_num (access_fn, loop->num);
 819         }
 820
 821       if (!access_fn
 822           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 823           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 824               && TREE_CODE (step) != INTEGER_CST))
 825         {
 826           worklist.safe_push (phi);
 827           continue;
 828         }
 829
 830       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 831                   != NULL_TREE);
 832       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 833
 834       if (dump_enabled_p ())
 835         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 836       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 837     }
 838
 839
 840   /* Second - identify all reductions and nested cycles.  */
 841   while (worklist.length () > 0)
 842     {
 843       gimple *phi = worklist.pop ();
 844       tree def = PHI_RESULT (phi);
 845       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 846       gimple *reduc_stmt;
 847
 848       if (dump_enabled_p ())
 849         {
 850           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 851           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 852         }
 853
 854       gcc_assert (!virtual_operand_p (def)
 855                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 856
 857       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 858                                                 &double_reduc, false);
 859       if (reduc_stmt)
 860         {
 861           if (double_reduc)
 862             {
 863               if (dump_enabled_p ())
 864                 dump_printf_loc (MSG_NOTE, vect_location,
 865                                  "Detected double reduction.\n");
 866
 867               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 868               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 869                                                     vect_double_reduction_def;
 870             }
 871           else
 872             {
 873               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 874                 {
 875                   if (dump_enabled_p ())
 876                     dump_printf_loc (MSG_NOTE, vect_location,
 877                                      "Detected vectorizable nested cycle.\n");
 878
 879                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 880                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 881                                                              vect_nested_cycle;
 882                 }
 883               else
 884                 {
 885                   if (dump_enabled_p ())
 886                     dump_printf_loc (MSG_NOTE, vect_location,
 887                                      "Detected reduction.\n");
 888
 889                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 890                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 891                                                            vect_reduction_def;
 892                   /* Store the reduction cycles for possible vectorization in
 893                      loop-aware SLP if it was not detected as reduction
 894                      chain.  */
 895                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 896                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 897                 }
 898             }
 899         }
 900       else
 901         if (dump_enabled_p ())
 902           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 903                            "Unknown def-use cycle pattern.\n");
 904     }
 905 }
 906
 907
 908 /* Function vect_analyze_scalar_cycles.
 909
 910    Examine the cross iteration def-use cycles of scalar variables, by
 911    analyzing the loop-header PHIs of scalar variables.  Classify each
 912    cycle as one of the following: invariant, induction, reduction, unknown.
 913    We do that for the loop represented by LOOP_VINFO, and also to its
 914    inner-loop, if exists.
 915    Examples for scalar cycles:
 916
 917    Example1: reduction:
 918
 919               loop1:
 920               for (i=0; i<N; i++)
 921                  sum += a[i];
 922
 923    Example2: induction:
 924
 925               loop2:
 926               for (i=0; i<N; i++)
 927                  a[i] = i;  */
 928
 929 static void
 930 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 931 {
 932   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 933
 934   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 935
 936   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 937      Reductions in such inner-loop therefore have different properties than
 938      the reductions in the nest that gets vectorized:
 939      1. When vectorized, they are executed in the same order as in the original
 940         scalar loop, so we can't change the order of computation when
 941         vectorizing them.
 942      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 943         current checks are too strict.  */
 944
 945   if (loop->inner)
 946     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 947 }
 948
 949 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 950
 951 static void
 952 vect_fixup_reduc_chain (gimple *stmt)
 953 {
 954   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 955   gimple *stmtp;
 956   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 957               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 958   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 959   do
 960     {
 961       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 962       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 963       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 964       if (stmt)
 965         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 966           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 967     }
 968   while (stmt);
 969   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 970 }
 971
 972 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 973
 974 static void
 975 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 976 {
 977   gimple *first;
 978   unsigned i;
 979
 980   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 981     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 982       {
 983         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 984         while (next)
 985           {
 986             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 987               break;
 988             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 989           }
 990         /* If not all stmt in the chain are patterns try to handle
 991            the chain without patterns.  */
 992         if (! next)
 993           {
 994             vect_fixup_reduc_chain (first);
 995             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 996               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 997           }
 998       }
 999 }
1000
1001 /* Function vect_get_loop_niters.
1002
1003    Determine how many iterations the loop is executed and place it
1004    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1005    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1006    niter information holds in ASSUMPTIONS.
1007
1008    Return the loop exit condition.  */
1009
1010
1011 static gcond *
1012 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1013                       tree *number_of_iterations, tree *number_of_iterationsm1)
1014 {
1015   edge exit = single_exit (loop);
1016   struct tree_niter_desc niter_desc;
1017   tree niter_assumptions, niter, may_be_zero;
1018   gcond *cond = get_loop_exit_condition (loop);
1019
1020   *assumptions = boolean_true_node;
1021   *number_of_iterationsm1 = chrec_dont_know;
1022   *number_of_iterations = chrec_dont_know;
1023   if (dump_enabled_p ())
1024     dump_printf_loc (MSG_NOTE, vect_location,
1025                      "=== get_loop_niters ===\n");
1026
1027   if (!exit)
1028     return cond;
1029
1030   niter = chrec_dont_know;
1031   may_be_zero = NULL_TREE;
1032   niter_assumptions = boolean_true_node;
1033   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1034       || chrec_contains_undetermined (niter_desc.niter))
1035     return cond;
1036
1037   niter_assumptions = niter_desc.assumptions;
1038   may_be_zero = niter_desc.may_be_zero;
1039   niter = niter_desc.niter;
1040
1041   if (may_be_zero && integer_zerop (may_be_zero))
1042     may_be_zero = NULL_TREE;
1043
1044   if (may_be_zero)
1045     {
1046       if (COMPARISON_CLASS_P (may_be_zero))
1047         {
1048           /* Try to combine may_be_zero with assumptions, this can simplify
1049              computation of niter expression.  */
1050           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1051             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1052                                              niter_assumptions,
1053                                              fold_build1 (TRUTH_NOT_EXPR,
1054                                                           boolean_type_node,
1055                                                           may_be_zero));
1056           else
1057             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1058                                  build_int_cst (TREE_TYPE (niter), 0), niter);
1059
1060           may_be_zero = NULL_TREE;
1061         }
1062       else if (integer_nonzerop (may_be_zero))
1063         {
1064           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1065           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1066           return cond;
1067         }
1068       else
1069         return cond;
1070     }
1071
1072   *assumptions = niter_assumptions;
1073   *number_of_iterationsm1 = niter;
1074
1075   /* We want the number of loop header executions which is the number
1076      of latch executions plus one.
1077      ???  For UINT_MAX latch executions this number overflows to zero
1078      for loops like do { n++; } while (n != 0);  */
1079   if (niter && !chrec_contains_undetermined (niter))
1080     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1081                           build_int_cst (TREE_TYPE (niter), 1));
1082   *number_of_iterations = niter;
1083
1084   return cond;
1085 }
1086
1087 /* Function bb_in_loop_p
1088
1089    Used as predicate for dfs order traversal of the loop bbs.  */
1090
1091 static bool
1092 bb_in_loop_p (const_basic_block bb, const void *data)
1093 {
1094   const struct loop *const loop = (const struct loop *)data;
1095   if (flow_bb_inside_loop_p (loop, bb))
1096     return true;
1097   return false;
1098 }
1099
1100
1101 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1102    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1103
1104 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1105   : vec_info (vec_info::loop, init_cost (loop_in)),
1106     loop (loop_in),
1107     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1108     num_itersm1 (NULL_TREE),
1109     num_iters (NULL_TREE),
1110     num_iters_unchanged (NULL_TREE),
1111     num_iters_assumptions (NULL_TREE),
1112     th (0),
1113     vectorization_factor (0),
1114     unaligned_dr (NULL),
1115     peeling_for_alignment (0),
1116     ptr_mask (0),
1117     slp_unrolling_factor (1),
1118     single_scalar_iteration_cost (0),
1119     vectorizable (false),
1120     peeling_for_gaps (false),
1121     peeling_for_niter (false),
1122     operands_swapped (false),
1123     no_data_dependencies (false),
1124     has_mask_store (false),
1125     scalar_loop (NULL),
1126     orig_loop_info (NULL)
1127 {
1128   /* Create/Update stmt_info for all stmts in the loop.  */
1129   basic_block *body = get_loop_body (loop);
1130   for (unsigned int i = 0; i < loop->num_nodes; i++)
1131     {
1132       basic_block bb = body[i];
1133       gimple_stmt_iterator si;
1134
1135       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1136         {
1137           gimple *phi = gsi_stmt (si);
1138           gimple_set_uid (phi, 0);
1139           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1140         }
1141
1142       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1143         {
1144           gimple *stmt = gsi_stmt (si);
1145           gimple_set_uid (stmt, 0);
1146           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1147         }
1148     }
1149   free (body);
1150
1151   /* CHECKME: We want to visit all BBs before their successors (except for
1152      latch blocks, for which this assertion wouldn't hold).  In the simple
1153      case of the loop forms we allow, a dfs order of the BBs would the same
1154      as reversed postorder traversal, so we are safe.  */
1155
1156   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1157                                           bbs, loop->num_nodes, loop);
1158   gcc_assert (nbbs == loop->num_nodes);
1159 }
1160
1161
1162 /* Free all memory used by the _loop_vec_info, as well as all the
1163    stmt_vec_info structs of all the stmts in the loop.  */
1164
1165 _loop_vec_info::~_loop_vec_info ()
1166 {
1167   int nbbs;
1168   gimple_stmt_iterator si;
1169   int j;
1170
1171   nbbs = loop->num_nodes;
1172   for (j = 0; j < nbbs; j++)
1173     {
1174       basic_block bb = bbs[j];
1175       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1176         free_stmt_vec_info (gsi_stmt (si));
1177
1178       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1179         {
1180           gimple *stmt = gsi_stmt (si);
1181
1182           /* We may have broken canonical form by moving a constant
1183              into RHS1 of a commutative op.  Fix such occurrences.  */
1184           if (operands_swapped && is_gimple_assign (stmt))
1185             {
1186               enum tree_code code = gimple_assign_rhs_code (stmt);
1187
1188               if ((code == PLUS_EXPR
1189                    || code == POINTER_PLUS_EXPR
1190                    || code == MULT_EXPR)
1191                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1192                 swap_ssa_operands (stmt,
1193                                    gimple_assign_rhs1_ptr (stmt),
1194                                    gimple_assign_rhs2_ptr (stmt));
1195               else if (code == COND_EXPR
1196                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1197                 {
1198                   tree cond_expr = gimple_assign_rhs1 (stmt);
1199                   enum tree_code cond_code = TREE_CODE (cond_expr);
1200
1201                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1202                     {
1203                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1204                                                                   0));
1205                       cond_code = invert_tree_comparison (cond_code,
1206                                                           honor_nans);
1207                       if (cond_code != ERROR_MARK)
1208                         {
1209                           TREE_SET_CODE (cond_expr, cond_code);
1210                           swap_ssa_operands (stmt,
1211                                              gimple_assign_rhs2_ptr (stmt),
1212                                              gimple_assign_rhs3_ptr (stmt));
1213                         }
1214                     }
1215                 }
1216             }
1217
1218           /* Free stmt_vec_info.  */
1219           free_stmt_vec_info (stmt);
1220           gsi_next (&si);
1221         }
1222     }
1223
1224   free (bbs);
1225
1226   loop->aux = NULL;
1227 }
1228
1229
1230 /* Calculate the cost of one scalar iteration of the loop.  */
1231 static void
1232 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1233 {
1234   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1235   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1236   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1237   int innerloop_iters, i;
1238
1239   /* Count statements in scalar loop.  Using this as scalar cost for a single
1240      iteration for now.
1241
1242      TODO: Add outer loop support.
1243
1244      TODO: Consider assigning different costs to different scalar
1245      statements.  */
1246
1247   /* FORNOW.  */
1248   innerloop_iters = 1;
1249   if (loop->inner)
1250     innerloop_iters = 50; /* FIXME */
1251
1252   for (i = 0; i < nbbs; i++)
1253     {
1254       gimple_stmt_iterator si;
1255       basic_block bb = bbs[i];
1256
1257       if (bb->loop_father == loop->inner)
1258         factor = innerloop_iters;
1259       else
1260         factor = 1;
1261
1262       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1263         {
1264           gimple *stmt = gsi_stmt (si);
1265           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1266
1267           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1268             continue;
1269
1270           /* Skip stmts that are not vectorized inside the loop.  */
1271           if (stmt_info
1272               && !STMT_VINFO_RELEVANT_P (stmt_info)
1273               && (!STMT_VINFO_LIVE_P (stmt_info)
1274                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1275               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1276             continue;
1277
1278           vect_cost_for_stmt kind;
1279           if (STMT_VINFO_DATA_REF (stmt_info))
1280             {
1281               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1282                kind = scalar_load;
1283              else
1284                kind = scalar_store;
1285             }
1286           else
1287             kind = scalar_stmt;
1288
1289           scalar_single_iter_cost
1290             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1291                                  factor, kind, stmt_info, 0, vect_prologue);
1292         }
1293     }
1294   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1295     = scalar_single_iter_cost;
1296 }
1297
1298
1299 /* Function vect_analyze_loop_form_1.
1300
1301    Verify that certain CFG restrictions hold, including:
1302    - the loop has a pre-header
1303    - the loop has a single entry and exit
1304    - the loop exit condition is simple enough
1305    - the number of iterations can be analyzed, i.e, a countable loop.  The
1306      niter could be analyzed under some assumptions.  */
1307
1308 bool
1309 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1310                           tree *assumptions, tree *number_of_iterationsm1,
1311                           tree *number_of_iterations, gcond **inner_loop_cond)
1312 {
1313   if (dump_enabled_p ())
1314     dump_printf_loc (MSG_NOTE, vect_location,
1315                      "=== vect_analyze_loop_form ===\n");
1316
1317   /* Different restrictions apply when we are considering an inner-most loop,
1318      vs. an outer (nested) loop.
1319      (FORNOW. May want to relax some of these restrictions in the future).  */
1320
1321   if (!loop->inner)
1322     {
1323       /* Inner-most loop.  We currently require that the number of BBs is
1324          exactly 2 (the header and latch).  Vectorizable inner-most loops
1325          look like this:
1326
1327                         (pre-header)
1328                            |
1329                           header <--------+
1330                            | |            |
1331                            | +--> latch --+
1332                            |
1333                         (exit-bb)  */
1334
1335       if (loop->num_nodes != 2)
1336         {
1337           if (dump_enabled_p ())
1338             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1339                              "not vectorized: control flow in loop.\n");
1340           return false;
1341         }
1342
1343       if (empty_block_p (loop->header))
1344         {
1345           if (dump_enabled_p ())
1346             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1347                              "not vectorized: empty loop.\n");
1348           return false;
1349         }
1350     }
1351   else
1352     {
1353       struct loop *innerloop = loop->inner;
1354       edge entryedge;
1355
1356       /* Nested loop. We currently require that the loop is doubly-nested,
1357          contains a single inner loop, and the number of BBs is exactly 5.
1358          Vectorizable outer-loops look like this:
1359
1360                         (pre-header)
1361                            |
1362                           header <---+
1363                            |         |
1364                           inner-loop |
1365                            |         |
1366                           tail ------+
1367                            |
1368                         (exit-bb)
1369
1370          The inner-loop has the properties expected of inner-most loops
1371          as described above.  */
1372
1373       if ((loop->inner)->inner || (loop->inner)->next)
1374         {
1375           if (dump_enabled_p ())
1376             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1377                              "not vectorized: multiple nested loops.\n");
1378           return false;
1379         }
1380
1381       if (loop->num_nodes != 5)
1382         {
1383           if (dump_enabled_p ())
1384             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1385                              "not vectorized: control flow in loop.\n");
1386           return false;
1387         }
1388
1389       entryedge = loop_preheader_edge (innerloop);
1390       if (entryedge->src != loop->header
1391           || !single_exit (innerloop)
1392           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1393         {
1394           if (dump_enabled_p ())
1395             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1396                              "not vectorized: unsupported outerloop form.\n");
1397           return false;
1398         }
1399
1400       /* Analyze the inner-loop.  */
1401       tree inner_niterm1, inner_niter, inner_assumptions;
1402       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1403                                       &inner_assumptions, &inner_niterm1,
1404                                       &inner_niter, NULL)
1405           /* Don't support analyzing niter under assumptions for inner
1406              loop.  */
1407           || !integer_onep (inner_assumptions))
1408         {
1409           if (dump_enabled_p ())
1410             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1411                              "not vectorized: Bad inner loop.\n");
1412           return false;
1413         }
1414
1415       if (!expr_invariant_in_loop_p (loop, inner_niter))
1416         {
1417           if (dump_enabled_p ())
1418             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1419                              "not vectorized: inner-loop count not"
1420                              " invariant.\n");
1421           return false;
1422         }
1423
1424       if (dump_enabled_p ())
1425         dump_printf_loc (MSG_NOTE, vect_location,
1426                          "Considering outer-loop vectorization.\n");
1427     }
1428
1429   if (!single_exit (loop)
1430       || EDGE_COUNT (loop->header->preds) != 2)
1431     {
1432       if (dump_enabled_p ())
1433         {
1434           if (!single_exit (loop))
1435             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1436                              "not vectorized: multiple exits.\n");
1437           else if (EDGE_COUNT (loop->header->preds) != 2)
1438             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1439                              "not vectorized: too many incoming edges.\n");
1440         }
1441       return false;
1442     }
1443
1444   /* We assume that the loop exit condition is at the end of the loop. i.e,
1445      that the loop is represented as a do-while (with a proper if-guard
1446      before the loop if needed), where the loop header contains all the
1447      executable statements, and the latch is empty.  */
1448   if (!empty_block_p (loop->latch)
1449       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1450     {
1451       if (dump_enabled_p ())
1452         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1453                          "not vectorized: latch block not empty.\n");
1454       return false;
1455     }
1456
1457   /* Make sure the exit is not abnormal.  */
1458   edge e = single_exit (loop);
1459   if (e->flags & EDGE_ABNORMAL)
1460     {
1461       if (dump_enabled_p ())
1462         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1463                          "not vectorized: abnormal loop exit edge.\n");
1464       return false;
1465     }
1466
1467   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1468                                      number_of_iterationsm1);
1469   if (!*loop_cond)
1470     {
1471       if (dump_enabled_p ())
1472         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1473                          "not vectorized: complicated exit condition.\n");
1474       return false;
1475     }
1476
1477   if (integer_zerop (*assumptions)
1478       || !*number_of_iterations
1479       || chrec_contains_undetermined (*number_of_iterations))
1480     {
1481       if (dump_enabled_p ())
1482         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1483                          "not vectorized: number of iterations cannot be "
1484                          "computed.\n");
1485       return false;
1486     }
1487
1488   if (integer_zerop (*number_of_iterations))
1489     {
1490       if (dump_enabled_p ())
1491         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1492                          "not vectorized: number of iterations = 0.\n");
1493       return false;
1494     }
1495
1496   return true;
1497 }
1498
1499 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1500
1501 loop_vec_info
1502 vect_analyze_loop_form (struct loop *loop)
1503 {
1504   tree assumptions, number_of_iterations, number_of_iterationsm1;
1505   gcond *loop_cond, *inner_loop_cond = NULL;
1506
1507   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1508                                   &assumptions, &number_of_iterationsm1,
1509                                   &number_of_iterations, &inner_loop_cond))
1510     return NULL;
1511
1512   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1513   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1514   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1515   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1516   if (!integer_onep (assumptions))
1517     {
1518       /* We consider to vectorize this loop by versioning it under
1519          some assumptions.  In order to do this, we need to clear
1520          existing information computed by scev and niter analyzer.  */
1521       scev_reset_htab ();
1522       free_numbers_of_iterations_estimates (loop);
1523       /* Also set flag for this loop so that following scev and niter
1524          analysis are done under the assumptions.  */
1525       loop_constraint_set (loop, LOOP_C_FINITE);
1526       /* Also record the assumptions for versioning.  */
1527       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1528     }
1529
1530   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1531     {
1532       if (dump_enabled_p ())
1533         {
1534           dump_printf_loc (MSG_NOTE, vect_location,
1535                            "Symbolic number of iterations is ");
1536           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1537           dump_printf (MSG_NOTE, "\n");
1538         }
1539     }
1540
1541   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1542   if (inner_loop_cond)
1543     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1544       = loop_exit_ctrl_vec_info_type;
1545
1546   gcc_assert (!loop->aux);
1547   loop->aux = loop_vinfo;
1548   return loop_vinfo;
1549 }
1550
1551
1552
1553 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1554    statements update the vectorization factor.  */
1555
1556 static void
1557 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1558 {
1559   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1560   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1561   int nbbs = loop->num_nodes;
1562   unsigned int vectorization_factor;
1563   int i;
1564
1565   if (dump_enabled_p ())
1566     dump_printf_loc (MSG_NOTE, vect_location,
1567                      "=== vect_update_vf_for_slp ===\n");
1568
1569   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1570   gcc_assert (vectorization_factor != 0);
1571
1572   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1573      vectorization factor of the loop is the unrolling factor required by
1574      the SLP instances.  If that unrolling factor is 1, we say, that we
1575      perform pure SLP on loop - cross iteration parallelism is not
1576      exploited.  */
1577   bool only_slp_in_loop = true;
1578   for (i = 0; i < nbbs; i++)
1579     {
1580       basic_block bb = bbs[i];
1581       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1582            gsi_next (&si))
1583         {
1584           gimple *stmt = gsi_stmt (si);
1585           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1586           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1587               && STMT_VINFO_RELATED_STMT (stmt_info))
1588             {
1589               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1590               stmt_info = vinfo_for_stmt (stmt);
1591             }
1592           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1593                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1594               && !PURE_SLP_STMT (stmt_info))
1595             /* STMT needs both SLP and loop-based vectorization.  */
1596             only_slp_in_loop = false;
1597         }
1598     }
1599
1600   if (only_slp_in_loop)
1601     {
1602       dump_printf_loc (MSG_NOTE, vect_location,
1603                        "Loop contains only SLP stmts\n");
1604       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1605     }
1606   else
1607     {
1608       dump_printf_loc (MSG_NOTE, vect_location,
1609                        "Loop contains SLP and non-SLP stmts\n");
1610       vectorization_factor
1611         = least_common_multiple (vectorization_factor,
1612                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1613     }
1614
1615   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1616   if (dump_enabled_p ())
1617     dump_printf_loc (MSG_NOTE, vect_location,
1618                      "Updating vectorization factor to %d\n",
1619                      vectorization_factor);
1620 }
1621
1622 /* Function vect_analyze_loop_operations.
1623
1624    Scan the loop stmts and make sure they are all vectorizable.  */
1625
1626 static bool
1627 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1628 {
1629   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1630   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1631   int nbbs = loop->num_nodes;
1632   int i;
1633   stmt_vec_info stmt_info;
1634   bool need_to_vectorize = false;
1635   bool ok;
1636
1637   if (dump_enabled_p ())
1638     dump_printf_loc (MSG_NOTE, vect_location,
1639                      "=== vect_analyze_loop_operations ===\n");
1640
1641   for (i = 0; i < nbbs; i++)
1642     {
1643       basic_block bb = bbs[i];
1644
1645       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1646            gsi_next (&si))
1647         {
1648           gphi *phi = si.phi ();
1649           ok = true;
1650
1651           stmt_info = vinfo_for_stmt (phi);
1652           if (dump_enabled_p ())
1653             {
1654               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1655               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1656             }
1657           if (virtual_operand_p (gimple_phi_result (phi)))
1658             continue;
1659
1660           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1661              (i.e., a phi in the tail of the outer-loop).  */
1662           if (! is_loop_header_bb_p (bb))
1663             {
1664               /* FORNOW: we currently don't support the case that these phis
1665                  are not used in the outerloop (unless it is double reduction,
1666                  i.e., this phi is vect_reduction_def), cause this case
1667                  requires to actually do something here.  */
1668               if (STMT_VINFO_LIVE_P (stmt_info)
1669                   && STMT_VINFO_DEF_TYPE (stmt_info)
1670                      != vect_double_reduction_def)
1671                 {
1672                   if (dump_enabled_p ())
1673                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1674                                      "Unsupported loop-closed phi in "
1675                                      "outer-loop.\n");
1676                   return false;
1677                 }
1678
1679               /* If PHI is used in the outer loop, we check that its operand
1680                  is defined in the inner loop.  */
1681               if (STMT_VINFO_RELEVANT_P (stmt_info))
1682                 {
1683                   tree phi_op;
1684                   gimple *op_def_stmt;
1685
1686                   if (gimple_phi_num_args (phi) != 1)
1687                     return false;
1688
1689                   phi_op = PHI_ARG_DEF (phi, 0);
1690                   if (TREE_CODE (phi_op) != SSA_NAME)
1691                     return false;
1692
1693                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1694                   if (gimple_nop_p (op_def_stmt)
1695                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1696                       || !vinfo_for_stmt (op_def_stmt))
1697                     return false;
1698
1699                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1700                         != vect_used_in_outer
1701                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1702                            != vect_used_in_outer_by_reduction)
1703                     return false;
1704                 }
1705
1706               continue;
1707             }
1708
1709           gcc_assert (stmt_info);
1710
1711           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1712                || STMT_VINFO_LIVE_P (stmt_info))
1713               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1714             {
1715               /* A scalar-dependence cycle that we don't support.  */
1716               if (dump_enabled_p ())
1717                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1718                                  "not vectorized: scalar dependence cycle.\n");
1719               return false;
1720             }
1721
1722           if (STMT_VINFO_RELEVANT_P (stmt_info))
1723             {
1724               need_to_vectorize = true;
1725               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1726                   && ! PURE_SLP_STMT (stmt_info))
1727                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1728               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1729                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1730                        && ! PURE_SLP_STMT (stmt_info))
1731                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1732             }
1733
1734           if (ok && STMT_VINFO_LIVE_P (stmt_info))
1735             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1736
1737           if (!ok)
1738             {
1739               if (dump_enabled_p ())
1740                 {
1741                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1742                                    "not vectorized: relevant phi not "
1743                                    "supported: ");
1744                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1745                 }
1746               return false;
1747             }
1748         }
1749
1750       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1751            gsi_next (&si))
1752         {
1753           gimple *stmt = gsi_stmt (si);
1754           if (!gimple_clobber_p (stmt)
1755               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1756             return false;
1757         }
1758     } /* bbs */
1759
1760   /* All operations in the loop are either irrelevant (deal with loop
1761      control, or dead), or only used outside the loop and can be moved
1762      out of the loop (e.g. invariants, inductions).  The loop can be
1763      optimized away by scalar optimizations.  We're better off not
1764      touching this loop.  */
1765   if (!need_to_vectorize)
1766     {
1767       if (dump_enabled_p ())
1768         dump_printf_loc (MSG_NOTE, vect_location,
1769                          "All the computation can be taken out of the loop.\n");
1770       if (dump_enabled_p ())
1771         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1772                          "not vectorized: redundant loop. no profit to "
1773                          "vectorize.\n");
1774       return false;
1775     }
1776
1777   return true;
1778 }
1779
1780
1781 /* Function vect_analyze_loop_2.
1782
1783    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1784    for it.  The different analyses will record information in the
1785    loop_vec_info struct.  */
1786 static bool
1787 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1788 {
1789   bool ok;
1790   int max_vf = MAX_VECTORIZATION_FACTOR;
1791   int min_vf = 2;
1792   unsigned int n_stmts = 0;
1793
1794   /* The first group of checks is independent of the vector size.  */
1795   fatal = true;
1796
1797   /* Find all data references in the loop (which correspond to vdefs/vuses)
1798      and analyze their evolution in the loop.  */
1799
1800   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1801
1802   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1803   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1804     {
1805       if (dump_enabled_p ())
1806         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1807                          "not vectorized: loop nest containing two "
1808                          "or more consecutive inner loops cannot be "
1809                          "vectorized\n");
1810       return false;
1811     }
1812
1813   for (unsigned i = 0; i < loop->num_nodes; i++)
1814     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1815          !gsi_end_p (gsi); gsi_next (&gsi))
1816       {
1817         gimple *stmt = gsi_stmt (gsi);
1818         if (is_gimple_debug (stmt))
1819           continue;
1820         ++n_stmts;
1821         if (!find_data_references_in_stmt (loop, stmt,
1822                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1823           {
1824             if (is_gimple_call (stmt) && loop->safelen)
1825               {
1826                 tree fndecl = gimple_call_fndecl (stmt), op;
1827                 if (fndecl != NULL_TREE)
1828                   {
1829                     cgraph_node *node = cgraph_node::get (fndecl);
1830                     if (node != NULL && node->simd_clones != NULL)
1831                       {
1832                         unsigned int j, n = gimple_call_num_args (stmt);
1833                         for (j = 0; j < n; j++)
1834                           {
1835                             op = gimple_call_arg (stmt, j);
1836                             if (DECL_P (op)
1837                                 || (REFERENCE_CLASS_P (op)
1838                                     && get_base_address (op)))
1839                               break;
1840                           }
1841                         op = gimple_call_lhs (stmt);
1842                         /* Ignore #pragma omp declare simd functions
1843                            if they don't have data references in the
1844                            call stmt itself.  */
1845                         if (j == n
1846                             && !(op
1847                                  && (DECL_P (op)
1848                                      || (REFERENCE_CLASS_P (op)
1849                                          && get_base_address (op)))))
1850                           continue;
1851                       }
1852                   }
1853               }
1854             if (dump_enabled_p ())
1855               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856                                "not vectorized: loop contains function "
1857                                "calls or data references that cannot "
1858                                "be analyzed\n");
1859             return false;
1860           }
1861       }
1862
1863   /* Analyze the data references and also adjust the minimal
1864      vectorization factor according to the loads and stores.  */
1865
1866   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1867   if (!ok)
1868     {
1869       if (dump_enabled_p ())
1870         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1871                          "bad data references.\n");
1872       return false;
1873     }
1874
1875   /* Classify all cross-iteration scalar data-flow cycles.
1876      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1877   vect_analyze_scalar_cycles (loop_vinfo);
1878
1879   vect_pattern_recog (loop_vinfo);
1880
1881   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1882
1883   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1884      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1885
1886   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1887   if (!ok)
1888     {
1889       if (dump_enabled_p ())
1890         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1891                          "bad data access.\n");
1892       return false;
1893     }
1894
1895   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1896
1897   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1898   if (!ok)
1899     {
1900       if (dump_enabled_p ())
1901         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1902                          "unexpected pattern.\n");
1903       return false;
1904     }
1905
1906   /* While the rest of the analysis below depends on it in some way.  */
1907   fatal = false;
1908
1909   /* Analyze data dependences between the data-refs in the loop
1910      and adjust the maximum vectorization factor according to
1911      the dependences.
1912      FORNOW: fail at the first data dependence that we encounter.  */
1913
1914   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1915   if (!ok
1916       || max_vf < min_vf)
1917     {
1918       if (dump_enabled_p ())
1919             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1920                              "bad data dependence.\n");
1921       return false;
1922     }
1923
1924   ok = vect_determine_vectorization_factor (loop_vinfo);
1925   if (!ok)
1926     {
1927       if (dump_enabled_p ())
1928         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929                          "can't determine vectorization factor.\n");
1930       return false;
1931     }
1932   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1933     {
1934       if (dump_enabled_p ())
1935         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1936                          "bad data dependence.\n");
1937       return false;
1938     }
1939
1940   /* Compute the scalar iteration cost.  */
1941   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1942
1943   int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1944   HOST_WIDE_INT estimated_niter;
1945   unsigned th;
1946   int min_scalar_loop_bound;
1947
1948   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1949   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1950   if (!ok)
1951     return false;
1952
1953   /* If there are any SLP instances mark them as pure_slp.  */
1954   bool slp = vect_make_slp_decision (loop_vinfo);
1955   if (slp)
1956     {
1957       /* Find stmts that need to be both vectorized and SLPed.  */
1958       vect_detect_hybrid_slp (loop_vinfo);
1959
1960       /* Update the vectorization factor based on the SLP decision.  */
1961       vect_update_vf_for_slp (loop_vinfo);
1962     }
1963
1964   /* This is the point where we can re-start analysis with SLP forced off.  */
1965 start_over:
1966
1967   /* Now the vectorization factor is final.  */
1968   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1969   gcc_assert (vectorization_factor != 0);
1970
1971   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1972     dump_printf_loc (MSG_NOTE, vect_location,
1973                      "vectorization_factor = %d, niters = "
1974                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1975                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1976
1977   HOST_WIDE_INT max_niter
1978     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1979   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1980        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1981       || (max_niter != -1
1982           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1983     {
1984       if (dump_enabled_p ())
1985         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1986                          "not vectorized: iteration count smaller than "
1987                          "vectorization factor.\n");
1988       return false;
1989     }
1990
1991   /* Analyze the alignment of the data-refs in the loop.
1992      Fail if a data reference is found that cannot be vectorized.  */
1993
1994   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1995   if (!ok)
1996     {
1997       if (dump_enabled_p ())
1998         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1999                          "bad data alignment.\n");
2000       return false;
2001     }
2002
2003   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2004      It is important to call pruning after vect_analyze_data_ref_accesses,
2005      since we use grouping information gathered by interleaving analysis.  */
2006   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2007   if (!ok)
2008     return false;
2009
2010   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2011      vectorization.  */
2012   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2013     {
2014     /* This pass will decide on using loop versioning and/or loop peeling in
2015        order to enhance the alignment of data references in the loop.  */
2016     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2017     if (!ok)
2018       {
2019         if (dump_enabled_p ())
2020           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2021                            "bad data alignment.\n");
2022         return false;
2023       }
2024     }
2025
2026   if (slp)
2027     {
2028       /* Analyze operations in the SLP instances.  Note this may
2029          remove unsupported SLP instances which makes the above
2030          SLP kind detection invalid.  */
2031       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2032       vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
2033                                    LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2034       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2035         goto again;
2036     }
2037
2038   /* Scan all the remaining operations in the loop that are not subject
2039      to SLP and make sure they are vectorizable.  */
2040   ok = vect_analyze_loop_operations (loop_vinfo);
2041   if (!ok)
2042     {
2043       if (dump_enabled_p ())
2044         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2045                          "bad operation or unsupported loop bound.\n");
2046       return false;
2047     }
2048
2049   /* If epilog loop is required because of data accesses with gaps,
2050      one additional iteration needs to be peeled.  Check if there is
2051      enough iterations for vectorization.  */
2052   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2053       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2054     {
2055       int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2056       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2057
2058       if (wi::to_widest (scalar_niters) < vf)
2059         {
2060           if (dump_enabled_p ())
2061             dump_printf_loc (MSG_NOTE, vect_location,
2062                              "loop has no enough iterations to support"
2063                              " peeling for gaps.\n");
2064           return false;
2065         }
2066     }
2067
2068   /* Analyze cost.  Decide if worth while to vectorize.  */
2069   int min_profitable_estimate, min_profitable_iters;
2070   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2071                                       &min_profitable_estimate);
2072
2073   if (min_profitable_iters < 0)
2074     {
2075       if (dump_enabled_p ())
2076         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2077                          "not vectorized: vectorization not profitable.\n");
2078       if (dump_enabled_p ())
2079         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2080                          "not vectorized: vector version will never be "
2081                          "profitable.\n");
2082       goto again;
2083     }
2084
2085   min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2086                            * vectorization_factor);
2087
2088   /* Use the cost model only if it is more conservative than user specified
2089      threshold.  */
2090   th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2091
2092   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2093
2094   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2095       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2096     {
2097       if (dump_enabled_p ())
2098         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2099                          "not vectorized: vectorization not profitable.\n");
2100       if (dump_enabled_p ())
2101         dump_printf_loc (MSG_NOTE, vect_location,
2102                          "not vectorized: iteration count smaller than user "
2103                          "specified loop bound parameter or minimum profitable "
2104                          "iterations (whichever is more conservative).\n");
2105       goto again;
2106     }
2107
2108   estimated_niter
2109     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2110   if (estimated_niter == -1)
2111     estimated_niter = max_niter;
2112   if (estimated_niter != -1
2113       && ((unsigned HOST_WIDE_INT) estimated_niter
2114           < MAX (th, (unsigned) min_profitable_estimate)))
2115     {
2116       if (dump_enabled_p ())
2117         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2118                          "not vectorized: estimated iteration count too "
2119                          "small.\n");
2120       if (dump_enabled_p ())
2121         dump_printf_loc (MSG_NOTE, vect_location,
2122                          "not vectorized: estimated iteration count smaller "
2123                          "than specified loop bound parameter or minimum "
2124                          "profitable iterations (whichever is more "
2125                          "conservative).\n");
2126       goto again;
2127     }
2128
2129   /* Decide whether we need to create an epilogue loop to handle
2130      remaining scalar iterations.  */
2131   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo)
2132          / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2133         * LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2134
2135   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2136       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2137     {
2138       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2139                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2140           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2141         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2142     }
2143   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2144            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2145                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2146                /* In case of versioning, check if the maximum number of
2147                   iterations is greater than th.  If they are identical,
2148                   the epilogue is unnecessary.  */
2149                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2150                    || (unsigned HOST_WIDE_INT) max_niter > th)))
2151     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2152
2153   /* If an epilogue loop is required make sure we can create one.  */
2154   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2155       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2156     {
2157       if (dump_enabled_p ())
2158         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2159       if (!vect_can_advance_ivs_p (loop_vinfo)
2160           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2161                                            single_exit (LOOP_VINFO_LOOP
2162                                                          (loop_vinfo))))
2163         {
2164           if (dump_enabled_p ())
2165             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2166                              "not vectorized: can't create required "
2167                              "epilog loop\n");
2168           goto again;
2169         }
2170     }
2171
2172   /* During peeling, we need to check if number of loop iterations is
2173      enough for both peeled prolog loop and vector loop.  This check
2174      can be merged along with threshold check of loop versioning, so
2175      increase threshold for this case if necessary.  */
2176   if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
2177       && (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2178           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2179     {
2180       unsigned niters_th;
2181
2182       /* Niters for peeled prolog loop.  */
2183       if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2184         {
2185           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2186           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2187
2188           niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2189         }
2190       else
2191         niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2192
2193       /* Niters for at least one iteration of vectorized loop.  */
2194       niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2195       /* One additional iteration because of peeling for gap.  */
2196       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2197         niters_th++;
2198       if (LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) < niters_th)
2199         LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = niters_th;
2200     }
2201
2202   gcc_assert (vectorization_factor
2203               == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2204
2205   /* Ok to vectorize!  */
2206   return true;
2207
2208 again:
2209   /* Try again with SLP forced off but if we didn't do any SLP there is
2210      no point in re-trying.  */
2211   if (!slp)
2212     return false;
2213
2214   /* If there are reduction chains re-trying will fail anyway.  */
2215   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2216     return false;
2217
2218   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2219      via interleaving or lane instructions.  */
2220   slp_instance instance;
2221   slp_tree node;
2222   unsigned i, j;
2223   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2224     {
2225       stmt_vec_info vinfo;
2226       vinfo = vinfo_for_stmt
2227           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2228       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2229         continue;
2230       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2231       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2232       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2233       if (! vect_store_lanes_supported (vectype, size)
2234           && ! vect_grouped_store_supported (vectype, size))
2235         return false;
2236       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2237         {
2238           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2239           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2240           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2241           size = STMT_VINFO_GROUP_SIZE (vinfo);
2242           vectype = STMT_VINFO_VECTYPE (vinfo);
2243           if (! vect_load_lanes_supported (vectype, size)
2244               && ! vect_grouped_load_supported (vectype, single_element_p,
2245                                                 size))
2246             return false;
2247         }
2248     }
2249
2250   if (dump_enabled_p ())
2251     dump_printf_loc (MSG_NOTE, vect_location,
2252                      "re-trying with SLP disabled\n");
2253
2254   /* Roll back state appropriately.  No SLP this time.  */
2255   slp = false;
2256   /* Restore vectorization factor as it were without SLP.  */
2257   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2258   /* Free the SLP instances.  */
2259   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2260     vect_free_slp_instance (instance);
2261   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2262   /* Reset SLP type to loop_vect on all stmts.  */
2263   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2264     {
2265       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2266       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2267            !gsi_end_p (si); gsi_next (&si))
2268         {
2269           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2270           STMT_SLP_TYPE (stmt_info) = loop_vect;
2271         }
2272       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2273            !gsi_end_p (si); gsi_next (&si))
2274         {
2275           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2276           STMT_SLP_TYPE (stmt_info) = loop_vect;
2277           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2278             {
2279               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2280               STMT_SLP_TYPE (stmt_info) = loop_vect;
2281               for (gimple_stmt_iterator pi
2282                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2283                    !gsi_end_p (pi); gsi_next (&pi))
2284                 {
2285                   gimple *pstmt = gsi_stmt (pi);
2286                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2287                 }
2288             }
2289         }
2290     }
2291   /* Free optimized alias test DDRS.  */
2292   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2293   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2294   /* Reset target cost data.  */
2295   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2296   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2297     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2298   /* Reset assorted flags.  */
2299   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2300   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2301   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2302
2303   goto start_over;
2304 }
2305
2306 /* Function vect_analyze_loop.
2307
2308    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2309    for it.  The different analyses will record information in the
2310    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2311    be vectorized.  */
2312 loop_vec_info
2313 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2314 {
2315   loop_vec_info loop_vinfo;
2316   unsigned int vector_sizes;
2317
2318   /* Autodetect first vector size we try.  */
2319   current_vector_size = 0;
2320   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2321
2322   if (dump_enabled_p ())
2323     dump_printf_loc (MSG_NOTE, vect_location,
2324                      "===== analyze_loop_nest =====\n");
2325
2326   if (loop_outer (loop)
2327       && loop_vec_info_for_loop (loop_outer (loop))
2328       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2329     {
2330       if (dump_enabled_p ())
2331         dump_printf_loc (MSG_NOTE, vect_location,
2332                          "outer-loop already vectorized.\n");
2333       return NULL;
2334     }
2335
2336   while (1)
2337     {
2338       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2339       loop_vinfo = vect_analyze_loop_form (loop);
2340       if (!loop_vinfo)
2341         {
2342           if (dump_enabled_p ())
2343             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2344                              "bad loop form.\n");
2345           return NULL;
2346         }
2347
2348       bool fatal = false;
2349
2350       if (orig_loop_vinfo)
2351         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2352
2353       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2354         {
2355           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2356
2357           return loop_vinfo;
2358         }
2359
2360       delete loop_vinfo;
2361
2362       vector_sizes &= ~current_vector_size;
2363       if (fatal
2364           || vector_sizes == 0
2365           || current_vector_size == 0)
2366         return NULL;
2367
2368       /* Try the next biggest vector size.  */
2369       current_vector_size = 1 << floor_log2 (vector_sizes);
2370       if (dump_enabled_p ())
2371         dump_printf_loc (MSG_NOTE, vect_location,
2372                          "***** Re-trying analysis with "
2373                          "vector size %d\n", current_vector_size);
2374     }
2375 }
2376
2377
2378 /* Function reduction_code_for_scalar_code
2379
2380    Input:
2381    CODE - tree_code of a reduction operations.
2382
2383    Output:
2384    REDUC_CODE - the corresponding tree-code to be used to reduce the
2385       vector of partial results into a single scalar result, or ERROR_MARK
2386       if the operation is a supported reduction operation, but does not have
2387       such a tree-code.
2388
2389    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2390
2391 static bool
2392 reduction_code_for_scalar_code (enum tree_code code,
2393                                 enum tree_code *reduc_code)
2394 {
2395   switch (code)
2396     {
2397       case MAX_EXPR:
2398         *reduc_code = REDUC_MAX_EXPR;
2399         return true;
2400
2401       case MIN_EXPR:
2402         *reduc_code = REDUC_MIN_EXPR;
2403         return true;
2404
2405       case PLUS_EXPR:
2406         *reduc_code = REDUC_PLUS_EXPR;
2407         return true;
2408
2409       case MULT_EXPR:
2410       case MINUS_EXPR:
2411       case BIT_IOR_EXPR:
2412       case BIT_XOR_EXPR:
2413       case BIT_AND_EXPR:
2414         *reduc_code = ERROR_MARK;
2415         return true;
2416
2417       default:
2418        return false;
2419     }
2420 }
2421
2422
2423 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2424    STMT is printed with a message MSG. */
2425
2426 static void
2427 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2428 {
2429   dump_printf_loc (msg_type, vect_location, "%s", msg);
2430   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2431 }
2432
2433
2434 /* Detect SLP reduction of the form:
2435
2436    #a1 = phi <a5, a0>
2437    a2 = operation (a1)
2438    a3 = operation (a2)
2439    a4 = operation (a3)
2440    a5 = operation (a4)
2441
2442    #a = phi <a5>
2443
2444    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2445    FIRST_STMT is the first reduction stmt in the chain
2446    (a2 = operation (a1)).
2447
2448    Return TRUE if a reduction chain was detected.  */
2449
2450 static bool
2451 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2452                        gimple *first_stmt)
2453 {
2454   struct loop *loop = (gimple_bb (phi))->loop_father;
2455   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2456   enum tree_code code;
2457   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2458   stmt_vec_info use_stmt_info, current_stmt_info;
2459   tree lhs;
2460   imm_use_iterator imm_iter;
2461   use_operand_p use_p;
2462   int nloop_uses, size = 0, n_out_of_loop_uses;
2463   bool found = false;
2464
2465   if (loop != vect_loop)
2466     return false;
2467
2468   lhs = PHI_RESULT (phi);
2469   code = gimple_assign_rhs_code (first_stmt);
2470   while (1)
2471     {
2472       nloop_uses = 0;
2473       n_out_of_loop_uses = 0;
2474       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2475         {
2476           gimple *use_stmt = USE_STMT (use_p);
2477           if (is_gimple_debug (use_stmt))
2478             continue;
2479
2480           /* Check if we got back to the reduction phi.  */
2481           if (use_stmt == phi)
2482             {
2483               loop_use_stmt = use_stmt;
2484               found = true;
2485               break;
2486             }
2487
2488           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2489             {
2490               loop_use_stmt = use_stmt;
2491               nloop_uses++;
2492             }
2493            else
2494              n_out_of_loop_uses++;
2495
2496            /* There are can be either a single use in the loop or two uses in
2497               phi nodes.  */
2498            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2499              return false;
2500         }
2501
2502       if (found)
2503         break;
2504
2505       /* We reached a statement with no loop uses.  */
2506       if (nloop_uses == 0)
2507         return false;
2508
2509       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2510       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2511         return false;
2512
2513       if (!is_gimple_assign (loop_use_stmt)
2514           || code != gimple_assign_rhs_code (loop_use_stmt)
2515           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2516         return false;
2517
2518       /* Insert USE_STMT into reduction chain.  */
2519       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2520       if (current_stmt)
2521         {
2522           current_stmt_info = vinfo_for_stmt (current_stmt);
2523           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2524           GROUP_FIRST_ELEMENT (use_stmt_info)
2525             = GROUP_FIRST_ELEMENT (current_stmt_info);
2526         }
2527       else
2528         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2529
2530       lhs = gimple_assign_lhs (loop_use_stmt);
2531       current_stmt = loop_use_stmt;
2532       size++;
2533    }
2534
2535   if (!found || loop_use_stmt != phi || size < 2)
2536     return false;
2537
2538   /* Swap the operands, if needed, to make the reduction operand be the second
2539      operand.  */
2540   lhs = PHI_RESULT (phi);
2541   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2542   while (next_stmt)
2543     {
2544       if (gimple_assign_rhs2 (next_stmt) == lhs)
2545         {
2546           tree op = gimple_assign_rhs1 (next_stmt);
2547           gimple *def_stmt = NULL;
2548
2549           if (TREE_CODE (op) == SSA_NAME)
2550             def_stmt = SSA_NAME_DEF_STMT (op);
2551
2552           /* Check that the other def is either defined in the loop
2553              ("vect_internal_def"), or it's an induction (defined by a
2554              loop-header phi-node).  */
2555           if (def_stmt
2556               && gimple_bb (def_stmt)
2557               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2558               && (is_gimple_assign (def_stmt)
2559                   || is_gimple_call (def_stmt)
2560                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2561                            == vect_induction_def
2562                   || (gimple_code (def_stmt) == GIMPLE_PHI
2563                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2564                                   == vect_internal_def
2565                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2566             {
2567               lhs = gimple_assign_lhs (next_stmt);
2568               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2569               continue;
2570             }
2571
2572           return false;
2573         }
2574       else
2575         {
2576           tree op = gimple_assign_rhs2 (next_stmt);
2577           gimple *def_stmt = NULL;
2578
2579           if (TREE_CODE (op) == SSA_NAME)
2580             def_stmt = SSA_NAME_DEF_STMT (op);
2581
2582           /* Check that the other def is either defined in the loop
2583             ("vect_internal_def"), or it's an induction (defined by a
2584             loop-header phi-node).  */
2585           if (def_stmt
2586               && gimple_bb (def_stmt)
2587               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2588               && (is_gimple_assign (def_stmt)
2589                   || is_gimple_call (def_stmt)
2590                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2591                               == vect_induction_def
2592                   || (gimple_code (def_stmt) == GIMPLE_PHI
2593                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2594                                   == vect_internal_def
2595                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2596             {
2597               if (dump_enabled_p ())
2598                 {
2599                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2600                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2601                 }
2602
2603               swap_ssa_operands (next_stmt,
2604                                  gimple_assign_rhs1_ptr (next_stmt),
2605                                  gimple_assign_rhs2_ptr (next_stmt));
2606               update_stmt (next_stmt);
2607
2608               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2609                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2610             }
2611           else
2612             return false;
2613         }
2614
2615       lhs = gimple_assign_lhs (next_stmt);
2616       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2617     }
2618
2619   /* Save the chain for further analysis in SLP detection.  */
2620   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2621   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2622   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2623
2624   return true;
2625 }
2626
2627
2628 /* Function vect_is_simple_reduction
2629
2630    (1) Detect a cross-iteration def-use cycle that represents a simple
2631    reduction computation.  We look for the following pattern:
2632
2633    loop_header:
2634      a1 = phi < a0, a2 >
2635      a3 = ...
2636      a2 = operation (a3, a1)
2637
2638    or
2639
2640    a3 = ...
2641    loop_header:
2642      a1 = phi < a0, a2 >
2643      a2 = operation (a3, a1)
2644
2645    such that:
2646    1. operation is commutative and associative and it is safe to
2647       change the order of the computation
2648    2. no uses for a2 in the loop (a2 is used out of the loop)
2649    3. no uses of a1 in the loop besides the reduction operation
2650    4. no uses of a1 outside the loop.
2651
2652    Conditions 1,4 are tested here.
2653    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2654
2655    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2656    nested cycles.
2657
2658    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2659    reductions:
2660
2661      a1 = phi < a0, a2 >
2662      inner loop (def of a3)
2663      a2 = phi < a3 >
2664
2665    (4) Detect condition expressions, ie:
2666      for (int i = 0; i < N; i++)
2667        if (a[i] < val)
2668         ret_val = a[i];
2669
2670 */
2671
2672 static gimple *
2673 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2674                           bool *double_reduc,
2675                           bool need_wrapping_integral_overflow,
2676                           enum vect_reduction_type *v_reduc_type)
2677 {
2678   struct loop *loop = (gimple_bb (phi))->loop_father;
2679   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2680   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2681   enum tree_code orig_code, code;
2682   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2683   tree type;
2684   int nloop_uses;
2685   tree name;
2686   imm_use_iterator imm_iter;
2687   use_operand_p use_p;
2688   bool phi_def;
2689
2690   *double_reduc = false;
2691   *v_reduc_type = TREE_CODE_REDUCTION;
2692
2693   tree phi_name = PHI_RESULT (phi);
2694   /* ???  If there are no uses of the PHI result the inner loop reduction
2695      won't be detected as possibly double-reduction by vectorizable_reduction
2696      because that tries to walk the PHI arg from the preheader edge which
2697      can be constant.  See PR60382.  */
2698   if (has_zero_uses (phi_name))
2699     return NULL;
2700   nloop_uses = 0;
2701   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2702     {
2703       gimple *use_stmt = USE_STMT (use_p);
2704       if (is_gimple_debug (use_stmt))
2705         continue;
2706
2707       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2708         {
2709           if (dump_enabled_p ())
2710             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2711                              "intermediate value used outside loop.\n");
2712
2713           return NULL;
2714         }
2715
2716       nloop_uses++;
2717       if (nloop_uses > 1)
2718         {
2719           if (dump_enabled_p ())
2720             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2721                              "reduction value used in loop.\n");
2722           return NULL;
2723         }
2724
2725       phi_use_stmt = use_stmt;
2726     }
2727
2728   edge latch_e = loop_latch_edge (loop);
2729   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2730   if (TREE_CODE (loop_arg) != SSA_NAME)
2731     {
2732       if (dump_enabled_p ())
2733         {
2734           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2735                            "reduction: not ssa_name: ");
2736           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2737           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2738         }
2739       return NULL;
2740     }
2741
2742   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2743   if (is_gimple_assign (def_stmt))
2744     {
2745       name = gimple_assign_lhs (def_stmt);
2746       phi_def = false;
2747     }
2748   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2749     {
2750       name = PHI_RESULT (def_stmt);
2751       phi_def = true;
2752     }
2753   else
2754     {
2755       if (dump_enabled_p ())
2756         {
2757           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2758                            "reduction: unhandled reduction operation: ");
2759           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2760         }
2761       return NULL;
2762     }
2763
2764   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2765     return NULL;
2766
2767   nloop_uses = 0;
2768   auto_vec<gphi *, 3> lcphis;
2769   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2770     {
2771       gimple *use_stmt = USE_STMT (use_p);
2772       if (is_gimple_debug (use_stmt))
2773         continue;
2774       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2775         nloop_uses++;
2776       else
2777         /* We can have more than one loop-closed PHI.  */
2778         lcphis.safe_push (as_a <gphi *> (use_stmt));
2779       if (nloop_uses > 1)
2780         {
2781           if (dump_enabled_p ())
2782             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2783                              "reduction used in loop.\n");
2784           return NULL;
2785         }
2786     }
2787
2788   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2789      defined in the inner loop.  */
2790   if (phi_def)
2791     {
2792       op1 = PHI_ARG_DEF (def_stmt, 0);
2793
2794       if (gimple_phi_num_args (def_stmt) != 1
2795           || TREE_CODE (op1) != SSA_NAME)
2796         {
2797           if (dump_enabled_p ())
2798             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2799                              "unsupported phi node definition.\n");
2800
2801           return NULL;
2802         }
2803
2804       def1 = SSA_NAME_DEF_STMT (op1);
2805       if (gimple_bb (def1)
2806           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2807           && loop->inner
2808           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2809           && is_gimple_assign (def1)
2810           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2811         {
2812           if (dump_enabled_p ())
2813             report_vect_op (MSG_NOTE, def_stmt,
2814                             "detected double reduction: ");
2815
2816           *double_reduc = true;
2817           return def_stmt;
2818         }
2819
2820       return NULL;
2821     }
2822
2823   /* If we are vectorizing an inner reduction we are executing that
2824      in the original order only in case we are not dealing with a
2825      double reduction.  */
2826   bool check_reduction = true;
2827   if (flow_loop_nested_p (vect_loop, loop))
2828     {
2829       gphi *lcphi;
2830       unsigned i;
2831       check_reduction = false;
2832       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2833         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2834           {
2835             gimple *use_stmt = USE_STMT (use_p);
2836             if (is_gimple_debug (use_stmt))
2837               continue;
2838             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2839               check_reduction = true;
2840           }
2841     }
2842
2843   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2844   code = orig_code = gimple_assign_rhs_code (def_stmt);
2845
2846   /* We can handle "res -= x[i]", which is non-associative by
2847      simply rewriting this into "res += -x[i]".  Avoid changing
2848      gimple instruction for the first simple tests and only do this
2849      if we're allowed to change code at all.  */
2850   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2851     code = PLUS_EXPR;
2852
2853   if (code == COND_EXPR)
2854     {
2855       if (! nested_in_vect_loop)
2856         *v_reduc_type = COND_REDUCTION;
2857
2858       op3 = gimple_assign_rhs1 (def_stmt);
2859       if (COMPARISON_CLASS_P (op3))
2860         {
2861           op4 = TREE_OPERAND (op3, 1);
2862           op3 = TREE_OPERAND (op3, 0);
2863         }
2864       if (op3 == phi_name || op4 == phi_name)
2865         {
2866           if (dump_enabled_p ())
2867             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2868                             "reduction: condition depends on previous"
2869                             " iteration: ");
2870           return NULL;
2871         }
2872
2873       op1 = gimple_assign_rhs2 (def_stmt);
2874       op2 = gimple_assign_rhs3 (def_stmt);
2875     }
2876   else if (!commutative_tree_code (code) || !associative_tree_code (code))
2877     {
2878       if (dump_enabled_p ())
2879         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2880                         "reduction: not commutative/associative: ");
2881       return NULL;
2882     }
2883   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2884     {
2885       op1 = gimple_assign_rhs1 (def_stmt);
2886       op2 = gimple_assign_rhs2 (def_stmt);
2887     }
2888   else
2889     {
2890       if (dump_enabled_p ())
2891         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2892                         "reduction: not handled operation: ");
2893       return NULL;
2894     }
2895
2896   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2897     {
2898       if (dump_enabled_p ())
2899         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2900                         "reduction: both uses not ssa_names: ");
2901
2902       return NULL;
2903     }
2904
2905   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2906   if ((TREE_CODE (op1) == SSA_NAME
2907        && !types_compatible_p (type,TREE_TYPE (op1)))
2908       || (TREE_CODE (op2) == SSA_NAME
2909           && !types_compatible_p (type, TREE_TYPE (op2)))
2910       || (op3 && TREE_CODE (op3) == SSA_NAME
2911           && !types_compatible_p (type, TREE_TYPE (op3)))
2912       || (op4 && TREE_CODE (op4) == SSA_NAME
2913           && !types_compatible_p (type, TREE_TYPE (op4))))
2914     {
2915       if (dump_enabled_p ())
2916         {
2917           dump_printf_loc (MSG_NOTE, vect_location,
2918                            "reduction: multiple types: operation type: ");
2919           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2920           dump_printf (MSG_NOTE, ", operands types: ");
2921           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2922                              TREE_TYPE (op1));
2923           dump_printf (MSG_NOTE, ",");
2924           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2925                              TREE_TYPE (op2));
2926           if (op3)
2927             {
2928               dump_printf (MSG_NOTE, ",");
2929               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2930                                  TREE_TYPE (op3));
2931             }
2932
2933           if (op4)
2934             {
2935               dump_printf (MSG_NOTE, ",");
2936               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2937                                  TREE_TYPE (op4));
2938             }
2939           dump_printf (MSG_NOTE, "\n");
2940         }
2941
2942       return NULL;
2943     }
2944
2945   /* Check that it's ok to change the order of the computation.
2946      Generally, when vectorizing a reduction we change the order of the
2947      computation.  This may change the behavior of the program in some
2948      cases, so we need to check that this is ok.  One exception is when
2949      vectorizing an outer-loop: the inner-loop is executed sequentially,
2950      and therefore vectorizing reductions in the inner-loop during
2951      outer-loop vectorization is safe.  */
2952
2953   if (*v_reduc_type != COND_REDUCTION
2954       && check_reduction)
2955     {
2956       /* CHECKME: check for !flag_finite_math_only too?  */
2957       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2958         {
2959           /* Changing the order of operations changes the semantics.  */
2960           if (dump_enabled_p ())
2961             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2962                         "reduction: unsafe fp math optimization: ");
2963           return NULL;
2964         }
2965       else if (INTEGRAL_TYPE_P (type))
2966         {
2967           if (!operation_no_trapping_overflow (type, code))
2968             {
2969               /* Changing the order of operations changes the semantics.  */
2970               if (dump_enabled_p ())
2971                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2972                                 "reduction: unsafe int math optimization"
2973                                 " (overflow traps): ");
2974               return NULL;
2975             }
2976           if (need_wrapping_integral_overflow
2977               && !TYPE_OVERFLOW_WRAPS (type)
2978               && operation_can_overflow (code))
2979             {
2980               /* Changing the order of operations changes the semantics.  */
2981               if (dump_enabled_p ())
2982                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2983                                 "reduction: unsafe int math optimization"
2984                                 " (overflow doesn't wrap): ");
2985               return NULL;
2986             }
2987         }
2988       else if (SAT_FIXED_POINT_TYPE_P (type))
2989         {
2990           /* Changing the order of operations changes the semantics.  */
2991           if (dump_enabled_p ())
2992           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2993                           "reduction: unsafe fixed-point math optimization: ");
2994           return NULL;
2995         }
2996     }
2997
2998   /* Reduction is safe. We're dealing with one of the following:
2999      1) integer arithmetic and no trapv
3000      2) floating point arithmetic, and special flags permit this optimization
3001      3) nested cycle (i.e., outer loop vectorization).  */
3002   if (TREE_CODE (op1) == SSA_NAME)
3003     def1 = SSA_NAME_DEF_STMT (op1);
3004
3005   if (TREE_CODE (op2) == SSA_NAME)
3006     def2 = SSA_NAME_DEF_STMT (op2);
3007
3008   if (code != COND_EXPR
3009       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3010     {
3011       if (dump_enabled_p ())
3012         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3013       return NULL;
3014     }
3015
3016   /* Check that one def is the reduction def, defined by PHI,
3017      the other def is either defined in the loop ("vect_internal_def"),
3018      or it's an induction (defined by a loop-header phi-node).  */
3019
3020   if (def2 && def2 == phi
3021       && (code == COND_EXPR
3022           || !def1 || gimple_nop_p (def1)
3023           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3024           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3025               && (is_gimple_assign (def1)
3026                   || is_gimple_call (def1)
3027                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3028                       == vect_induction_def
3029                   || (gimple_code (def1) == GIMPLE_PHI
3030                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3031                           == vect_internal_def
3032                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3033     {
3034       if (dump_enabled_p ())
3035         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3036       return def_stmt;
3037     }
3038
3039   if (def1 && def1 == phi
3040       && (code == COND_EXPR
3041           || !def2 || gimple_nop_p (def2)
3042           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3043           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3044               && (is_gimple_assign (def2)
3045                   || is_gimple_call (def2)
3046                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3047                        == vect_induction_def
3048                   || (gimple_code (def2) == GIMPLE_PHI
3049                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3050                            == vect_internal_def
3051                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3052     {
3053       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3054         {
3055           /* Check if we can swap operands (just for simplicity - so that
3056              the rest of the code can assume that the reduction variable
3057              is always the last (second) argument).  */
3058           if (code == COND_EXPR)
3059             {
3060               /* Swap cond_expr by inverting the condition.  */
3061               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3062               enum tree_code invert_code = ERROR_MARK;
3063               enum tree_code cond_code = TREE_CODE (cond_expr);
3064
3065               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3066                 {
3067                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3068                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3069                 }
3070               if (invert_code != ERROR_MARK)
3071                 {
3072                   TREE_SET_CODE (cond_expr, invert_code);
3073                   swap_ssa_operands (def_stmt,
3074                                      gimple_assign_rhs2_ptr (def_stmt),
3075                                      gimple_assign_rhs3_ptr (def_stmt));
3076                 }
3077               else
3078                 {
3079                   if (dump_enabled_p ())
3080                     report_vect_op (MSG_NOTE, def_stmt,
3081                                     "detected reduction: cannot swap operands "
3082                                     "for cond_expr");
3083                   return NULL;
3084                 }
3085             }
3086           else
3087             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3088                                gimple_assign_rhs2_ptr (def_stmt));
3089
3090           if (dump_enabled_p ())
3091             report_vect_op (MSG_NOTE, def_stmt,
3092                             "detected reduction: need to swap operands: ");
3093
3094           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3095             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3096         }
3097       else
3098         {
3099           if (dump_enabled_p ())
3100             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3101         }
3102
3103       return def_stmt;
3104     }
3105
3106   /* Try to find SLP reduction chain.  */
3107   if (! nested_in_vect_loop
3108       && code != COND_EXPR
3109       && orig_code != MINUS_EXPR
3110       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3111     {
3112       if (dump_enabled_p ())
3113         report_vect_op (MSG_NOTE, def_stmt,
3114                         "reduction: detected reduction chain: ");
3115
3116       return def_stmt;
3117     }
3118
3119   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3120   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3121   while (first)
3122     {
3123       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3124       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3125       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3126       first = next;
3127     }
3128
3129   /* Look for the expression computing loop_arg from loop PHI result.  */
3130   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3131   auto_bitmap visited;
3132   tree lookfor = PHI_RESULT (phi);
3133   ssa_op_iter curri;
3134   use_operand_p curr = op_iter_init_phiuse (&curri, as_a <gphi *>(phi),
3135                                             SSA_OP_USE);
3136   while (USE_FROM_PTR (curr) != loop_arg)
3137     curr = op_iter_next_use (&curri);
3138   curri.i = curri.numops;
3139   do
3140     {
3141       path.safe_push (std::make_pair (curri, curr));
3142       tree use = USE_FROM_PTR (curr);
3143       if (use == lookfor)
3144         break;
3145       gimple *def = SSA_NAME_DEF_STMT (use);
3146       if (gimple_nop_p (def)
3147           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3148         {
3149 pop:
3150           do
3151             {
3152               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3153               curri = x.first;
3154               curr = x.second;
3155               do
3156                 curr = op_iter_next_use (&curri);
3157               /* Skip already visited or non-SSA operands (from iterating
3158                  over PHI args).  */
3159               while (curr != NULL_USE_OPERAND_P
3160                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3161                          || ! bitmap_set_bit (visited,
3162                                               SSA_NAME_VERSION
3163                                                 (USE_FROM_PTR (curr)))));
3164             }
3165           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3166           if (curr == NULL_USE_OPERAND_P)
3167             break;
3168         }
3169       else
3170         {
3171           if (gimple_code (def) == GIMPLE_PHI)
3172             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3173           else
3174             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3175           while (curr != NULL_USE_OPERAND_P
3176                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3177                      || ! bitmap_set_bit (visited,
3178                                           SSA_NAME_VERSION
3179                                             (USE_FROM_PTR (curr)))))
3180             curr = op_iter_next_use (&curri);
3181           if (curr == NULL_USE_OPERAND_P)
3182             goto pop;
3183         }
3184     }
3185   while (1);
3186   if (dump_file && (dump_flags & TDF_DETAILS))
3187     {
3188       dump_printf_loc (MSG_NOTE, vect_location,
3189                        "reduction path: ");
3190       unsigned i;
3191       std::pair<ssa_op_iter, use_operand_p> *x;
3192       FOR_EACH_VEC_ELT (path, i, x)
3193         {
3194           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3195           dump_printf (MSG_NOTE, " ");
3196         }
3197       dump_printf (MSG_NOTE, "\n");
3198     }
3199
3200   /* Check whether the reduction path detected is valid.  */
3201   bool fail = path.length () == 0;
3202   bool neg = false;
3203   for (unsigned i = 1; i < path.length (); ++i)
3204     {
3205       gimple *use_stmt = USE_STMT (path[i].second);
3206       tree op = USE_FROM_PTR (path[i].second);
3207       if (! has_single_use (op)
3208           || ! is_gimple_assign (use_stmt))
3209         {
3210           fail = true;
3211           break;
3212         }
3213       if (gimple_assign_rhs_code (use_stmt) != code)
3214         {
3215           if (code == PLUS_EXPR
3216               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3217             {
3218               /* Track whether we negate the reduction value each iteration.  */
3219               if (gimple_assign_rhs2 (use_stmt) == op)
3220                 neg = ! neg;
3221             }
3222           else
3223             {
3224               fail = true;
3225               break;
3226             }
3227         }
3228     }
3229   if (! fail && ! neg)
3230     return def_stmt;
3231
3232   if (dump_enabled_p ())
3233     {
3234       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3235                       "reduction: unknown pattern: ");
3236     }
3237
3238   return NULL;
3239 }
3240
3241 /* Wrapper around vect_is_simple_reduction, which will modify code
3242    in-place if it enables detection of more reductions.  Arguments
3243    as there.  */
3244
3245 gimple *
3246 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3247                              bool *double_reduc,
3248                              bool need_wrapping_integral_overflow)
3249 {
3250   enum vect_reduction_type v_reduc_type;
3251   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3252                                           need_wrapping_integral_overflow,
3253                                           &v_reduc_type);
3254   if (def)
3255     {
3256       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3257       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3258       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3259       reduc_def_info = vinfo_for_stmt (def);
3260       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3261     }
3262   return def;
3263 }
3264
3265 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3266 int
3267 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3268                              int *peel_iters_epilogue,
3269                              stmt_vector_for_cost *scalar_cost_vec,
3270                              stmt_vector_for_cost *prologue_cost_vec,
3271                              stmt_vector_for_cost *epilogue_cost_vec)
3272 {
3273   int retval = 0;
3274   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3275
3276   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3277     {
3278       *peel_iters_epilogue = vf/2;
3279       if (dump_enabled_p ())
3280         dump_printf_loc (MSG_NOTE, vect_location,
3281                          "cost model: epilogue peel iters set to vf/2 "
3282                          "because loop iterations are unknown .\n");
3283
3284       /* If peeled iterations are known but number of scalar loop
3285          iterations are unknown, count a taken branch per peeled loop.  */
3286       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3287                                  NULL, 0, vect_prologue);
3288       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3289                                  NULL, 0, vect_epilogue);
3290     }
3291   else
3292     {
3293       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3294       peel_iters_prologue = niters < peel_iters_prologue ?
3295                             niters : peel_iters_prologue;
3296       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3297       /* If we need to peel for gaps, but no peeling is required, we have to
3298          peel VF iterations.  */
3299       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3300         *peel_iters_epilogue = vf;
3301     }
3302
3303   stmt_info_for_cost *si;
3304   int j;
3305   if (peel_iters_prologue)
3306     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3307         {
3308           stmt_vec_info stmt_info
3309             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3310           retval += record_stmt_cost (prologue_cost_vec,
3311                                       si->count * peel_iters_prologue,
3312                                       si->kind, stmt_info, si->misalign,
3313                                       vect_prologue);
3314         }
3315   if (*peel_iters_epilogue)
3316     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3317         {
3318           stmt_vec_info stmt_info
3319             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3320           retval += record_stmt_cost (epilogue_cost_vec,
3321                                       si->count * *peel_iters_epilogue,
3322                                       si->kind, stmt_info, si->misalign,
3323                                       vect_epilogue);
3324         }
3325
3326   return retval;
3327 }
3328
3329 /* Function vect_estimate_min_profitable_iters
3330
3331    Return the number of iterations required for the vector version of the
3332    loop to be profitable relative to the cost of the scalar version of the
3333    loop.
3334
3335    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3336    of iterations for vectorization.  -1 value means loop vectorization
3337    is not profitable.  This returned value may be used for dynamic
3338    profitability check.
3339
3340    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3341    for static check against estimated number of iterations.  */
3342
3343 static void
3344 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3345                                     int *ret_min_profitable_niters,
3346                                     int *ret_min_profitable_estimate)
3347 {
3348   int min_profitable_iters;
3349   int min_profitable_estimate;
3350   int peel_iters_prologue;
3351   int peel_iters_epilogue;
3352   unsigned vec_inside_cost = 0;
3353   int vec_outside_cost = 0;
3354   unsigned vec_prologue_cost = 0;
3355   unsigned vec_epilogue_cost = 0;
3356   int scalar_single_iter_cost = 0;
3357   int scalar_outside_cost = 0;
3358   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3359   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3360   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3361
3362   /* Cost model disabled.  */
3363   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3364     {
3365       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3366       *ret_min_profitable_niters = 0;
3367       *ret_min_profitable_estimate = 0;
3368       return;
3369     }
3370
3371   /* Requires loop versioning tests to handle misalignment.  */
3372   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3373     {
3374       /*  FIXME: Make cost depend on complexity of individual check.  */
3375       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3376       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3377                             vect_prologue);
3378       dump_printf (MSG_NOTE,
3379                    "cost model: Adding cost of checks for loop "
3380                    "versioning to treat misalignment.\n");
3381     }
3382
3383   /* Requires loop versioning with alias checks.  */
3384   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3385     {
3386       /*  FIXME: Make cost depend on complexity of individual check.  */
3387       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3388       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3389                             vect_prologue);
3390       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3391       if (len)
3392         /* Count LEN - 1 ANDs and LEN comparisons.  */
3393         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3394                               NULL, 0, vect_prologue);
3395       dump_printf (MSG_NOTE,
3396                    "cost model: Adding cost of checks for loop "
3397                    "versioning aliasing.\n");
3398     }
3399
3400   /* Requires loop versioning with niter checks.  */
3401   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3402     {
3403       /*  FIXME: Make cost depend on complexity of individual check.  */
3404       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3405                             vect_prologue);
3406       dump_printf (MSG_NOTE,
3407                    "cost model: Adding cost of checks for loop "
3408                    "versioning niters.\n");
3409     }
3410
3411   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3412     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3413                           vect_prologue);
3414
3415   /* Count statements in scalar loop.  Using this as scalar cost for a single
3416      iteration for now.
3417
3418      TODO: Add outer loop support.
3419
3420      TODO: Consider assigning different costs to different scalar
3421      statements.  */
3422
3423   scalar_single_iter_cost
3424     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3425
3426   /* Add additional cost for the peeled instructions in prologue and epilogue
3427      loop.
3428
3429      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3430      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3431
3432      TODO: Build an expression that represents peel_iters for prologue and
3433      epilogue to be used in a run-time test.  */
3434
3435   if (npeel  < 0)
3436     {
3437       peel_iters_prologue = vf/2;
3438       dump_printf (MSG_NOTE, "cost model: "
3439                    "prologue peel iters set to vf/2.\n");
3440
3441       /* If peeling for alignment is unknown, loop bound of main loop becomes
3442          unknown.  */
3443       peel_iters_epilogue = vf/2;
3444       dump_printf (MSG_NOTE, "cost model: "
3445                    "epilogue peel iters set to vf/2 because "
3446                    "peeling for alignment is unknown.\n");
3447
3448       /* If peeled iterations are unknown, count a taken branch and a not taken
3449          branch per peeled loop. Even if scalar loop iterations are known,
3450          vector iterations are not known since peeled prologue iterations are
3451          not known. Hence guards remain the same.  */
3452       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3453                             NULL, 0, vect_prologue);
3454       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3455                             NULL, 0, vect_prologue);
3456       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3457                             NULL, 0, vect_epilogue);
3458       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3459                             NULL, 0, vect_epilogue);
3460       stmt_info_for_cost *si;
3461       int j;
3462       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3463         {
3464           struct _stmt_vec_info *stmt_info
3465             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3466           (void) add_stmt_cost (target_cost_data,
3467                                 si->count * peel_iters_prologue,
3468                                 si->kind, stmt_info, si->misalign,
3469                                 vect_prologue);
3470           (void) add_stmt_cost (target_cost_data,
3471                                 si->count * peel_iters_epilogue,
3472                                 si->kind, stmt_info, si->misalign,
3473                                 vect_epilogue);
3474         }
3475     }
3476   else
3477     {
3478       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3479       stmt_info_for_cost *si;
3480       int j;
3481       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3482
3483       prologue_cost_vec.create (2);
3484       epilogue_cost_vec.create (2);
3485       peel_iters_prologue = npeel;
3486
3487       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3488                                           &peel_iters_epilogue,
3489                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3490                                             (loop_vinfo),
3491                                           &prologue_cost_vec,
3492                                           &epilogue_cost_vec);
3493
3494       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3495         {
3496           struct _stmt_vec_info *stmt_info
3497             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3498           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3499                                 si->misalign, vect_prologue);
3500         }
3501
3502       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3503         {
3504           struct _stmt_vec_info *stmt_info
3505             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3506           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3507                                 si->misalign, vect_epilogue);
3508         }
3509
3510       prologue_cost_vec.release ();
3511       epilogue_cost_vec.release ();
3512     }
3513
3514   /* FORNOW: The scalar outside cost is incremented in one of the
3515      following ways:
3516
3517      1. The vectorizer checks for alignment and aliasing and generates
3518      a condition that allows dynamic vectorization.  A cost model
3519      check is ANDED with the versioning condition.  Hence scalar code
3520      path now has the added cost of the versioning check.
3521
3522        if (cost > th & versioning_check)
3523          jmp to vector code
3524
3525      Hence run-time scalar is incremented by not-taken branch cost.
3526
3527      2. The vectorizer then checks if a prologue is required.  If the
3528      cost model check was not done before during versioning, it has to
3529      be done before the prologue check.
3530
3531        if (cost <= th)
3532          prologue = scalar_iters
3533        if (prologue == 0)
3534          jmp to vector code
3535        else
3536          execute prologue
3537        if (prologue == num_iters)
3538          go to exit
3539
3540      Hence the run-time scalar cost is incremented by a taken branch,
3541      plus a not-taken branch, plus a taken branch cost.
3542
3543      3. The vectorizer then checks if an epilogue is required.  If the
3544      cost model check was not done before during prologue check, it
3545      has to be done with the epilogue check.
3546
3547        if (prologue == 0)
3548          jmp to vector code
3549        else
3550          execute prologue
3551        if (prologue == num_iters)
3552          go to exit
3553        vector code:
3554          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3555            jmp to epilogue
3556
3557      Hence the run-time scalar cost should be incremented by 2 taken
3558      branches.
3559
3560      TODO: The back end may reorder the BBS's differently and reverse
3561      conditions/branch directions.  Change the estimates below to
3562      something more reasonable.  */
3563
3564   /* If the number of iterations is known and we do not do versioning, we can
3565      decide whether to vectorize at compile time.  Hence the scalar version
3566      do not carry cost model guard costs.  */
3567   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3568       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3569     {
3570       /* Cost model check occurs at versioning.  */
3571       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3572         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3573       else
3574         {
3575           /* Cost model check occurs at prologue generation.  */
3576           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3577             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3578               + vect_get_stmt_cost (cond_branch_not_taken);
3579           /* Cost model check occurs at epilogue generation.  */
3580           else
3581             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3582         }
3583     }
3584
3585   /* Complete the target-specific cost calculations.  */
3586   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3587                &vec_inside_cost, &vec_epilogue_cost);
3588
3589   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3590
3591   if (dump_enabled_p ())
3592     {
3593       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3594       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3595                    vec_inside_cost);
3596       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3597                    vec_prologue_cost);
3598       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3599                    vec_epilogue_cost);
3600       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3601                    scalar_single_iter_cost);
3602       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3603                    scalar_outside_cost);
3604       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3605                    vec_outside_cost);
3606       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3607                    peel_iters_prologue);
3608       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3609                    peel_iters_epilogue);
3610     }
3611
3612   /* Calculate number of iterations required to make the vector version
3613      profitable, relative to the loop bodies only.  The following condition
3614      must hold true:
3615      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3616      where
3617      SIC = scalar iteration cost, VIC = vector iteration cost,
3618      VOC = vector outside cost, VF = vectorization factor,
3619      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3620      SOC = scalar outside cost for run time cost model check.  */
3621
3622   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3623     {
3624       if (vec_outside_cost <= 0)
3625         min_profitable_iters = 0;
3626       else
3627         {
3628           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3629                                   - vec_inside_cost * peel_iters_prologue
3630                                   - vec_inside_cost * peel_iters_epilogue)
3631                                  / ((scalar_single_iter_cost * vf)
3632                                     - vec_inside_cost);
3633
3634           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3635               <= (((int) vec_inside_cost * min_profitable_iters)
3636                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3637             min_profitable_iters++;
3638         }
3639     }
3640   /* vector version will never be profitable.  */
3641   else
3642     {
3643       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3644         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3645                     "did not happen for a simd loop");
3646
3647       if (dump_enabled_p ())
3648         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3649                          "cost model: the vector iteration cost = %d "
3650                          "divided by the scalar iteration cost = %d "
3651                          "is greater or equal to the vectorization factor = %d"
3652                          ".\n",
3653                          vec_inside_cost, scalar_single_iter_cost, vf);
3654       *ret_min_profitable_niters = -1;
3655       *ret_min_profitable_estimate = -1;
3656       return;
3657     }
3658
3659   dump_printf (MSG_NOTE,
3660                "  Calculated minimum iters for profitability: %d\n",
3661                min_profitable_iters);
3662
3663   /* We want the vectorized loop to execute at least once.  */
3664   if (min_profitable_iters < (vf + peel_iters_prologue + peel_iters_epilogue))
3665     min_profitable_iters = vf + peel_iters_prologue + peel_iters_epilogue;
3666
3667   if (dump_enabled_p ())
3668     dump_printf_loc (MSG_NOTE, vect_location,
3669                      "  Runtime profitability threshold = %d\n",
3670                      min_profitable_iters);
3671
3672   *ret_min_profitable_niters = min_profitable_iters;
3673
3674   /* Calculate number of iterations required to make the vector version
3675      profitable, relative to the loop bodies only.
3676
3677      Non-vectorized variant is SIC * niters and it must win over vector
3678      variant on the expected loop trip count.  The following condition must hold true:
3679      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3680
3681   if (vec_outside_cost <= 0)
3682     min_profitable_estimate = 0;
3683   else
3684     {
3685       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3686                                  - vec_inside_cost * peel_iters_prologue
3687                                  - vec_inside_cost * peel_iters_epilogue)
3688                                  / ((scalar_single_iter_cost * vf)
3689                                    - vec_inside_cost);
3690     }
3691   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3692   if (dump_enabled_p ())
3693     dump_printf_loc (MSG_NOTE, vect_location,
3694                      "  Static estimate profitability threshold = %d\n",
3695                      min_profitable_estimate);
3696
3697   *ret_min_profitable_estimate = min_profitable_estimate;
3698 }
3699
3700 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3701    vector elements (not bits) for a vector of mode MODE.  */
3702 static void
3703 calc_vec_perm_mask_for_shift (machine_mode mode, unsigned int offset,
3704                               unsigned char *sel)
3705 {
3706   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3707
3708   for (i = 0; i < nelt; i++)
3709     sel[i] = (i + offset) & (2*nelt - 1);
3710 }
3711
3712 /* Checks whether the target supports whole-vector shifts for vectors of mode
3713    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3714    it supports vec_perm_const with masks for all necessary shift amounts.  */
3715 static bool
3716 have_whole_vector_shift (machine_mode mode)
3717 {
3718   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3719     return true;
3720
3721   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3722     return false;
3723
3724   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3725   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3726
3727   for (i = nelt/2; i >= 1; i/=2)
3728     {
3729       calc_vec_perm_mask_for_shift (mode, i, sel);
3730       if (!can_vec_perm_p (mode, false, sel))
3731         return false;
3732     }
3733   return true;
3734 }
3735
3736 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3737    functions. Design better to avoid maintenance issues.  */
3738
3739 /* Function vect_model_reduction_cost.
3740
3741    Models cost for a reduction operation, including the vector ops
3742    generated within the strip-mine loop, the initial definition before
3743    the loop, and the epilogue code that must be generated.  */
3744
3745 static void
3746 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3747                            int ncopies)
3748 {
3749   int prologue_cost = 0, epilogue_cost = 0;
3750   enum tree_code code;
3751   optab optab;
3752   tree vectype;
3753   gimple *orig_stmt;
3754   machine_mode mode;
3755   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3756   struct loop *loop = NULL;
3757   void *target_cost_data;
3758
3759   if (loop_vinfo)
3760     {
3761       loop = LOOP_VINFO_LOOP (loop_vinfo);
3762       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3763     }
3764   else
3765     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3766
3767   /* Condition reductions generate two reductions in the loop.  */
3768   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3769     ncopies *= 2;
3770
3771   /* Cost of reduction op inside loop.  */
3772   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3773                                         stmt_info, 0, vect_body);
3774
3775   vectype = STMT_VINFO_VECTYPE (stmt_info);
3776   mode = TYPE_MODE (vectype);
3777   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3778
3779   if (!orig_stmt)
3780     orig_stmt = STMT_VINFO_STMT (stmt_info);
3781
3782   code = gimple_assign_rhs_code (orig_stmt);
3783
3784   /* Add in cost for initial definition.
3785      For cond reduction we have four vectors: initial index, step, initial
3786      result of the data reduction, initial value of the index reduction.  */
3787   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3788                        == COND_REDUCTION ? 4 : 1;
3789   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3790                                   scalar_to_vec, stmt_info, 0,
3791                                   vect_prologue);
3792
3793   /* Determine cost of epilogue code.
3794
3795      We have a reduction operator that will reduce the vector in one statement.
3796      Also requires scalar extract.  */
3797
3798   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3799     {
3800       if (reduc_code != ERROR_MARK)
3801         {
3802           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3803             {
3804               /* An EQ stmt and an COND_EXPR stmt.  */
3805               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3806                                               vector_stmt, stmt_info, 0,
3807                                               vect_epilogue);
3808               /* Reduction of the max index and a reduction of the found
3809                  values.  */
3810               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3811                                               vec_to_scalar, stmt_info, 0,
3812                                               vect_epilogue);
3813               /* A broadcast of the max value.  */
3814               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3815                                               scalar_to_vec, stmt_info, 0,
3816                                               vect_epilogue);
3817             }
3818           else
3819             {
3820               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3821                                               stmt_info, 0, vect_epilogue);
3822               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3823                                               vec_to_scalar, stmt_info, 0,
3824                                               vect_epilogue);
3825             }
3826         }
3827       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3828         {
3829           unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
3830           /* Extraction of scalar elements.  */
3831           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits,
3832                                           vec_to_scalar, stmt_info, 0,
3833                                           vect_epilogue);
3834           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3835           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits - 3,
3836                                           scalar_stmt, stmt_info, 0,
3837                                           vect_epilogue);
3838         }
3839       else
3840         {
3841           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3842           tree bitsize =
3843             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3844           int element_bitsize = tree_to_uhwi (bitsize);
3845           int nelements = vec_size_in_bits / element_bitsize;
3846
3847           if (code == COND_EXPR)
3848             code = MAX_EXPR;
3849
3850           optab = optab_for_tree_code (code, vectype, optab_default);
3851
3852           /* We have a whole vector shift available.  */
3853           if (optab != unknown_optab
3854               && VECTOR_MODE_P (mode)
3855               && optab_handler (optab, mode) != CODE_FOR_nothing
3856               && have_whole_vector_shift (mode))
3857             {
3858               /* Final reduction via vector shifts and the reduction operator.
3859                  Also requires scalar extract.  */
3860               epilogue_cost += add_stmt_cost (target_cost_data,
3861                                               exact_log2 (nelements) * 2,
3862                                               vector_stmt, stmt_info, 0,
3863                                               vect_epilogue);
3864               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3865                                               vec_to_scalar, stmt_info, 0,
3866                                               vect_epilogue);
3867             }
3868           else
3869             /* Use extracts and reduction op for final reduction.  For N
3870                elements, we have N extracts and N-1 reduction ops.  */
3871             epilogue_cost += add_stmt_cost (target_cost_data,
3872                                             nelements + nelements - 1,
3873                                             vector_stmt, stmt_info, 0,
3874                                             vect_epilogue);
3875         }
3876     }
3877
3878   if (dump_enabled_p ())
3879     dump_printf (MSG_NOTE,
3880                  "vect_model_reduction_cost: inside_cost = %d, "
3881                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3882                  prologue_cost, epilogue_cost);
3883 }
3884
3885
3886 /* Function vect_model_induction_cost.
3887
3888    Models cost for induction operations.  */
3889
3890 static void
3891 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3892 {
3893   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3894   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3895   unsigned inside_cost, prologue_cost;
3896
3897   if (PURE_SLP_STMT (stmt_info))
3898     return;
3899
3900   /* loop cost for vec_loop.  */
3901   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3902                                stmt_info, 0, vect_body);
3903
3904   /* prologue cost for vec_init and vec_step.  */
3905   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3906                                  stmt_info, 0, vect_prologue);
3907
3908   if (dump_enabled_p ())
3909     dump_printf_loc (MSG_NOTE, vect_location,
3910                      "vect_model_induction_cost: inside_cost = %d, "
3911                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3912 }
3913
3914
3915
3916 /* Function get_initial_def_for_reduction
3917
3918    Input:
3919    STMT - a stmt that performs a reduction operation in the loop.
3920    INIT_VAL - the initial value of the reduction variable
3921
3922    Output:
3923    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3924         of the reduction (used for adjusting the epilog - see below).
3925    Return a vector variable, initialized according to the operation that STMT
3926         performs. This vector will be used as the initial value of the
3927         vector of partial results.
3928
3929    Option1 (adjust in epilog): Initialize the vector as follows:
3930      add/bit or/xor:    [0,0,...,0,0]
3931      mult/bit and:      [1,1,...,1,1]
3932      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3933    and when necessary (e.g. add/mult case) let the caller know
3934    that it needs to adjust the result by init_val.
3935
3936    Option2: Initialize the vector as follows:
3937      add/bit or/xor:    [init_val,0,0,...,0]
3938      mult/bit and:      [init_val,1,1,...,1]
3939      min/max/cond_expr: [init_val,init_val,...,init_val]
3940    and no adjustments are needed.
3941
3942    For example, for the following code:
3943
3944    s = init_val;
3945    for (i=0;i<n;i++)
3946      s = s + a[i];
3947
3948    STMT is 's = s + a[i]', and the reduction variable is 's'.
3949    For a vector of 4 units, we want to return either [0,0,0,init_val],
3950    or [0,0,0,0] and let the caller know that it needs to adjust
3951    the result at the end by 'init_val'.
3952
3953    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3954    initialization vector is simpler (same element in all entries), if
3955    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3956
3957    A cost model should help decide between these two schemes.  */
3958
3959 tree
3960 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3961                                tree *adjustment_def)
3962 {
3963   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3964   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3965   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3966   tree scalar_type = TREE_TYPE (init_val);
3967   tree vectype = get_vectype_for_scalar_type (scalar_type);
3968   int nunits;
3969   enum tree_code code = gimple_assign_rhs_code (stmt);
3970   tree def_for_init;
3971   tree init_def;
3972   tree *elts;
3973   int i;
3974   bool nested_in_vect_loop = false;
3975   REAL_VALUE_TYPE real_init_val = dconst0;
3976   int int_init_val = 0;
3977   gimple *def_stmt = NULL;
3978   gimple_seq stmts = NULL;
3979
3980   gcc_assert (vectype);
3981   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3982
3983   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3984               || SCALAR_FLOAT_TYPE_P (scalar_type));
3985
3986   if (nested_in_vect_loop_p (loop, stmt))
3987     nested_in_vect_loop = true;
3988   else
3989     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3990
3991   /* In case of double reduction we only create a vector variable to be put
3992      in the reduction phi node.  The actual statement creation is done in
3993      vect_create_epilog_for_reduction.  */
3994   if (adjustment_def && nested_in_vect_loop
3995       && TREE_CODE (init_val) == SSA_NAME
3996       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3997       && gimple_code (def_stmt) == GIMPLE_PHI
3998       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3999       && vinfo_for_stmt (def_stmt)
4000       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4001           == vect_double_reduction_def)
4002     {
4003       *adjustment_def = NULL;
4004       return vect_create_destination_var (init_val, vectype);
4005     }
4006
4007   /* In case of a nested reduction do not use an adjustment def as
4008      that case is not supported by the epilogue generation correctly
4009      if ncopies is not one.  */
4010   if (adjustment_def && nested_in_vect_loop)
4011     {
4012       *adjustment_def = NULL;
4013       return vect_get_vec_def_for_operand (init_val, stmt);
4014     }
4015
4016   switch (code)
4017     {
4018       case WIDEN_SUM_EXPR:
4019       case DOT_PROD_EXPR:
4020       case SAD_EXPR:
4021       case PLUS_EXPR:
4022       case MINUS_EXPR:
4023       case BIT_IOR_EXPR:
4024       case BIT_XOR_EXPR:
4025       case MULT_EXPR:
4026       case BIT_AND_EXPR:
4027         /* ADJUSMENT_DEF is NULL when called from
4028            vect_create_epilog_for_reduction to vectorize double reduction.  */
4029         if (adjustment_def)
4030           *adjustment_def = init_val;
4031
4032         if (code == MULT_EXPR)
4033           {
4034             real_init_val = dconst1;
4035             int_init_val = 1;
4036           }
4037
4038         if (code == BIT_AND_EXPR)
4039           int_init_val = -1;
4040
4041         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4042           def_for_init = build_real (scalar_type, real_init_val);
4043         else
4044           def_for_init = build_int_cst (scalar_type, int_init_val);
4045
4046         /* Create a vector of '0' or '1' except the first element.  */
4047         elts = XALLOCAVEC (tree, nunits);
4048         for (i = nunits - 2; i >= 0; --i)
4049           elts[i + 1] = def_for_init;
4050
4051         /* Option1: the first element is '0' or '1' as well.  */
4052         if (adjustment_def)
4053           {
4054             elts[0] = def_for_init;
4055             init_def = build_vector (vectype, elts);
4056             break;
4057           }
4058
4059         /* Option2: the first element is INIT_VAL.  */
4060         elts[0] = init_val;
4061         if (TREE_CONSTANT (init_val))
4062           init_def = build_vector (vectype, elts);
4063         else
4064           {
4065             vec<constructor_elt, va_gc> *v;
4066             vec_alloc (v, nunits);
4067             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4068             for (i = 1; i < nunits; ++i)
4069               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4070             init_def = build_constructor (vectype, v);
4071           }
4072
4073         break;
4074
4075       case MIN_EXPR:
4076       case MAX_EXPR:
4077       case COND_EXPR:
4078         if (adjustment_def)
4079           {
4080             *adjustment_def = NULL_TREE;
4081             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4082               {
4083                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4084                 break;
4085               }
4086           }
4087         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4088         if (! gimple_seq_empty_p (stmts))
4089           gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4090         init_def = build_vector_from_val (vectype, init_val);
4091         break;
4092
4093       default:
4094         gcc_unreachable ();
4095     }
4096
4097   return init_def;
4098 }
4099
4100 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4101    NUMBER_OF_VECTORS is the number of vector defs to create.  */
4102
4103 static void
4104 get_initial_defs_for_reduction (slp_tree slp_node,
4105                                 vec<tree> *vec_oprnds,
4106                                 unsigned int number_of_vectors,
4107                                 enum tree_code code, bool reduc_chain)
4108 {
4109   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4110   gimple *stmt = stmts[0];
4111   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4112   unsigned nunits;
4113   tree vec_cst;
4114   tree *elts;
4115   unsigned j, number_of_places_left_in_vector;
4116   tree vector_type, scalar_type;
4117   tree vop;
4118   int group_size = stmts.length ();
4119   unsigned int vec_num, i;
4120   unsigned number_of_copies = 1;
4121   vec<tree> voprnds;
4122   voprnds.create (number_of_vectors);
4123   bool constant_p;
4124   tree neutral_op = NULL;
4125   struct loop *loop;
4126   gimple_seq ctor_seq = NULL;
4127
4128   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4129   scalar_type = TREE_TYPE (vector_type);
4130   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4131
4132   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4133
4134   loop = (gimple_bb (stmt))->loop_father;
4135   gcc_assert (loop);
4136
4137   /* op is the reduction operand of the first stmt already.  */
4138   /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4139      we need either neutral operands or the original operands.  See
4140      get_initial_def_for_reduction() for details.  */
4141   switch (code)
4142     {
4143     case WIDEN_SUM_EXPR:
4144     case DOT_PROD_EXPR:
4145     case SAD_EXPR:
4146     case PLUS_EXPR:
4147     case MINUS_EXPR:
4148     case BIT_IOR_EXPR:
4149     case BIT_XOR_EXPR:
4150       neutral_op = build_zero_cst (scalar_type);
4151       break;
4152
4153     case MULT_EXPR:
4154       neutral_op = build_one_cst (scalar_type);
4155       break;
4156
4157     case BIT_AND_EXPR:
4158       neutral_op = build_all_ones_cst (scalar_type);
4159       break;
4160
4161     /* For MIN/MAX we don't have an easy neutral operand but
4162        the initial values can be used fine here.  Only for
4163        a reduction chain we have to force a neutral element.  */
4164     case MAX_EXPR:
4165     case MIN_EXPR:
4166       if (! reduc_chain)
4167         neutral_op = NULL;
4168       else
4169         neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt,
4170                                             loop_preheader_edge (loop));
4171       break;
4172
4173     default:
4174       gcc_assert (! reduc_chain);
4175       neutral_op = NULL;
4176     }
4177
4178   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4179      created vectors. It is greater than 1 if unrolling is performed.
4180
4181      For example, we have two scalar operands, s1 and s2 (e.g., group of
4182      strided accesses of size two), while NUNITS is four (i.e., four scalars
4183      of this type can be packed in a vector).  The output vector will contain
4184      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4185      will be 2).
4186
4187      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4188      containing the operands.
4189
4190      For example, NUNITS is four as before, and the group size is 8
4191      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4192      {s5, s6, s7, s8}.  */
4193
4194   number_of_copies = nunits * number_of_vectors / group_size;
4195
4196   number_of_places_left_in_vector = nunits;
4197   constant_p = true;
4198   elts = XALLOCAVEC (tree, nunits);
4199   for (j = 0; j < number_of_copies; j++)
4200     {
4201       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4202         {
4203           tree op;
4204           /* Get the def before the loop.  In reduction chain we have only
4205              one initial value.  */
4206           if ((j != (number_of_copies - 1)
4207                || (reduc_chain && i != 0))
4208               && neutral_op)
4209             op = neutral_op;
4210           else
4211             op = PHI_ARG_DEF_FROM_EDGE (stmt,
4212                                         loop_preheader_edge (loop));
4213
4214           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4215           number_of_places_left_in_vector--;
4216           elts[number_of_places_left_in_vector] = op;
4217           if (!CONSTANT_CLASS_P (op))
4218             constant_p = false;
4219
4220           if (number_of_places_left_in_vector == 0)
4221             {
4222               if (constant_p)
4223                 vec_cst = build_vector (vector_type, elts);
4224               else
4225                 {
4226                   vec<constructor_elt, va_gc> *v;
4227                   unsigned k;
4228                   vec_alloc (v, nunits);
4229                   for (k = 0; k < nunits; ++k)
4230                     CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
4231                   vec_cst = build_constructor (vector_type, v);
4232                 }
4233               tree init;
4234               gimple_stmt_iterator gsi;
4235               init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
4236               if (ctor_seq != NULL)
4237                 {
4238                   gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
4239                   gsi_insert_seq_before_without_update (&gsi, ctor_seq,
4240                                                         GSI_SAME_STMT);
4241                   ctor_seq = NULL;
4242                 }
4243               voprnds.quick_push (init);
4244
4245               number_of_places_left_in_vector = nunits;
4246               constant_p = true;
4247             }
4248         }
4249     }
4250
4251   /* Since the vectors are created in the reverse order, we should invert
4252      them.  */
4253   vec_num = voprnds.length ();
4254   for (j = vec_num; j != 0; j--)
4255     {
4256       vop = voprnds[j - 1];
4257       vec_oprnds->quick_push (vop);
4258     }
4259
4260   voprnds.release ();
4261
4262   /* In case that VF is greater than the unrolling factor needed for the SLP
4263      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4264      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4265      to replicate the vectors.  */
4266   while (number_of_vectors > vec_oprnds->length ())
4267     {
4268       tree neutral_vec = NULL;
4269
4270       if (neutral_op)
4271         {
4272           if (!neutral_vec)
4273             neutral_vec = build_vector_from_val (vector_type, neutral_op);
4274
4275           vec_oprnds->quick_push (neutral_vec);
4276         }
4277       else
4278         {
4279           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4280             vec_oprnds->quick_push (vop);
4281         }
4282     }
4283 }
4284
4285
4286 /* Function vect_create_epilog_for_reduction
4287
4288    Create code at the loop-epilog to finalize the result of a reduction
4289    computation.
4290
4291    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4292      reduction statements.
4293    STMT is the scalar reduction stmt that is being vectorized.
4294    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4295      number of elements that we can fit in a vectype (nunits).  In this case
4296      we have to generate more than one vector stmt - i.e - we need to "unroll"
4297      the vector stmt by a factor VF/nunits.  For more details see documentation
4298      in vectorizable_operation.
4299    REDUC_CODE is the tree-code for the epilog reduction.
4300    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4301      computation.
4302    REDUC_INDEX is the index of the operand in the right hand side of the
4303      statement that is defined by REDUCTION_PHI.
4304    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4305    SLP_NODE is an SLP node containing a group of reduction statements. The
4306      first one in this group is STMT.
4307
4308    This function:
4309    1. Creates the reduction def-use cycles: sets the arguments for
4310       REDUCTION_PHIS:
4311       The loop-entry argument is the vectorized initial-value of the reduction.
4312       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4313       sums.
4314    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4315       by applying the operation specified by REDUC_CODE if available, or by
4316       other means (whole-vector shifts or a scalar loop).
4317       The function also creates a new phi node at the loop exit to preserve
4318       loop-closed form, as illustrated below.
4319
4320      The flow at the entry to this function:
4321
4322         loop:
4323           vec_def = phi <null, null>            # REDUCTION_PHI
4324           VECT_DEF = vector_stmt                # vectorized form of STMT
4325           s_loop = scalar_stmt                  # (scalar) STMT
4326         loop_exit:
4327           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4328           use <s_out0>
4329           use <s_out0>
4330
4331      The above is transformed by this function into:
4332
4333         loop:
4334           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4335           VECT_DEF = vector_stmt                # vectorized form of STMT
4336           s_loop = scalar_stmt                  # (scalar) STMT
4337         loop_exit:
4338           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4339           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4340           v_out2 = reduce <v_out1>
4341           s_out3 = extract_field <v_out2, 0>
4342           s_out4 = adjust_result <s_out3>
4343           use <s_out4>
4344           use <s_out4>
4345 */
4346
4347 static void
4348 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4349                                   gimple *reduc_def_stmt,
4350                                   int ncopies, enum tree_code reduc_code,
4351                                   vec<gimple *> reduction_phis,
4352                                   bool double_reduc,
4353                                   slp_tree slp_node,
4354                                   slp_instance slp_node_instance)
4355 {
4356   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4357   stmt_vec_info prev_phi_info;
4358   tree vectype;
4359   machine_mode mode;
4360   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4361   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4362   basic_block exit_bb;
4363   tree scalar_dest;
4364   tree scalar_type;
4365   gimple *new_phi = NULL, *phi;
4366   gimple_stmt_iterator exit_gsi;
4367   tree vec_dest;
4368   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4369   gimple *epilog_stmt = NULL;
4370   enum tree_code code = gimple_assign_rhs_code (stmt);
4371   gimple *exit_phi;
4372   tree bitsize;
4373   tree adjustment_def = NULL;
4374   tree vec_initial_def = NULL;
4375   tree expr, def, initial_def = NULL;
4376   tree orig_name, scalar_result;
4377   imm_use_iterator imm_iter, phi_imm_iter;
4378   use_operand_p use_p, phi_use_p;
4379   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4380   bool nested_in_vect_loop = false;
4381   auto_vec<gimple *> new_phis;
4382   auto_vec<gimple *> inner_phis;
4383   enum vect_def_type dt = vect_unknown_def_type;
4384   int j, i;
4385   auto_vec<tree> scalar_results;
4386   unsigned int group_size = 1, k, ratio;
4387   auto_vec<tree> vec_initial_defs;
4388   auto_vec<gimple *> phis;
4389   bool slp_reduc = false;
4390   tree new_phi_result;
4391   gimple *inner_phi = NULL;
4392   tree induction_index = NULL_TREE;
4393
4394   if (slp_node)
4395     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4396
4397   if (nested_in_vect_loop_p (loop, stmt))
4398     {
4399       outer_loop = loop;
4400       loop = loop->inner;
4401       nested_in_vect_loop = true;
4402       gcc_assert (!slp_node);
4403     }
4404
4405   vectype = STMT_VINFO_VECTYPE (stmt_info);
4406   gcc_assert (vectype);
4407   mode = TYPE_MODE (vectype);
4408
4409   /* 1. Create the reduction def-use cycle:
4410      Set the arguments of REDUCTION_PHIS, i.e., transform
4411
4412         loop:
4413           vec_def = phi <null, null>            # REDUCTION_PHI
4414           VECT_DEF = vector_stmt                # vectorized form of STMT
4415           ...
4416
4417      into:
4418
4419         loop:
4420           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4421           VECT_DEF = vector_stmt                # vectorized form of STMT
4422           ...
4423
4424      (in case of SLP, do it for all the phis). */
4425
4426   /* Get the loop-entry arguments.  */
4427   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4428   if (slp_node)
4429     {
4430       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4431       vec_initial_defs.reserve (vec_num);
4432       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4433                                       &vec_initial_defs, vec_num, code,
4434                                       GROUP_FIRST_ELEMENT (stmt_info));
4435     }
4436   else
4437     {
4438       /* Get at the scalar def before the loop, that defines the initial value
4439          of the reduction variable.  */
4440       gimple *def_stmt;
4441       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4442                                            loop_preheader_edge (loop));
4443       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4444       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4445                                                        &adjustment_def);
4446       vec_initial_defs.create (1);
4447       vec_initial_defs.quick_push (vec_initial_def);
4448     }
4449
4450   /* Set phi nodes arguments.  */
4451   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4452     {
4453       tree vec_init_def, def;
4454       gimple_seq stmts;
4455       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4456                                            true, NULL_TREE);
4457       if (stmts)
4458         gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4459
4460       def = vect_defs[i];
4461       for (j = 0; j < ncopies; j++)
4462         {
4463           if (j != 0)
4464             {
4465               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4466               if (nested_in_vect_loop)
4467                 vec_init_def
4468                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4469                                                     vec_init_def);
4470             }
4471
4472           /* Set the loop-entry arg of the reduction-phi.  */
4473
4474           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4475               == INTEGER_INDUC_COND_REDUCTION)
4476             {
4477               /* Initialise the reduction phi to zero.  This prevents initial
4478                  values of non-zero interferring with the reduction op.  */
4479               gcc_assert (ncopies == 1);
4480               gcc_assert (i == 0);
4481
4482               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4483               tree zero_vec = build_zero_cst (vec_init_def_type);
4484
4485               add_phi_arg (as_a <gphi *> (phi), zero_vec,
4486                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4487             }
4488           else
4489             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4490                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4491
4492           /* Set the loop-latch arg for the reduction-phi.  */
4493           if (j > 0)
4494             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4495
4496           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4497                        UNKNOWN_LOCATION);
4498
4499           if (dump_enabled_p ())
4500             {
4501               dump_printf_loc (MSG_NOTE, vect_location,
4502                                "transform reduction: created def-use cycle: ");
4503               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4504               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4505             }
4506         }
4507     }
4508
4509   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4510      which is updated with the current index of the loop for every match of
4511      the original loop's cond_expr (VEC_STMT).  This results in a vector
4512      containing the last time the condition passed for that vector lane.
4513      The first match will be a 1 to allow 0 to be used for non-matching
4514      indexes.  If there are no matches at all then the vector will be all
4515      zeroes.  */
4516   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4517     {
4518       tree indx_before_incr, indx_after_incr;
4519       int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4520       int k;
4521
4522       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4523       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4524
4525       int scalar_precision
4526         = GET_MODE_PRECISION (TYPE_MODE (TREE_TYPE (vectype)));
4527       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4528       tree cr_index_vector_type = build_vector_type
4529         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4530
4531       /* First we create a simple vector induction variable which starts
4532          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4533          vector size (STEP).  */
4534
4535       /* Create a {1,2,3,...} vector.  */
4536       tree *vtemp = XALLOCAVEC (tree, nunits_out);
4537       for (k = 0; k < nunits_out; ++k)
4538         vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
4539       tree series_vect = build_vector (cr_index_vector_type, vtemp);
4540
4541       /* Create a vector of the step value.  */
4542       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4543       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4544
4545       /* Create an induction variable.  */
4546       gimple_stmt_iterator incr_gsi;
4547       bool insert_after;
4548       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4549       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4550                  insert_after, &indx_before_incr, &indx_after_incr);
4551
4552       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4553          filled with zeros (VEC_ZERO).  */
4554
4555       /* Create a vector of 0s.  */
4556       tree zero = build_zero_cst (cr_index_scalar_type);
4557       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4558
4559       /* Create a vector phi node.  */
4560       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4561       new_phi = create_phi_node (new_phi_tree, loop->header);
4562       set_vinfo_for_stmt (new_phi,
4563                           new_stmt_vec_info (new_phi, loop_vinfo));
4564       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4565                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4566
4567       /* Now take the condition from the loops original cond_expr
4568          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4569          every match uses values from the induction variable
4570          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4571          (NEW_PHI_TREE).
4572          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4573          the new cond_expr (INDEX_COND_EXPR).  */
4574
4575       /* Duplicate the condition from vec_stmt.  */
4576       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4577
4578       /* Create a conditional, where the condition is taken from vec_stmt
4579          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4580          else is the phi (NEW_PHI_TREE).  */
4581       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4582                                      ccompare, indx_before_incr,
4583                                      new_phi_tree);
4584       induction_index = make_ssa_name (cr_index_vector_type);
4585       gimple *index_condition = gimple_build_assign (induction_index,
4586                                                      index_cond_expr);
4587       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4588       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4589                                                         loop_vinfo);
4590       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4591       set_vinfo_for_stmt (index_condition, index_vec_info);
4592
4593       /* Update the phi with the vec cond.  */
4594       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4595                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4596     }
4597
4598   /* 2. Create epilog code.
4599         The reduction epilog code operates across the elements of the vector
4600         of partial results computed by the vectorized loop.
4601         The reduction epilog code consists of:
4602
4603         step 1: compute the scalar result in a vector (v_out2)
4604         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4605         step 3: adjust the scalar result (s_out3) if needed.
4606
4607         Step 1 can be accomplished using one the following three schemes:
4608           (scheme 1) using reduc_code, if available.
4609           (scheme 2) using whole-vector shifts, if available.
4610           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4611                      combined.
4612
4613           The overall epilog code looks like this:
4614
4615           s_out0 = phi <s_loop>         # original EXIT_PHI
4616           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4617           v_out2 = reduce <v_out1>              # step 1
4618           s_out3 = extract_field <v_out2, 0>    # step 2
4619           s_out4 = adjust_result <s_out3>       # step 3
4620
4621           (step 3 is optional, and steps 1 and 2 may be combined).
4622           Lastly, the uses of s_out0 are replaced by s_out4.  */
4623
4624
4625   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4626          v_out1 = phi <VECT_DEF>
4627          Store them in NEW_PHIS.  */
4628
4629   exit_bb = single_exit (loop)->dest;
4630   prev_phi_info = NULL;
4631   new_phis.create (vect_defs.length ());
4632   FOR_EACH_VEC_ELT (vect_defs, i, def)
4633     {
4634       for (j = 0; j < ncopies; j++)
4635         {
4636           tree new_def = copy_ssa_name (def);
4637           phi = create_phi_node (new_def, exit_bb);
4638           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4639           if (j == 0)
4640             new_phis.quick_push (phi);
4641           else
4642             {
4643               def = vect_get_vec_def_for_stmt_copy (dt, def);
4644               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4645             }
4646
4647           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4648           prev_phi_info = vinfo_for_stmt (phi);
4649         }
4650     }
4651
4652   /* The epilogue is created for the outer-loop, i.e., for the loop being
4653      vectorized.  Create exit phis for the outer loop.  */
4654   if (double_reduc)
4655     {
4656       loop = outer_loop;
4657       exit_bb = single_exit (loop)->dest;
4658       inner_phis.create (vect_defs.length ());
4659       FOR_EACH_VEC_ELT (new_phis, i, phi)
4660         {
4661           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4662           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4663           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4664                            PHI_RESULT (phi));
4665           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4666                                                             loop_vinfo));
4667           inner_phis.quick_push (phi);
4668           new_phis[i] = outer_phi;
4669           prev_phi_info = vinfo_for_stmt (outer_phi);
4670           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4671             {
4672               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4673               new_result = copy_ssa_name (PHI_RESULT (phi));
4674               outer_phi = create_phi_node (new_result, exit_bb);
4675               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4676                                PHI_RESULT (phi));
4677               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4678                                                                 loop_vinfo));
4679               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4680               prev_phi_info = vinfo_for_stmt (outer_phi);
4681             }
4682         }
4683     }
4684
4685   exit_gsi = gsi_after_labels (exit_bb);
4686
4687   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4688          (i.e. when reduc_code is not available) and in the final adjustment
4689          code (if needed).  Also get the original scalar reduction variable as
4690          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4691          represents a reduction pattern), the tree-code and scalar-def are
4692          taken from the original stmt that the pattern-stmt (STMT) replaces.
4693          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4694          are taken from STMT.  */
4695
4696   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4697   if (!orig_stmt)
4698     {
4699       /* Regular reduction  */
4700       orig_stmt = stmt;
4701     }
4702   else
4703     {
4704       /* Reduction pattern  */
4705       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4706       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4707       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4708     }
4709
4710   code = gimple_assign_rhs_code (orig_stmt);
4711   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4712      partial results are added and not subtracted.  */
4713   if (code == MINUS_EXPR)
4714     code = PLUS_EXPR;
4715
4716   scalar_dest = gimple_assign_lhs (orig_stmt);
4717   scalar_type = TREE_TYPE (scalar_dest);
4718   scalar_results.create (group_size);
4719   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4720   bitsize = TYPE_SIZE (scalar_type);
4721
4722   /* In case this is a reduction in an inner-loop while vectorizing an outer
4723      loop - we don't need to extract a single scalar result at the end of the
4724      inner-loop (unless it is double reduction, i.e., the use of reduction is
4725      outside the outer-loop).  The final vector of partial results will be used
4726      in the vectorized outer-loop, or reduced to a scalar result at the end of
4727      the outer-loop.  */
4728   if (nested_in_vect_loop && !double_reduc)
4729     goto vect_finalize_reduction;
4730
4731   /* SLP reduction without reduction chain, e.g.,
4732      # a1 = phi <a2, a0>
4733      # b1 = phi <b2, b0>
4734      a2 = operation (a1)
4735      b2 = operation (b1)  */
4736   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4737
4738   /* In case of reduction chain, e.g.,
4739      # a1 = phi <a3, a0>
4740      a2 = operation (a1)
4741      a3 = operation (a2),
4742
4743      we may end up with more than one vector result.  Here we reduce them to
4744      one vector.  */
4745   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4746     {
4747       tree first_vect = PHI_RESULT (new_phis[0]);
4748       gassign *new_vec_stmt = NULL;
4749       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4750       for (k = 1; k < new_phis.length (); k++)
4751         {
4752           gimple *next_phi = new_phis[k];
4753           tree second_vect = PHI_RESULT (next_phi);
4754           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4755           new_vec_stmt = gimple_build_assign (tem, code,
4756                                               first_vect, second_vect);
4757           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4758           first_vect = tem;
4759         }
4760
4761       new_phi_result = first_vect;
4762       if (new_vec_stmt)
4763         {
4764           new_phis.truncate (0);
4765           new_phis.safe_push (new_vec_stmt);
4766         }
4767     }
4768   /* Likewise if we couldn't use a single defuse cycle.  */
4769   else if (ncopies > 1)
4770     {
4771       gcc_assert (new_phis.length () == 1);
4772       tree first_vect = PHI_RESULT (new_phis[0]);
4773       gassign *new_vec_stmt = NULL;
4774       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4775       gimple *next_phi = new_phis[0];
4776       for (int k = 1; k < ncopies; ++k)
4777         {
4778           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4779           tree second_vect = PHI_RESULT (next_phi);
4780           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4781           new_vec_stmt = gimple_build_assign (tem, code,
4782                                               first_vect, second_vect);
4783           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4784           first_vect = tem;
4785         }
4786       new_phi_result = first_vect;
4787       new_phis.truncate (0);
4788       new_phis.safe_push (new_vec_stmt);
4789     }
4790   else
4791     new_phi_result = PHI_RESULT (new_phis[0]);
4792
4793   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4794       && reduc_code != ERROR_MARK)
4795     {
4796       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4797          various data values where the condition matched and another vector
4798          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4799          need to extract the last matching index (which will be the index with
4800          highest value) and use this to index into the data vector.
4801          For the case where there were no matches, the data vector will contain
4802          all default values and the index vector will be all zeros.  */
4803
4804       /* Get various versions of the type of the vector of indexes.  */
4805       tree index_vec_type = TREE_TYPE (induction_index);
4806       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4807       tree index_scalar_type = TREE_TYPE (index_vec_type);
4808       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4809         (index_vec_type);
4810
4811       /* Get an unsigned integer version of the type of the data vector.  */
4812       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4813       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4814       tree vectype_unsigned = build_vector_type
4815         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4816
4817       /* First we need to create a vector (ZERO_VEC) of zeros and another
4818          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4819          can create using a MAX reduction and then expanding.
4820          In the case where the loop never made any matches, the max index will
4821          be zero.  */
4822
4823       /* Vector of {0, 0, 0,...}.  */
4824       tree zero_vec = make_ssa_name (vectype);
4825       tree zero_vec_rhs = build_zero_cst (vectype);
4826       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4827       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4828
4829       /* Find maximum value from the vector of found indexes.  */
4830       tree max_index = make_ssa_name (index_scalar_type);
4831       gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4832                                                     induction_index);
4833       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4834
4835       /* Vector of {max_index, max_index, max_index,...}.  */
4836       tree max_index_vec = make_ssa_name (index_vec_type);
4837       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4838                                                       max_index);
4839       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4840                                                         max_index_vec_rhs);
4841       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4842
4843       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4844          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4845          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4846          otherwise.  Only one value should match, resulting in a vector
4847          (VEC_COND) with one data value and the rest zeros.
4848          In the case where the loop never made any matches, every index will
4849          match, resulting in a vector with all data values (which will all be
4850          the default value).  */
4851
4852       /* Compare the max index vector to the vector of found indexes to find
4853          the position of the max value.  */
4854       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4855       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4856                                                       induction_index,
4857                                                       max_index_vec);
4858       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4859
4860       /* Use the compare to choose either values from the data vector or
4861          zero.  */
4862       tree vec_cond = make_ssa_name (vectype);
4863       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4864                                                    vec_compare, new_phi_result,
4865                                                    zero_vec);
4866       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4867
4868       /* Finally we need to extract the data value from the vector (VEC_COND)
4869          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
4870          reduction, but because this doesn't exist, we can use a MAX reduction
4871          instead.  The data value might be signed or a float so we need to cast
4872          it first.
4873          In the case where the loop never made any matches, the data values are
4874          all identical, and so will reduce down correctly.  */
4875
4876       /* Make the matched data values unsigned.  */
4877       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4878       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4879                                        vec_cond);
4880       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4881                                                         VIEW_CONVERT_EXPR,
4882                                                         vec_cond_cast_rhs);
4883       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4884
4885       /* Reduce down to a scalar value.  */
4886       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4887       optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4888                                       optab_default);
4889       gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4890                   != CODE_FOR_nothing);
4891       gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4892                                                      REDUC_MAX_EXPR,
4893                                                      vec_cond_cast);
4894       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4895
4896       /* Convert the reduced value back to the result type and set as the
4897          result.  */
4898       gimple_seq stmts = NULL;
4899       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4900                                data_reduc);
4901       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4902       scalar_results.safe_push (new_temp);
4903     }
4904   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4905            && reduc_code == ERROR_MARK)
4906     {
4907       /* Condition redution without supported REDUC_MAX_EXPR.  Generate
4908          idx = 0;
4909          idx_val = induction_index[0];
4910          val = data_reduc[0];
4911          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4912            if (induction_index[i] > idx_val)
4913              val = data_reduc[i], idx_val = induction_index[i];
4914          return val;  */
4915
4916       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4917       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4918       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4919       unsigned HOST_WIDE_INT v_size
4920         = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4921       tree idx_val = NULL_TREE, val = NULL_TREE;
4922       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4923         {
4924           tree old_idx_val = idx_val;
4925           tree old_val = val;
4926           idx_val = make_ssa_name (idx_eltype);
4927           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4928                                              build3 (BIT_FIELD_REF, idx_eltype,
4929                                                      induction_index,
4930                                                      bitsize_int (el_size),
4931                                                      bitsize_int (off)));
4932           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4933           val = make_ssa_name (data_eltype);
4934           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4935                                              build3 (BIT_FIELD_REF,
4936                                                      data_eltype,
4937                                                      new_phi_result,
4938                                                      bitsize_int (el_size),
4939                                                      bitsize_int (off)));
4940           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4941           if (off != 0)
4942             {
4943               tree new_idx_val = idx_val;
4944               tree new_val = val;
4945               if (off != v_size - el_size)
4946                 {
4947                   new_idx_val = make_ssa_name (idx_eltype);
4948                   epilog_stmt = gimple_build_assign (new_idx_val,
4949                                                      MAX_EXPR, idx_val,
4950                                                      old_idx_val);
4951                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4952                 }
4953               new_val = make_ssa_name (data_eltype);
4954               epilog_stmt = gimple_build_assign (new_val,
4955                                                  COND_EXPR,
4956                                                  build2 (GT_EXPR,
4957                                                          boolean_type_node,
4958                                                          idx_val,
4959                                                          old_idx_val),
4960                                                  val, old_val);
4961               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4962               idx_val = new_idx_val;
4963               val = new_val;
4964             }
4965         }
4966       /* Convert the reduced value back to the result type and set as the
4967          result.  */
4968       gimple_seq stmts = NULL;
4969       val = gimple_convert (&stmts, scalar_type, val);
4970       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4971       scalar_results.safe_push (val);
4972     }
4973
4974   /* 2.3 Create the reduction code, using one of the three schemes described
4975          above. In SLP we simply need to extract all the elements from the
4976          vector (without reducing them), so we use scalar shifts.  */
4977   else if (reduc_code != ERROR_MARK && !slp_reduc)
4978     {
4979       tree tmp;
4980       tree vec_elem_type;
4981
4982       /* Case 1:  Create:
4983          v_out2 = reduc_expr <v_out1>  */
4984
4985       if (dump_enabled_p ())
4986         dump_printf_loc (MSG_NOTE, vect_location,
4987                          "Reduce using direct vector reduction.\n");
4988
4989       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4990       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4991         {
4992           tree tmp_dest =
4993               vect_create_destination_var (scalar_dest, vec_elem_type);
4994           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4995           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4996           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4997           gimple_assign_set_lhs (epilog_stmt, new_temp);
4998           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4999
5000           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
5001         }
5002       else
5003         tmp = build1 (reduc_code, scalar_type, new_phi_result);
5004
5005       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
5006       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5007       gimple_assign_set_lhs (epilog_stmt, new_temp);
5008       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5009
5010       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5011           == INTEGER_INDUC_COND_REDUCTION)
5012         {
5013           /* Earlier we set the initial value to be zero.  Check the result
5014              and if it is zero then replace with the original initial
5015              value.  */
5016           tree zero = build_zero_cst (scalar_type);
5017           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
5018
5019           tmp = make_ssa_name (new_scalar_dest);
5020           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5021                                              initial_def, new_temp);
5022           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5023           new_temp = tmp;
5024         }
5025
5026       scalar_results.safe_push (new_temp);
5027     }
5028   else
5029     {
5030       bool reduce_with_shift = have_whole_vector_shift (mode);
5031       int element_bitsize = tree_to_uhwi (bitsize);
5032       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5033       tree vec_temp;
5034
5035       /* COND reductions all do the final reduction with MAX_EXPR.  */
5036       if (code == COND_EXPR)
5037         code = MAX_EXPR;
5038
5039       /* Regardless of whether we have a whole vector shift, if we're
5040          emulating the operation via tree-vect-generic, we don't want
5041          to use it.  Only the first round of the reduction is likely
5042          to still be profitable via emulation.  */
5043       /* ??? It might be better to emit a reduction tree code here, so that
5044          tree-vect-generic can expand the first round via bit tricks.  */
5045       if (!VECTOR_MODE_P (mode))
5046         reduce_with_shift = false;
5047       else
5048         {
5049           optab optab = optab_for_tree_code (code, vectype, optab_default);
5050           if (optab_handler (optab, mode) == CODE_FOR_nothing)
5051             reduce_with_shift = false;
5052         }
5053
5054       if (reduce_with_shift && !slp_reduc)
5055         {
5056           int nelements = vec_size_in_bits / element_bitsize;
5057           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
5058
5059           int elt_offset;
5060
5061           tree zero_vec = build_zero_cst (vectype);
5062           /* Case 2: Create:
5063              for (offset = nelements/2; offset >= 1; offset/=2)
5064                 {
5065                   Create:  va' = vec_shift <va, offset>
5066                   Create:  va = vop <va, va'>
5067                 }  */
5068
5069           tree rhs;
5070
5071           if (dump_enabled_p ())
5072             dump_printf_loc (MSG_NOTE, vect_location,
5073                              "Reduce using vector shifts\n");
5074
5075           vec_dest = vect_create_destination_var (scalar_dest, vectype);
5076           new_temp = new_phi_result;
5077           for (elt_offset = nelements / 2;
5078                elt_offset >= 1;
5079                elt_offset /= 2)
5080             {
5081               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
5082               tree mask = vect_gen_perm_mask_any (vectype, sel);
5083               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5084                                                  new_temp, zero_vec, mask);
5085               new_name = make_ssa_name (vec_dest, epilog_stmt);
5086               gimple_assign_set_lhs (epilog_stmt, new_name);
5087               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5088
5089               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5090                                                  new_temp);
5091               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5092               gimple_assign_set_lhs (epilog_stmt, new_temp);
5093               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5094             }
5095
5096           /* 2.4  Extract the final scalar result.  Create:
5097              s_out3 = extract_field <v_out2, bitpos>  */
5098
5099           if (dump_enabled_p ())
5100             dump_printf_loc (MSG_NOTE, vect_location,
5101                              "extract scalar result\n");
5102
5103           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5104                         bitsize, bitsize_zero_node);
5105           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5106           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5107           gimple_assign_set_lhs (epilog_stmt, new_temp);
5108           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5109           scalar_results.safe_push (new_temp);
5110         }
5111       else
5112         {
5113           /* Case 3: Create:
5114              s = extract_field <v_out2, 0>
5115              for (offset = element_size;
5116                   offset < vector_size;
5117                   offset += element_size;)
5118                {
5119                  Create:  s' = extract_field <v_out2, offset>
5120                  Create:  s = op <s, s'>  // For non SLP cases
5121                }  */
5122
5123           if (dump_enabled_p ())
5124             dump_printf_loc (MSG_NOTE, vect_location,
5125                              "Reduce using scalar code.\n");
5126
5127           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5128           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5129             {
5130               int bit_offset;
5131               if (gimple_code (new_phi) == GIMPLE_PHI)
5132                 vec_temp = PHI_RESULT (new_phi);
5133               else
5134                 vec_temp = gimple_assign_lhs (new_phi);
5135               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5136                             bitsize_zero_node);
5137               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5138               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5139               gimple_assign_set_lhs (epilog_stmt, new_temp);
5140               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5141
5142               /* In SLP we don't need to apply reduction operation, so we just
5143                  collect s' values in SCALAR_RESULTS.  */
5144               if (slp_reduc)
5145                 scalar_results.safe_push (new_temp);
5146
5147               for (bit_offset = element_bitsize;
5148                    bit_offset < vec_size_in_bits;
5149                    bit_offset += element_bitsize)
5150                 {
5151                   tree bitpos = bitsize_int (bit_offset);
5152                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5153                                      bitsize, bitpos);
5154
5155                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5156                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5157                   gimple_assign_set_lhs (epilog_stmt, new_name);
5158                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5159
5160                   if (slp_reduc)
5161                     {
5162                       /* In SLP we don't need to apply reduction operation, so
5163                          we just collect s' values in SCALAR_RESULTS.  */
5164                       new_temp = new_name;
5165                       scalar_results.safe_push (new_name);
5166                     }
5167                   else
5168                     {
5169                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5170                                                          new_name, new_temp);
5171                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5172                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5173                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5174                     }
5175                 }
5176             }
5177
5178           /* The only case where we need to reduce scalar results in SLP, is
5179              unrolling.  If the size of SCALAR_RESULTS is greater than
5180              GROUP_SIZE, we reduce them combining elements modulo
5181              GROUP_SIZE.  */
5182           if (slp_reduc)
5183             {
5184               tree res, first_res, new_res;
5185               gimple *new_stmt;
5186
5187               /* Reduce multiple scalar results in case of SLP unrolling.  */
5188               for (j = group_size; scalar_results.iterate (j, &res);
5189                    j++)
5190                 {
5191                   first_res = scalar_results[j % group_size];
5192                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5193                                                   first_res, res);
5194                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5195                   gimple_assign_set_lhs (new_stmt, new_res);
5196                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5197                   scalar_results[j % group_size] = new_res;
5198                 }
5199             }
5200           else
5201             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5202             scalar_results.safe_push (new_temp);
5203         }
5204
5205       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5206           == INTEGER_INDUC_COND_REDUCTION)
5207         {
5208           /* Earlier we set the initial value to be zero.  Check the result
5209              and if it is zero then replace with the original initial
5210              value.  */
5211           tree zero = build_zero_cst (scalar_type);
5212           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
5213
5214           tree tmp = make_ssa_name (new_scalar_dest);
5215           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5216                                              initial_def, new_temp);
5217           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5218           scalar_results[0] = tmp;
5219         }
5220     }
5221
5222 vect_finalize_reduction:
5223
5224   if (double_reduc)
5225     loop = loop->inner;
5226
5227   /* 2.5 Adjust the final result by the initial value of the reduction
5228          variable. (When such adjustment is not needed, then
5229          'adjustment_def' is zero).  For example, if code is PLUS we create:
5230          new_temp = loop_exit_def + adjustment_def  */
5231
5232   if (adjustment_def)
5233     {
5234       gcc_assert (!slp_reduc);
5235       if (nested_in_vect_loop)
5236         {
5237           new_phi = new_phis[0];
5238           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5239           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5240           new_dest = vect_create_destination_var (scalar_dest, vectype);
5241         }
5242       else
5243         {
5244           new_temp = scalar_results[0];
5245           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5246           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5247           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5248         }
5249
5250       epilog_stmt = gimple_build_assign (new_dest, expr);
5251       new_temp = make_ssa_name (new_dest, epilog_stmt);
5252       gimple_assign_set_lhs (epilog_stmt, new_temp);
5253       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5254       if (nested_in_vect_loop)
5255         {
5256           set_vinfo_for_stmt (epilog_stmt,
5257                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5258           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5259                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5260
5261           if (!double_reduc)
5262             scalar_results.quick_push (new_temp);
5263           else
5264             scalar_results[0] = new_temp;
5265         }
5266       else
5267         scalar_results[0] = new_temp;
5268
5269       new_phis[0] = epilog_stmt;
5270     }
5271
5272   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5273           phis with new adjusted scalar results, i.e., replace use <s_out0>
5274           with use <s_out4>.
5275
5276      Transform:
5277         loop_exit:
5278           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5279           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5280           v_out2 = reduce <v_out1>
5281           s_out3 = extract_field <v_out2, 0>
5282           s_out4 = adjust_result <s_out3>
5283           use <s_out0>
5284           use <s_out0>
5285
5286      into:
5287
5288         loop_exit:
5289           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5290           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5291           v_out2 = reduce <v_out1>
5292           s_out3 = extract_field <v_out2, 0>
5293           s_out4 = adjust_result <s_out3>
5294           use <s_out4>
5295           use <s_out4> */
5296
5297
5298   /* In SLP reduction chain we reduce vector results into one vector if
5299      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5300      the last stmt in the reduction chain, since we are looking for the loop
5301      exit phi node.  */
5302   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5303     {
5304       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5305       /* Handle reduction patterns.  */
5306       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5307         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5308
5309       scalar_dest = gimple_assign_lhs (dest_stmt);
5310       group_size = 1;
5311     }
5312
5313   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5314      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5315      need to match SCALAR_RESULTS with corresponding statements.  The first
5316      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5317      the first vector stmt, etc.
5318      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5319   if (group_size > new_phis.length ())
5320     {
5321       ratio = group_size / new_phis.length ();
5322       gcc_assert (!(group_size % new_phis.length ()));
5323     }
5324   else
5325     ratio = 1;
5326
5327   for (k = 0; k < group_size; k++)
5328     {
5329       if (k % ratio == 0)
5330         {
5331           epilog_stmt = new_phis[k / ratio];
5332           reduction_phi = reduction_phis[k / ratio];
5333           if (double_reduc)
5334             inner_phi = inner_phis[k / ratio];
5335         }
5336
5337       if (slp_reduc)
5338         {
5339           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5340
5341           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5342           /* SLP statements can't participate in patterns.  */
5343           gcc_assert (!orig_stmt);
5344           scalar_dest = gimple_assign_lhs (current_stmt);
5345         }
5346
5347       phis.create (3);
5348       /* Find the loop-closed-use at the loop exit of the original scalar
5349          result.  (The reduction result is expected to have two immediate uses -
5350          one at the latch block, and one at the loop exit).  */
5351       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5352         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5353             && !is_gimple_debug (USE_STMT (use_p)))
5354           phis.safe_push (USE_STMT (use_p));
5355
5356       /* While we expect to have found an exit_phi because of loop-closed-ssa
5357          form we can end up without one if the scalar cycle is dead.  */
5358
5359       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5360         {
5361           if (outer_loop)
5362             {
5363               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5364               gphi *vect_phi;
5365
5366               /* FORNOW. Currently not supporting the case that an inner-loop
5367                  reduction is not used in the outer-loop (but only outside the
5368                  outer-loop), unless it is double reduction.  */
5369               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5370                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5371                           || double_reduc);
5372
5373               if (double_reduc)
5374                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5375               else
5376                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5377               if (!double_reduc
5378                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5379                       != vect_double_reduction_def)
5380                 continue;
5381
5382               /* Handle double reduction:
5383
5384                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5385                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5386                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5387                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5388
5389                  At that point the regular reduction (stmt2 and stmt3) is
5390                  already vectorized, as well as the exit phi node, stmt4.
5391                  Here we vectorize the phi node of double reduction, stmt1, and
5392                  update all relevant statements.  */
5393
5394               /* Go through all the uses of s2 to find double reduction phi
5395                  node, i.e., stmt1 above.  */
5396               orig_name = PHI_RESULT (exit_phi);
5397               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5398                 {
5399                   stmt_vec_info use_stmt_vinfo;
5400                   stmt_vec_info new_phi_vinfo;
5401                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5402                   basic_block bb = gimple_bb (use_stmt);
5403                   gimple *use;
5404
5405                   /* Check that USE_STMT is really double reduction phi
5406                      node.  */
5407                   if (gimple_code (use_stmt) != GIMPLE_PHI
5408                       || gimple_phi_num_args (use_stmt) != 2
5409                       || bb->loop_father != outer_loop)
5410                     continue;
5411                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5412                   if (!use_stmt_vinfo
5413                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5414                           != vect_double_reduction_def)
5415                     continue;
5416
5417                   /* Create vector phi node for double reduction:
5418                      vs1 = phi <vs0, vs2>
5419                      vs1 was created previously in this function by a call to
5420                        vect_get_vec_def_for_operand and is stored in
5421                        vec_initial_def;
5422                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5423                      vs0 is created here.  */
5424
5425                   /* Create vector phi node.  */
5426                   vect_phi = create_phi_node (vec_initial_def, bb);
5427                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5428                                     loop_vec_info_for_loop (outer_loop));
5429                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5430
5431                   /* Create vs0 - initial def of the double reduction phi.  */
5432                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5433                                              loop_preheader_edge (outer_loop));
5434                   init_def = get_initial_def_for_reduction (stmt,
5435                                                           preheader_arg, NULL);
5436                   vect_phi_init = vect_init_vector (use_stmt, init_def,
5437                                                     vectype, NULL);
5438
5439                   /* Update phi node arguments with vs0 and vs2.  */
5440                   add_phi_arg (vect_phi, vect_phi_init,
5441                                loop_preheader_edge (outer_loop),
5442                                UNKNOWN_LOCATION);
5443                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5444                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5445                   if (dump_enabled_p ())
5446                     {
5447                       dump_printf_loc (MSG_NOTE, vect_location,
5448                                        "created double reduction phi node: ");
5449                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5450                     }
5451
5452                   vect_phi_res = PHI_RESULT (vect_phi);
5453
5454                   /* Replace the use, i.e., set the correct vs1 in the regular
5455                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5456                      loop is redundant.  */
5457                   use = reduction_phi;
5458                   for (j = 0; j < ncopies; j++)
5459                     {
5460                       edge pr_edge = loop_preheader_edge (loop);
5461                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5462                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5463                     }
5464                 }
5465             }
5466         }
5467
5468       phis.release ();
5469       if (nested_in_vect_loop)
5470         {
5471           if (double_reduc)
5472             loop = outer_loop;
5473           else
5474             continue;
5475         }
5476
5477       phis.create (3);
5478       /* Find the loop-closed-use at the loop exit of the original scalar
5479          result.  (The reduction result is expected to have two immediate uses,
5480          one at the latch block, and one at the loop exit).  For double
5481          reductions we are looking for exit phis of the outer loop.  */
5482       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5483         {
5484           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5485             {
5486               if (!is_gimple_debug (USE_STMT (use_p)))
5487                 phis.safe_push (USE_STMT (use_p));
5488             }
5489           else
5490             {
5491               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5492                 {
5493                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5494
5495                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5496                     {
5497                       if (!flow_bb_inside_loop_p (loop,
5498                                              gimple_bb (USE_STMT (phi_use_p)))
5499                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5500                         phis.safe_push (USE_STMT (phi_use_p));
5501                     }
5502                 }
5503             }
5504         }
5505
5506       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5507         {
5508           /* Replace the uses:  */
5509           orig_name = PHI_RESULT (exit_phi);
5510           scalar_result = scalar_results[k];
5511           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5512             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5513               SET_USE (use_p, scalar_result);
5514         }
5515
5516       phis.release ();
5517     }
5518 }
5519
5520
5521 /* Function is_nonwrapping_integer_induction.
5522
5523    Check if STMT (which is part of loop LOOP) both increments and
5524    does not cause overflow.  */
5525
5526 static bool
5527 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5528 {
5529   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5530   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5531   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5532   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5533   widest_int ni, max_loop_value, lhs_max;
5534   bool overflow = false;
5535
5536   /* Make sure the loop is integer based.  */
5537   if (TREE_CODE (base) != INTEGER_CST
5538       || TREE_CODE (step) != INTEGER_CST)
5539     return false;
5540
5541   /* Check that the induction increments.  */
5542   if (tree_int_cst_sgn (step) == -1)
5543     return false;
5544
5545   /* Check that the max size of the loop will not wrap.  */
5546
5547   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5548     return true;
5549
5550   if (! max_stmt_executions (loop, &ni))
5551     return false;
5552
5553   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5554                             &overflow);
5555   if (overflow)
5556     return false;
5557
5558   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5559                             TYPE_SIGN (lhs_type), &overflow);
5560   if (overflow)
5561     return false;
5562
5563   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5564           <= TYPE_PRECISION (lhs_type));
5565 }
5566
5567 /* Function vectorizable_reduction.
5568
5569    Check if STMT performs a reduction operation that can be vectorized.
5570    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5571    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5572    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5573
5574    This function also handles reduction idioms (patterns) that have been
5575    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5576    of this form:
5577      X = pattern_expr (arg0, arg1, ..., X)
5578    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
5579    sequence that had been detected and replaced by the pattern-stmt (STMT).
5580
5581    This function also handles reduction of condition expressions, for example:
5582      for (int i = 0; i < N; i++)
5583        if (a[i] < value)
5584          last = a[i];
5585    This is handled by vectorising the loop and creating an additional vector
5586    containing the loop indexes for which "a[i] < value" was true.  In the
5587    function epilogue this is reduced to a single max value and then used to
5588    index into the vector of results.
5589
5590    In some cases of reduction patterns, the type of the reduction variable X is
5591    different than the type of the other arguments of STMT.
5592    In such cases, the vectype that is used when transforming STMT into a vector
5593    stmt is different than the vectype that is used to determine the
5594    vectorization factor, because it consists of a different number of elements
5595    than the actual number of elements that are being operated upon in parallel.
5596
5597    For example, consider an accumulation of shorts into an int accumulator.
5598    On some targets it's possible to vectorize this pattern operating on 8
5599    shorts at a time (hence, the vectype for purposes of determining the
5600    vectorization factor should be V8HI); on the other hand, the vectype that
5601    is used to create the vector form is actually V4SI (the type of the result).
5602
5603    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5604    indicates what is the actual level of parallelism (V8HI in the example), so
5605    that the right vectorization factor would be derived.  This vectype
5606    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5607    be used to create the vectorized stmt.  The right vectype for the vectorized
5608    stmt is obtained from the type of the result X:
5609         get_vectype_for_scalar_type (TREE_TYPE (X))
5610
5611    This means that, contrary to "regular" reductions (or "regular" stmts in
5612    general), the following equation:
5613       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5614    does *NOT* necessarily hold for reduction patterns.  */
5615
5616 bool
5617 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5618                         gimple **vec_stmt, slp_tree slp_node,
5619                         slp_instance slp_node_instance)
5620 {
5621   tree vec_dest;
5622   tree scalar_dest;
5623   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5624   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5625   tree vectype_in = NULL_TREE;
5626   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5627   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5628   enum tree_code code, orig_code, epilog_reduc_code;
5629   machine_mode vec_mode;
5630   int op_type;
5631   optab optab, reduc_optab;
5632   tree new_temp = NULL_TREE;
5633   gimple *def_stmt;
5634   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5635   tree scalar_type;
5636   bool is_simple_use;
5637   gimple *orig_stmt;
5638   stmt_vec_info orig_stmt_info = NULL;
5639   int i;
5640   int ncopies;
5641   int epilog_copies;
5642   stmt_vec_info prev_stmt_info, prev_phi_info;
5643   bool single_defuse_cycle = false;
5644   gimple *new_stmt = NULL;
5645   int j;
5646   tree ops[3];
5647   enum vect_def_type dts[3];
5648   bool nested_cycle = false, found_nested_cycle_def = false;
5649   bool double_reduc = false;
5650   basic_block def_bb;
5651   struct loop * def_stmt_loop, *outer_loop = NULL;
5652   tree def_arg;
5653   gimple *def_arg_stmt;
5654   auto_vec<tree> vec_oprnds0;
5655   auto_vec<tree> vec_oprnds1;
5656   auto_vec<tree> vec_oprnds2;
5657   auto_vec<tree> vect_defs;
5658   auto_vec<gimple *> phis;
5659   int vec_num;
5660   tree def0, tem;
5661   bool first_p = true;
5662   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5663   tree cond_reduc_val = NULL_TREE;
5664
5665   /* Make sure it was already recognized as a reduction computation.  */
5666   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5667       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5668     return false;
5669
5670   if (nested_in_vect_loop_p (loop, stmt))
5671     {
5672       outer_loop = loop;
5673       loop = loop->inner;
5674       nested_cycle = true;
5675     }
5676
5677   /* In case of reduction chain we switch to the first stmt in the chain, but
5678      we don't update STMT_INFO, since only the last stmt is marked as reduction
5679      and has reduction properties.  */
5680   if (GROUP_FIRST_ELEMENT (stmt_info)
5681       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5682     {
5683       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5684       first_p = false;
5685     }
5686
5687   if (gimple_code (stmt) == GIMPLE_PHI)
5688     {
5689       /* Analysis is fully done on the reduction stmt invocation.  */
5690       if (! vec_stmt)
5691         {
5692           if (slp_node)
5693             slp_node_instance->reduc_phis = slp_node;
5694
5695           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5696           return true;
5697         }
5698
5699       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5700       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5701         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5702
5703       gcc_assert (is_gimple_assign (reduc_stmt));
5704       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5705         {
5706           tree op = gimple_op (reduc_stmt, k);
5707           if (op == gimple_phi_result (stmt))
5708             continue;
5709           if (k == 1
5710               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5711             continue;
5712           tem = get_vectype_for_scalar_type (TREE_TYPE (op));
5713           if (! vectype_in
5714               || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
5715             vectype_in = tem;
5716           break;
5717         }
5718       gcc_assert (vectype_in);
5719
5720       if (slp_node)
5721         ncopies = 1;
5722       else
5723         ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5724                    / TYPE_VECTOR_SUBPARTS (vectype_in));
5725
5726       use_operand_p use_p;
5727       gimple *use_stmt;
5728       if (ncopies > 1
5729           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5730               <= vect_used_only_live)
5731           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5732           && (use_stmt == reduc_stmt
5733               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5734                   == reduc_stmt)))
5735         single_defuse_cycle = true;
5736
5737       /* Create the destination vector  */
5738       scalar_dest = gimple_assign_lhs (reduc_stmt);
5739       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5740
5741       if (slp_node)
5742         /* The size vect_schedule_slp_instance computes is off for us.  */
5743         vec_num = ((LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5744                     * SLP_TREE_SCALAR_STMTS (slp_node).length ())
5745                    / TYPE_VECTOR_SUBPARTS (vectype_in));
5746       else
5747         vec_num = 1;
5748
5749       /* Generate the reduction PHIs upfront.  */
5750       prev_phi_info = NULL;
5751       for (j = 0; j < ncopies; j++)
5752         {
5753           if (j == 0 || !single_defuse_cycle)
5754             {
5755               for (i = 0; i < vec_num; i++)
5756                 {
5757                   /* Create the reduction-phi that defines the reduction
5758                      operand.  */
5759                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
5760                   set_vinfo_for_stmt (new_phi,
5761                                       new_stmt_vec_info (new_phi, loop_vinfo));
5762
5763                   if (slp_node)
5764                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5765                   else
5766                     {
5767                       if (j == 0)
5768                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5769                       else
5770                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5771                       prev_phi_info = vinfo_for_stmt (new_phi);
5772                     }
5773                 }
5774             }
5775         }
5776
5777       return true;
5778     }
5779
5780   /* 1. Is vectorizable reduction?  */
5781   /* Not supportable if the reduction variable is used in the loop, unless
5782      it's a reduction chain.  */
5783   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5784       && !GROUP_FIRST_ELEMENT (stmt_info))
5785     return false;
5786
5787   /* Reductions that are not used even in an enclosing outer-loop,
5788      are expected to be "live" (used out of the loop).  */
5789   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5790       && !STMT_VINFO_LIVE_P (stmt_info))
5791     return false;
5792
5793   /* 2. Has this been recognized as a reduction pattern?
5794
5795      Check if STMT represents a pattern that has been recognized
5796      in earlier analysis stages.  For stmts that represent a pattern,
5797      the STMT_VINFO_RELATED_STMT field records the last stmt in
5798      the original sequence that constitutes the pattern.  */
5799
5800   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5801   if (orig_stmt)
5802     {
5803       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5804       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5805       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5806     }
5807
5808   /* 3. Check the operands of the operation.  The first operands are defined
5809         inside the loop body. The last operand is the reduction variable,
5810         which is defined by the loop-header-phi.  */
5811
5812   gcc_assert (is_gimple_assign (stmt));
5813
5814   /* Flatten RHS.  */
5815   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5816     {
5817     case GIMPLE_BINARY_RHS:
5818       code = gimple_assign_rhs_code (stmt);
5819       op_type = TREE_CODE_LENGTH (code);
5820       gcc_assert (op_type == binary_op);
5821       ops[0] = gimple_assign_rhs1 (stmt);
5822       ops[1] = gimple_assign_rhs2 (stmt);
5823       break;
5824
5825     case GIMPLE_TERNARY_RHS:
5826       code = gimple_assign_rhs_code (stmt);
5827       op_type = TREE_CODE_LENGTH (code);
5828       gcc_assert (op_type == ternary_op);
5829       ops[0] = gimple_assign_rhs1 (stmt);
5830       ops[1] = gimple_assign_rhs2 (stmt);
5831       ops[2] = gimple_assign_rhs3 (stmt);
5832       break;
5833
5834     case GIMPLE_UNARY_RHS:
5835       return false;
5836
5837     default:
5838       gcc_unreachable ();
5839     }
5840
5841   if (code == COND_EXPR && slp_node)
5842     return false;
5843
5844   scalar_dest = gimple_assign_lhs (stmt);
5845   scalar_type = TREE_TYPE (scalar_dest);
5846   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5847       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5848     return false;
5849
5850   /* Do not try to vectorize bit-precision reductions.  */
5851   if (!type_has_mode_precision_p (scalar_type))
5852     return false;
5853
5854   /* All uses but the last are expected to be defined in the loop.
5855      The last use is the reduction variable.  In case of nested cycle this
5856      assumption is not true: we use reduc_index to record the index of the
5857      reduction variable.  */
5858   gimple *reduc_def_stmt = NULL;
5859   int reduc_index = -1;
5860   for (i = 0; i < op_type; i++)
5861     {
5862       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5863       if (i == 0 && code == COND_EXPR)
5864         continue;
5865
5866       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5867                                           &def_stmt, &dts[i], &tem);
5868       dt = dts[i];
5869       gcc_assert (is_simple_use);
5870       if (dt == vect_reduction_def)
5871         {
5872           reduc_def_stmt = def_stmt;
5873           reduc_index = i;
5874           continue;
5875         }
5876       else
5877         {
5878           if (!vectype_in)
5879             vectype_in = tem;
5880         }
5881
5882       if (dt != vect_internal_def
5883           && dt != vect_external_def
5884           && dt != vect_constant_def
5885           && dt != vect_induction_def
5886           && !(dt == vect_nested_cycle && nested_cycle))
5887         return false;
5888
5889       if (dt == vect_nested_cycle)
5890         {
5891           found_nested_cycle_def = true;
5892           reduc_def_stmt = def_stmt;
5893           reduc_index = i;
5894         }
5895
5896       if (i == 1 && code == COND_EXPR)
5897         {
5898           /* Record how value of COND_EXPR is defined.  */
5899           if (dt == vect_constant_def)
5900             {
5901               cond_reduc_dt = dt;
5902               cond_reduc_val = ops[i];
5903             }
5904           if (dt == vect_induction_def && def_stmt != NULL
5905               && is_nonwrapping_integer_induction (def_stmt, loop))
5906             cond_reduc_dt = dt;
5907         }
5908     }
5909
5910   if (!vectype_in)
5911     vectype_in = vectype_out;
5912
5913   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5914      directy used in stmt.  */
5915   if (reduc_index == -1)
5916     {
5917       if (orig_stmt)
5918         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5919       else
5920         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5921     }
5922
5923   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5924     return false;
5925
5926   if (!(reduc_index == -1
5927         || dts[reduc_index] == vect_reduction_def
5928         || dts[reduc_index] == vect_nested_cycle
5929         || ((dts[reduc_index] == vect_internal_def
5930              || dts[reduc_index] == vect_external_def
5931              || dts[reduc_index] == vect_constant_def
5932              || dts[reduc_index] == vect_induction_def)
5933             && nested_cycle && found_nested_cycle_def)))
5934     {
5935       /* For pattern recognized stmts, orig_stmt might be a reduction,
5936          but some helper statements for the pattern might not, or
5937          might be COND_EXPRs with reduction uses in the condition.  */
5938       gcc_assert (orig_stmt);
5939       return false;
5940     }
5941
5942   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5943   enum vect_reduction_type v_reduc_type
5944     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5945   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5946
5947   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5948   /* If we have a condition reduction, see if we can simplify it further.  */
5949   if (v_reduc_type == COND_REDUCTION)
5950     {
5951       if (cond_reduc_dt == vect_induction_def)
5952         {
5953           if (dump_enabled_p ())
5954             dump_printf_loc (MSG_NOTE, vect_location,
5955                              "condition expression based on "
5956                              "integer induction.\n");
5957           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5958             = INTEGER_INDUC_COND_REDUCTION;
5959         }
5960
5961       /* Loop peeling modifies initial value of reduction PHI, which
5962          makes the reduction stmt to be transformed different to the
5963          original stmt analyzed.  We need to record reduction code for
5964          CONST_COND_REDUCTION type reduction at analyzing stage, thus
5965          it can be used directly at transform stage.  */
5966       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
5967           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
5968         {
5969           /* Also set the reduction type to CONST_COND_REDUCTION.  */
5970           gcc_assert (cond_reduc_dt == vect_constant_def);
5971           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
5972         }
5973       else if (cond_reduc_dt == vect_constant_def)
5974         {
5975           enum vect_def_type cond_initial_dt;
5976           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5977           tree cond_initial_val
5978             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
5979
5980           gcc_assert (cond_reduc_val != NULL_TREE);
5981           vect_is_simple_use (cond_initial_val, loop_vinfo,
5982                               &def_stmt, &cond_initial_dt);
5983           if (cond_initial_dt == vect_constant_def
5984               && types_compatible_p (TREE_TYPE (cond_initial_val),
5985                                      TREE_TYPE (cond_reduc_val)))
5986             {
5987               tree e = fold_binary (LE_EXPR, boolean_type_node,
5988                                     cond_initial_val, cond_reduc_val);
5989               if (e && (integer_onep (e) || integer_zerop (e)))
5990                 {
5991                   if (dump_enabled_p ())
5992                     dump_printf_loc (MSG_NOTE, vect_location,
5993                                      "condition expression based on "
5994                                      "compile time constant.\n");
5995                   /* Record reduction code at analysis stage.  */
5996                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
5997                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
5998                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5999                     = CONST_COND_REDUCTION;
6000                 }
6001             }
6002         }
6003     }
6004
6005   if (orig_stmt)
6006     gcc_assert (tmp == orig_stmt
6007                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6008   else
6009     /* We changed STMT to be the first stmt in reduction chain, hence we
6010        check that in this case the first element in the chain is STMT.  */
6011     gcc_assert (stmt == tmp
6012                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6013
6014   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6015     return false;
6016
6017   if (slp_node)
6018     ncopies = 1;
6019   else
6020     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6021                / TYPE_VECTOR_SUBPARTS (vectype_in));
6022
6023   gcc_assert (ncopies >= 1);
6024
6025   vec_mode = TYPE_MODE (vectype_in);
6026
6027   if (code == COND_EXPR)
6028     {
6029       /* Only call during the analysis stage, otherwise we'll lose
6030          STMT_VINFO_TYPE.  */
6031       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6032                                                 ops[reduc_index], 0, NULL))
6033         {
6034           if (dump_enabled_p ())
6035             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6036                              "unsupported condition in reduction\n");
6037           return false;
6038         }
6039     }
6040   else
6041     {
6042       /* 4. Supportable by target?  */
6043
6044       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6045           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6046         {
6047           /* Shifts and rotates are only supported by vectorizable_shifts,
6048              not vectorizable_reduction.  */
6049           if (dump_enabled_p ())
6050             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6051                              "unsupported shift or rotation.\n");
6052           return false;
6053         }
6054
6055       /* 4.1. check support for the operation in the loop  */
6056       optab = optab_for_tree_code (code, vectype_in, optab_default);
6057       if (!optab)
6058         {
6059           if (dump_enabled_p ())
6060             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6061                              "no optab.\n");
6062
6063           return false;
6064         }
6065
6066       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6067         {
6068           if (dump_enabled_p ())
6069             dump_printf (MSG_NOTE, "op not supported by target.\n");
6070
6071           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6072               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6073                   < vect_min_worthwhile_factor (code))
6074             return false;
6075
6076           if (dump_enabled_p ())
6077             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6078         }
6079
6080       /* Worthwhile without SIMD support?  */
6081       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6082           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6083              < vect_min_worthwhile_factor (code))
6084         {
6085           if (dump_enabled_p ())
6086             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6087                              "not worthwhile without SIMD support.\n");
6088
6089           return false;
6090         }
6091     }
6092
6093   /* 4.2. Check support for the epilog operation.
6094
6095           If STMT represents a reduction pattern, then the type of the
6096           reduction variable may be different than the type of the rest
6097           of the arguments.  For example, consider the case of accumulation
6098           of shorts into an int accumulator; The original code:
6099                         S1: int_a = (int) short_a;
6100           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6101
6102           was replaced with:
6103                         STMT: int_acc = widen_sum <short_a, int_acc>
6104
6105           This means that:
6106           1. The tree-code that is used to create the vector operation in the
6107              epilog code (that reduces the partial results) is not the
6108              tree-code of STMT, but is rather the tree-code of the original
6109              stmt from the pattern that STMT is replacing.  I.e, in the example
6110              above we want to use 'widen_sum' in the loop, but 'plus' in the
6111              epilog.
6112           2. The type (mode) we use to check available target support
6113              for the vector operation to be created in the *epilog*, is
6114              determined by the type of the reduction variable (in the example
6115              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6116              However the type (mode) we use to check available target support
6117              for the vector operation to be created *inside the loop*, is
6118              determined by the type of the other arguments to STMT (in the
6119              example we'd check this: optab_handler (widen_sum_optab,
6120              vect_short_mode)).
6121
6122           This is contrary to "regular" reductions, in which the types of all
6123           the arguments are the same as the type of the reduction variable.
6124           For "regular" reductions we can therefore use the same vector type
6125           (and also the same tree-code) when generating the epilog code and
6126           when generating the code inside the loop.  */
6127
6128   if (orig_stmt)
6129     {
6130       /* This is a reduction pattern: get the vectype from the type of the
6131          reduction variable, and get the tree-code from orig_stmt.  */
6132       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6133                   == TREE_CODE_REDUCTION);
6134       orig_code = gimple_assign_rhs_code (orig_stmt);
6135       gcc_assert (vectype_out);
6136       vec_mode = TYPE_MODE (vectype_out);
6137     }
6138   else
6139     {
6140       /* Regular reduction: use the same vectype and tree-code as used for
6141          the vector code inside the loop can be used for the epilog code. */
6142       orig_code = code;
6143
6144       if (code == MINUS_EXPR)
6145         orig_code = PLUS_EXPR;
6146
6147       /* For simple condition reductions, replace with the actual expression
6148          we want to base our reduction around.  */
6149       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6150         {
6151           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6152           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6153         }
6154       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6155                  == INTEGER_INDUC_COND_REDUCTION)
6156         orig_code = MAX_EXPR;
6157     }
6158
6159   if (nested_cycle)
6160     {
6161       def_bb = gimple_bb (reduc_def_stmt);
6162       def_stmt_loop = def_bb->loop_father;
6163       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6164                                        loop_preheader_edge (def_stmt_loop));
6165       if (TREE_CODE (def_arg) == SSA_NAME
6166           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6167           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6168           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6169           && vinfo_for_stmt (def_arg_stmt)
6170           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6171               == vect_double_reduction_def)
6172         double_reduc = true;
6173     }
6174
6175   epilog_reduc_code = ERROR_MARK;
6176
6177   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6178     {
6179       if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
6180         {
6181           reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
6182                                          optab_default);
6183           if (!reduc_optab)
6184             {
6185               if (dump_enabled_p ())
6186                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6187                                  "no optab for reduction.\n");
6188
6189               epilog_reduc_code = ERROR_MARK;
6190             }
6191           else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
6192             {
6193               if (dump_enabled_p ())
6194                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6195                                  "reduc op not supported by target.\n");
6196
6197               epilog_reduc_code = ERROR_MARK;
6198             }
6199         }
6200       else
6201         {
6202           if (!nested_cycle || double_reduc)
6203             {
6204               if (dump_enabled_p ())
6205                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6206                                  "no reduc code for scalar code.\n");
6207
6208               return false;
6209             }
6210         }
6211     }
6212   else
6213     {
6214       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
6215       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6216       cr_index_vector_type = build_vector_type
6217         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
6218
6219       optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
6220                                    optab_default);
6221       if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
6222           != CODE_FOR_nothing)
6223         epilog_reduc_code = REDUC_MAX_EXPR;
6224     }
6225
6226   if ((double_reduc
6227        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6228       && ncopies > 1)
6229     {
6230       if (dump_enabled_p ())
6231         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6232                          "multiple types in double reduction or condition "
6233                          "reduction.\n");
6234       return false;
6235     }
6236
6237   /* In case of widenning multiplication by a constant, we update the type
6238      of the constant to be the type of the other operand.  We check that the
6239      constant fits the type in the pattern recognition pass.  */
6240   if (code == DOT_PROD_EXPR
6241       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6242     {
6243       if (TREE_CODE (ops[0]) == INTEGER_CST)
6244         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6245       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6246         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6247       else
6248         {
6249           if (dump_enabled_p ())
6250             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6251                              "invalid types in dot-prod\n");
6252
6253           return false;
6254         }
6255     }
6256
6257   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6258     {
6259       widest_int ni;
6260
6261       if (! max_loop_iterations (loop, &ni))
6262         {
6263           if (dump_enabled_p ())
6264             dump_printf_loc (MSG_NOTE, vect_location,
6265                              "loop count not known, cannot create cond "
6266                              "reduction.\n");
6267           return false;
6268         }
6269       /* Convert backedges to iterations.  */
6270       ni += 1;
6271
6272       /* The additional index will be the same type as the condition.  Check
6273          that the loop can fit into this less one (because we'll use up the
6274          zero slot for when there are no matches).  */
6275       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6276       if (wi::geu_p (ni, wi::to_widest (max_index)))
6277         {
6278           if (dump_enabled_p ())
6279             dump_printf_loc (MSG_NOTE, vect_location,
6280                              "loop size is greater than data size.\n");
6281           return false;
6282         }
6283     }
6284
6285   /* In case the vectorization factor (VF) is bigger than the number
6286      of elements that we can fit in a vectype (nunits), we have to generate
6287      more than one vector stmt - i.e - we need to "unroll" the
6288      vector stmt by a factor VF/nunits.  For more details see documentation
6289      in vectorizable_operation.  */
6290
6291   /* If the reduction is used in an outer loop we need to generate
6292      VF intermediate results, like so (e.g. for ncopies=2):
6293         r0 = phi (init, r0)
6294         r1 = phi (init, r1)
6295         r0 = x0 + r0;
6296         r1 = x1 + r1;
6297     (i.e. we generate VF results in 2 registers).
6298     In this case we have a separate def-use cycle for each copy, and therefore
6299     for each copy we get the vector def for the reduction variable from the
6300     respective phi node created for this copy.
6301
6302     Otherwise (the reduction is unused in the loop nest), we can combine
6303     together intermediate results, like so (e.g. for ncopies=2):
6304         r = phi (init, r)
6305         r = x0 + r;
6306         r = x1 + r;
6307    (i.e. we generate VF/2 results in a single register).
6308    In this case for each copy we get the vector def for the reduction variable
6309    from the vectorized reduction operation generated in the previous iteration.
6310
6311    This only works when we see both the reduction PHI and its only consumer
6312    in vectorizable_reduction and there are no intermediate stmts
6313    participating.  */
6314   use_operand_p use_p;
6315   gimple *use_stmt;
6316   if (ncopies > 1
6317       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6318       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6319       && (use_stmt == stmt
6320           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6321     {
6322       single_defuse_cycle = true;
6323       epilog_copies = 1;
6324     }
6325   else
6326     epilog_copies = ncopies;
6327
6328   /* If the reduction stmt is one of the patterns that have lane
6329      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6330   if ((ncopies > 1
6331        && ! single_defuse_cycle)
6332       && (code == DOT_PROD_EXPR
6333           || code == WIDEN_SUM_EXPR
6334           || code == SAD_EXPR))
6335     {
6336       if (dump_enabled_p ())
6337         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6338                          "multi def-use cycle not possible for lane-reducing "
6339                          "reduction operation\n");
6340       return false;
6341     }
6342
6343   if (!vec_stmt) /* transformation not required.  */
6344     {
6345       if (first_p)
6346         vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
6347       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6348       return true;
6349     }
6350
6351   /* Transform.  */
6352
6353   if (dump_enabled_p ())
6354     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6355
6356   /* FORNOW: Multiple types are not supported for condition.  */
6357   if (code == COND_EXPR)
6358     gcc_assert (ncopies == 1);
6359
6360   /* Create the destination vector  */
6361   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6362
6363   prev_stmt_info = NULL;
6364   prev_phi_info = NULL;
6365   if (slp_node)
6366     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6367   else
6368     {
6369       vec_num = 1;
6370       vec_oprnds0.create (1);
6371       vec_oprnds1.create (1);
6372       if (op_type == ternary_op)
6373         vec_oprnds2.create (1);
6374     }
6375
6376   phis.create (vec_num);
6377   vect_defs.create (vec_num);
6378   if (!slp_node)
6379     vect_defs.quick_push (NULL_TREE);
6380
6381   if (slp_node)
6382     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6383   else
6384     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6385
6386   for (j = 0; j < ncopies; j++)
6387     {
6388       if (code == COND_EXPR)
6389         {
6390           gcc_assert (!slp_node);
6391           vectorizable_condition (stmt, gsi, vec_stmt,
6392                                   PHI_RESULT (phis[0]),
6393                                   reduc_index, NULL);
6394           /* Multiple types are not supported for condition.  */
6395           break;
6396         }
6397
6398       /* Handle uses.  */
6399       if (j == 0)
6400         {
6401           if (slp_node)
6402             {
6403               /* Get vec defs for all the operands except the reduction index,
6404                  ensuring the ordering of the ops in the vector is kept.  */
6405               auto_vec<tree, 3> slp_ops;
6406               auto_vec<vec<tree>, 3> vec_defs;
6407
6408               slp_ops.quick_push (ops[0]);
6409               slp_ops.quick_push (ops[1]);
6410               if (op_type == ternary_op)
6411                 slp_ops.quick_push (ops[2]);
6412
6413               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6414
6415               vec_oprnds0.safe_splice (vec_defs[0]);
6416               vec_defs[0].release ();
6417               vec_oprnds1.safe_splice (vec_defs[1]);
6418               vec_defs[1].release ();
6419               if (op_type == ternary_op)
6420                 {
6421                   vec_oprnds2.safe_splice (vec_defs[2]);
6422                   vec_defs[2].release ();
6423                 }
6424             }
6425           else
6426             {
6427               vec_oprnds0.quick_push
6428                 (vect_get_vec_def_for_operand (ops[0], stmt));
6429               vec_oprnds1.quick_push
6430                 (vect_get_vec_def_for_operand (ops[1], stmt));
6431               if (op_type == ternary_op)
6432                 vec_oprnds2.quick_push
6433                   (vect_get_vec_def_for_operand (ops[2], stmt));
6434             }
6435         }
6436       else
6437         {
6438           if (!slp_node)
6439             {
6440               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6441
6442               if (single_defuse_cycle && reduc_index == 0)
6443                 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6444               else
6445                 vec_oprnds0[0]
6446                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6447               if (single_defuse_cycle && reduc_index == 1)
6448                 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6449               else
6450                 vec_oprnds1[0]
6451                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6452               if (op_type == ternary_op)
6453                 {
6454                   if (single_defuse_cycle && reduc_index == 2)
6455                     vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6456                   else
6457                     vec_oprnds2[0]
6458                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6459                 }
6460             }
6461         }
6462
6463       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6464         {
6465           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6466           if (op_type == ternary_op)
6467             vop[2] = vec_oprnds2[i];
6468
6469           new_temp = make_ssa_name (vec_dest, new_stmt);
6470           new_stmt = gimple_build_assign (new_temp, code,
6471                                           vop[0], vop[1], vop[2]);
6472           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6473
6474           if (slp_node)
6475             {
6476               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6477               vect_defs.quick_push (new_temp);
6478             }
6479           else
6480             vect_defs[0] = new_temp;
6481         }
6482
6483       if (slp_node)
6484         continue;
6485
6486       if (j == 0)
6487         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6488       else
6489         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6490
6491       prev_stmt_info = vinfo_for_stmt (new_stmt);
6492     }
6493
6494   /* Finalize the reduction-phi (set its arguments) and create the
6495      epilog reduction code.  */
6496   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6497     vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6498
6499   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6500                                     epilog_copies,
6501                                     epilog_reduc_code, phis,
6502                                     double_reduc, slp_node, slp_node_instance);
6503
6504   return true;
6505 }
6506
6507 /* Function vect_min_worthwhile_factor.
6508
6509    For a loop where we could vectorize the operation indicated by CODE,
6510    return the minimum vectorization factor that makes it worthwhile
6511    to use generic vectors.  */
6512 int
6513 vect_min_worthwhile_factor (enum tree_code code)
6514 {
6515   switch (code)
6516     {
6517     case PLUS_EXPR:
6518     case MINUS_EXPR:
6519     case NEGATE_EXPR:
6520       return 4;
6521
6522     case BIT_AND_EXPR:
6523     case BIT_IOR_EXPR:
6524     case BIT_XOR_EXPR:
6525     case BIT_NOT_EXPR:
6526       return 2;
6527
6528     default:
6529       return INT_MAX;
6530     }
6531 }
6532
6533
6534 /* Function vectorizable_induction
6535
6536    Check if PHI performs an induction computation that can be vectorized.
6537    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6538    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6539    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6540
6541 bool
6542 vectorizable_induction (gimple *phi,
6543                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6544                         gimple **vec_stmt, slp_tree slp_node)
6545 {
6546   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6547   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6548   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6549   unsigned ncopies;
6550   bool nested_in_vect_loop = false;
6551   struct loop *iv_loop;
6552   tree vec_def;
6553   edge pe = loop_preheader_edge (loop);
6554   basic_block new_bb;
6555   tree new_vec, vec_init, vec_step, t;
6556   tree new_name;
6557   gimple *new_stmt;
6558   gphi *induction_phi;
6559   tree induc_def, vec_dest;
6560   tree init_expr, step_expr;
6561   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6562   unsigned i;
6563   tree expr;
6564   gimple_seq stmts;
6565   imm_use_iterator imm_iter;
6566   use_operand_p use_p;
6567   gimple *exit_phi;
6568   edge latch_e;
6569   tree loop_arg;
6570   gimple_stmt_iterator si;
6571   basic_block bb = gimple_bb (phi);
6572
6573   if (gimple_code (phi) != GIMPLE_PHI)
6574     return false;
6575
6576   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6577     return false;
6578
6579   /* Make sure it was recognized as induction computation.  */
6580   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6581     return false;
6582
6583   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6584   unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6585
6586   if (slp_node)
6587     ncopies = 1;
6588   else
6589     ncopies = vf / nunits;
6590   gcc_assert (ncopies >= 1);
6591
6592   /* FORNOW. These restrictions should be relaxed.  */
6593   if (nested_in_vect_loop_p (loop, phi))
6594     {
6595       imm_use_iterator imm_iter;
6596       use_operand_p use_p;
6597       gimple *exit_phi;
6598       edge latch_e;
6599       tree loop_arg;
6600
6601       if (ncopies > 1)
6602         {
6603           if (dump_enabled_p ())
6604             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6605                              "multiple types in nested loop.\n");
6606           return false;
6607         }
6608
6609       /* FORNOW: outer loop induction with SLP not supported.  */
6610       if (STMT_SLP_TYPE (stmt_info))
6611         return false;
6612
6613       exit_phi = NULL;
6614       latch_e = loop_latch_edge (loop->inner);
6615       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6616       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6617         {
6618           gimple *use_stmt = USE_STMT (use_p);
6619           if (is_gimple_debug (use_stmt))
6620             continue;
6621
6622           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6623             {
6624               exit_phi = use_stmt;
6625               break;
6626             }
6627         }
6628       if (exit_phi)
6629         {
6630           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6631           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6632                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6633             {
6634               if (dump_enabled_p ())
6635                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6636                                  "inner-loop induction only used outside "
6637                                  "of the outer vectorized loop.\n");
6638               return false;
6639             }
6640         }
6641
6642       nested_in_vect_loop = true;
6643       iv_loop = loop->inner;
6644     }
6645   else
6646     iv_loop = loop;
6647   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6648
6649   if (!vec_stmt) /* transformation not required.  */
6650     {
6651       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6652       if (dump_enabled_p ())
6653         dump_printf_loc (MSG_NOTE, vect_location,
6654                          "=== vectorizable_induction ===\n");
6655       vect_model_induction_cost (stmt_info, ncopies);
6656       return true;
6657     }
6658
6659   /* Transform.  */
6660
6661   /* Compute a vector variable, initialized with the first VF values of
6662      the induction variable.  E.g., for an iv with IV_PHI='X' and
6663      evolution S, for a vector of 4 units, we want to compute:
6664      [X, X + S, X + 2*S, X + 3*S].  */
6665
6666   if (dump_enabled_p ())
6667     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6668
6669   latch_e = loop_latch_edge (iv_loop);
6670   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6671
6672   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6673   gcc_assert (step_expr != NULL_TREE);
6674
6675   pe = loop_preheader_edge (iv_loop);
6676   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6677                                      loop_preheader_edge (iv_loop));
6678
6679   /* Convert the step to the desired type.  */
6680   stmts = NULL;
6681   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6682   if (stmts)
6683     {
6684       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6685       gcc_assert (!new_bb);
6686     }
6687
6688   /* Find the first insertion point in the BB.  */
6689   si = gsi_after_labels (bb);
6690
6691   /* For SLP induction we have to generate several IVs as for example
6692      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6693      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
6694      [VF*S, VF*S, VF*S, VF*S] for all.  */
6695   if (slp_node)
6696     {
6697       /* Convert the init to the desired type.  */
6698       stmts = NULL;
6699       init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6700       if (stmts)
6701         {
6702           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6703           gcc_assert (!new_bb);
6704         }
6705
6706       /* Generate [VF*S, VF*S, ... ].  */
6707       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6708         {
6709           expr = build_int_cst (integer_type_node, vf);
6710           expr = fold_convert (TREE_TYPE (step_expr), expr);
6711         }
6712       else
6713         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6714       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6715                               expr, step_expr);
6716       if (! CONSTANT_CLASS_P (new_name))
6717         new_name = vect_init_vector (phi, new_name,
6718                                      TREE_TYPE (step_expr), NULL);
6719       new_vec = build_vector_from_val (vectype, new_name);
6720       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6721
6722       /* Now generate the IVs.  */
6723       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6724       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6725       unsigned elts = nunits * nvects;
6726       unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6727       gcc_assert (elts % group_size == 0);
6728       tree elt = init_expr;
6729       unsigned ivn;
6730       for (ivn = 0; ivn < nivs; ++ivn)
6731         {
6732           tree *elts = XALLOCAVEC (tree, nunits);
6733           bool constant_p = true;
6734           for (unsigned eltn = 0; eltn < nunits; ++eltn)
6735             {
6736               if (ivn*nunits + eltn >= group_size
6737                   && (ivn*nunits + eltn) % group_size == 0)
6738                 {
6739                   stmts = NULL;
6740                   elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6741                                       elt, step_expr);
6742                   if (stmts)
6743                     {
6744                       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6745                       gcc_assert (!new_bb);
6746                     }
6747                 }
6748               if (! CONSTANT_CLASS_P (elt))
6749                 constant_p = false;
6750               elts[eltn] = elt;
6751             }
6752           if (constant_p)
6753             new_vec = build_vector (vectype, elts);
6754           else
6755             {
6756               vec<constructor_elt, va_gc> *v;
6757               vec_alloc (v, nunits);
6758               for (i = 0; i < nunits; ++i)
6759                 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
6760               new_vec = build_constructor (vectype, v);
6761             }
6762           vec_init = vect_init_vector (phi, new_vec, vectype, NULL);
6763
6764           /* Create the induction-phi that defines the induction-operand.  */
6765           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6766           induction_phi = create_phi_node (vec_dest, iv_loop->header);
6767           set_vinfo_for_stmt (induction_phi,
6768                               new_stmt_vec_info (induction_phi, loop_vinfo));
6769           induc_def = PHI_RESULT (induction_phi);
6770
6771           /* Create the iv update inside the loop  */
6772           vec_def = make_ssa_name (vec_dest);
6773           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6774           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6775           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6776
6777           /* Set the arguments of the phi node:  */
6778           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6779           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6780                        UNKNOWN_LOCATION);
6781
6782           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6783         }
6784
6785       /* Re-use IVs when we can.  */
6786       if (ivn < nvects)
6787         {
6788           unsigned vfp
6789             = least_common_multiple (group_size, nunits) / group_size;
6790           /* Generate [VF'*S, VF'*S, ... ].  */
6791           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6792             {
6793               expr = build_int_cst (integer_type_node, vfp);
6794               expr = fold_convert (TREE_TYPE (step_expr), expr);
6795             }
6796           else
6797             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6798           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6799                                   expr, step_expr);
6800           if (! CONSTANT_CLASS_P (new_name))
6801             new_name = vect_init_vector (phi, new_name,
6802                                          TREE_TYPE (step_expr), NULL);
6803           new_vec = build_vector_from_val (vectype, new_name);
6804           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6805           for (; ivn < nvects; ++ivn)
6806             {
6807               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6808               tree def;
6809               if (gimple_code (iv) == GIMPLE_PHI)
6810                 def = gimple_phi_result (iv);
6811               else
6812                 def = gimple_assign_lhs (iv);
6813               new_stmt = gimple_build_assign (make_ssa_name (vectype),
6814                                               PLUS_EXPR,
6815                                               def, vec_step);
6816               if (gimple_code (iv) == GIMPLE_PHI)
6817                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6818               else
6819                 {
6820                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6821                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6822                 }
6823               set_vinfo_for_stmt (new_stmt,
6824                                   new_stmt_vec_info (new_stmt, loop_vinfo));
6825               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6826             }
6827         }
6828
6829       return true;
6830     }
6831
6832   /* Create the vector that holds the initial_value of the induction.  */
6833   if (nested_in_vect_loop)
6834     {
6835       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
6836          been created during vectorization of previous stmts.  We obtain it
6837          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
6838       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6839       /* If the initial value is not of proper type, convert it.  */
6840       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6841         {
6842           new_stmt
6843             = gimple_build_assign (vect_get_new_ssa_name (vectype,
6844                                                           vect_simple_var,
6845                                                           "vec_iv_"),
6846                                    VIEW_CONVERT_EXPR,
6847                                    build1 (VIEW_CONVERT_EXPR, vectype,
6848                                            vec_init));
6849           vec_init = gimple_assign_lhs (new_stmt);
6850           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6851                                                  new_stmt);
6852           gcc_assert (!new_bb);
6853           set_vinfo_for_stmt (new_stmt,
6854                               new_stmt_vec_info (new_stmt, loop_vinfo));
6855         }
6856     }
6857   else
6858     {
6859       vec<constructor_elt, va_gc> *v;
6860
6861       /* iv_loop is the loop to be vectorized. Create:
6862          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
6863       stmts = NULL;
6864       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6865
6866       vec_alloc (v, nunits);
6867       bool constant_p = is_gimple_min_invariant (new_name);
6868       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
6869       for (i = 1; i < nunits; i++)
6870         {
6871           /* Create: new_name_i = new_name + step_expr  */
6872           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6873                                    new_name, step_expr);
6874           if (!is_gimple_min_invariant (new_name))
6875             constant_p = false;
6876           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
6877         }
6878       if (stmts)
6879         {
6880           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6881           gcc_assert (!new_bb);
6882         }
6883
6884       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
6885       if (constant_p)
6886         new_vec = build_vector_from_ctor (vectype, v);
6887       else
6888         new_vec = build_constructor (vectype, v);
6889       vec_init = vect_init_vector (phi, new_vec, vectype, NULL);
6890     }
6891
6892
6893   /* Create the vector that holds the step of the induction.  */
6894   if (nested_in_vect_loop)
6895     /* iv_loop is nested in the loop to be vectorized. Generate:
6896        vec_step = [S, S, S, S]  */
6897     new_name = step_expr;
6898   else
6899     {
6900       /* iv_loop is the loop to be vectorized. Generate:
6901           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
6902       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6903         {
6904           expr = build_int_cst (integer_type_node, vf);
6905           expr = fold_convert (TREE_TYPE (step_expr), expr);
6906         }
6907       else
6908         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6909       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6910                               expr, step_expr);
6911       if (TREE_CODE (step_expr) == SSA_NAME)
6912         new_name = vect_init_vector (phi, new_name,
6913                                      TREE_TYPE (step_expr), NULL);
6914     }
6915
6916   t = unshare_expr (new_name);
6917   gcc_assert (CONSTANT_CLASS_P (new_name)
6918               || TREE_CODE (new_name) == SSA_NAME);
6919   new_vec = build_vector_from_val (vectype, t);
6920   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6921
6922
6923   /* Create the following def-use cycle:
6924      loop prolog:
6925          vec_init = ...
6926          vec_step = ...
6927      loop:
6928          vec_iv = PHI <vec_init, vec_loop>
6929          ...
6930          STMT
6931          ...
6932          vec_loop = vec_iv + vec_step;  */
6933
6934   /* Create the induction-phi that defines the induction-operand.  */
6935   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6936   induction_phi = create_phi_node (vec_dest, iv_loop->header);
6937   set_vinfo_for_stmt (induction_phi,
6938                       new_stmt_vec_info (induction_phi, loop_vinfo));
6939   induc_def = PHI_RESULT (induction_phi);
6940
6941   /* Create the iv update inside the loop  */
6942   vec_def = make_ssa_name (vec_dest);
6943   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6944   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6945   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6946
6947   /* Set the arguments of the phi node:  */
6948   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6949   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6950                UNKNOWN_LOCATION);
6951
6952   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
6953
6954   /* In case that vectorization factor (VF) is bigger than the number
6955      of elements that we can fit in a vectype (nunits), we have to generate
6956      more than one vector stmt - i.e - we need to "unroll" the
6957      vector stmt by a factor VF/nunits.  For more details see documentation
6958      in vectorizable_operation.  */
6959
6960   if (ncopies > 1)
6961     {
6962       stmt_vec_info prev_stmt_vinfo;
6963       /* FORNOW. This restriction should be relaxed.  */
6964       gcc_assert (!nested_in_vect_loop);
6965
6966       /* Create the vector that holds the step of the induction.  */
6967       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6968         {
6969           expr = build_int_cst (integer_type_node, nunits);
6970           expr = fold_convert (TREE_TYPE (step_expr), expr);
6971         }
6972       else
6973         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
6974       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6975                               expr, step_expr);
6976       if (TREE_CODE (step_expr) == SSA_NAME)
6977         new_name = vect_init_vector (phi, new_name,
6978                                      TREE_TYPE (step_expr), NULL);
6979       t = unshare_expr (new_name);
6980       gcc_assert (CONSTANT_CLASS_P (new_name)
6981                   || TREE_CODE (new_name) == SSA_NAME);
6982       new_vec = build_vector_from_val (vectype, t);
6983       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6984
6985       vec_def = induc_def;
6986       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
6987       for (i = 1; i < ncopies; i++)
6988         {
6989           /* vec_i = vec_prev + vec_step  */
6990           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
6991                                           vec_def, vec_step);
6992           vec_def = make_ssa_name (vec_dest, new_stmt);
6993           gimple_assign_set_lhs (new_stmt, vec_def);
6994
6995           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6996           set_vinfo_for_stmt (new_stmt,
6997                               new_stmt_vec_info (new_stmt, loop_vinfo));
6998           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
6999           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7000         }
7001     }
7002
7003   if (nested_in_vect_loop)
7004     {
7005       /* Find the loop-closed exit-phi of the induction, and record
7006          the final vector of induction results:  */
7007       exit_phi = NULL;
7008       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7009         {
7010           gimple *use_stmt = USE_STMT (use_p);
7011           if (is_gimple_debug (use_stmt))
7012             continue;
7013
7014           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7015             {
7016               exit_phi = use_stmt;
7017               break;
7018             }
7019         }
7020       if (exit_phi)
7021         {
7022           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7023           /* FORNOW. Currently not supporting the case that an inner-loop induction
7024              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7025           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7026                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7027
7028           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7029           if (dump_enabled_p ())
7030             {
7031               dump_printf_loc (MSG_NOTE, vect_location,
7032                                "vector of inductions after inner-loop:");
7033               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7034             }
7035         }
7036     }
7037
7038
7039   if (dump_enabled_p ())
7040     {
7041       dump_printf_loc (MSG_NOTE, vect_location,
7042                        "transform induction: created def-use cycle: ");
7043       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7044       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7045                         SSA_NAME_DEF_STMT (vec_def), 0);
7046     }
7047
7048   return true;
7049 }
7050
7051 /* Function vectorizable_live_operation.
7052
7053    STMT computes a value that is used outside the loop.  Check if
7054    it can be supported.  */
7055
7056 bool
7057 vectorizable_live_operation (gimple *stmt,
7058                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7059                              slp_tree slp_node, int slp_index,
7060                              gimple **vec_stmt)
7061 {
7062   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7063   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7064   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7065   imm_use_iterator imm_iter;
7066   tree lhs, lhs_type, bitsize, vec_bitsize;
7067   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7068   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
7069   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
7070   gimple *use_stmt;
7071   auto_vec<tree> vec_oprnds;
7072
7073   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7074
7075   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7076     return false;
7077
7078   /* FORNOW.  CHECKME.  */
7079   if (nested_in_vect_loop_p (loop, stmt))
7080     return false;
7081
7082   /* If STMT is not relevant and it is a simple assignment and its inputs are
7083      invariant then it can remain in place, unvectorized.  The original last
7084      scalar value that it computes will be used.  */
7085   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7086     {
7087       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7088       if (dump_enabled_p ())
7089         dump_printf_loc (MSG_NOTE, vect_location,
7090                          "statement is simple and uses invariant.  Leaving in "
7091                          "place.\n");
7092       return true;
7093     }
7094
7095   if (!vec_stmt)
7096     /* No transformation required.  */
7097     return true;
7098
7099   /* If stmt has a related stmt, then use that for getting the lhs.  */
7100   if (is_pattern_stmt_p (stmt_info))
7101     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7102
7103   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7104         : gimple_get_lhs (stmt);
7105   lhs_type = TREE_TYPE (lhs);
7106
7107   bitsize = TYPE_SIZE (TREE_TYPE (vectype));
7108   vec_bitsize = TYPE_SIZE (vectype);
7109
7110   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7111   tree vec_lhs, bitstart;
7112   if (slp_node)
7113     {
7114       gcc_assert (slp_index >= 0);
7115
7116       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7117       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7118
7119       /* Get the last occurrence of the scalar index from the concatenation of
7120          all the slp vectors. Calculate which slp vector it is and the index
7121          within.  */
7122       int pos = (num_vec * nunits) - num_scalar + slp_index;
7123       int vec_entry = pos / nunits;
7124       int vec_index = pos % nunits;
7125
7126       /* Get the correct slp vectorized stmt.  */
7127       vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7128
7129       /* Get entry to use.  */
7130       bitstart = build_int_cst (unsigned_type_node, vec_index);
7131       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7132     }
7133   else
7134     {
7135       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7136       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7137
7138       /* For multiple copies, get the last copy.  */
7139       for (int i = 1; i < ncopies; ++i)
7140         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7141                                                   vec_lhs);
7142
7143       /* Get the last lane in the vector.  */
7144       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7145     }
7146
7147   /* Create a new vectorized stmt for the uses of STMT and insert outside the
7148      loop.  */
7149   gimple_seq stmts = NULL;
7150   tree bftype = TREE_TYPE (vectype);
7151   if (VECTOR_BOOLEAN_TYPE_P (vectype))
7152     bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7153   tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7154   new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7155                                    true, NULL_TREE);
7156   if (stmts)
7157     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7158
7159   /* Replace use of lhs with newly computed result.  If the use stmt is a
7160      single arg PHI, just replace all uses of PHI result.  It's necessary
7161      because lcssa PHI defining lhs may be before newly inserted stmt.  */
7162   use_operand_p use_p;
7163   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7164     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7165         && !is_gimple_debug (use_stmt))
7166     {
7167       if (gimple_code (use_stmt) == GIMPLE_PHI
7168           && gimple_phi_num_args (use_stmt) == 1)
7169         {
7170           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7171         }
7172       else
7173         {
7174           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7175             SET_USE (use_p, new_tree);
7176         }
7177       update_stmt (use_stmt);
7178     }
7179
7180   return true;
7181 }
7182
7183 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
7184
7185 static void
7186 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7187 {
7188   ssa_op_iter op_iter;
7189   imm_use_iterator imm_iter;
7190   def_operand_p def_p;
7191   gimple *ustmt;
7192
7193   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7194     {
7195       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7196         {
7197           basic_block bb;
7198
7199           if (!is_gimple_debug (ustmt))
7200             continue;
7201
7202           bb = gimple_bb (ustmt);
7203
7204           if (!flow_bb_inside_loop_p (loop, bb))
7205             {
7206               if (gimple_debug_bind_p (ustmt))
7207                 {
7208                   if (dump_enabled_p ())
7209                     dump_printf_loc (MSG_NOTE, vect_location,
7210                                      "killing debug use\n");
7211
7212                   gimple_debug_bind_reset_value (ustmt);
7213                   update_stmt (ustmt);
7214                 }
7215               else
7216                 gcc_unreachable ();
7217             }
7218         }
7219     }
7220 }
7221
7222 /* Given loop represented by LOOP_VINFO, return true if computation of
7223    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7224    otherwise.  */
7225
7226 static bool
7227 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7228 {
7229   /* Constant case.  */
7230   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7231     {
7232       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7233       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7234
7235       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7236       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7237       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7238         return true;
7239     }
7240
7241   widest_int max;
7242   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7243   /* Check the upper bound of loop niters.  */
7244   if (get_max_loop_iterations (loop, &max))
7245     {
7246       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7247       signop sgn = TYPE_SIGN (type);
7248       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7249       if (max < type_max)
7250         return true;
7251     }
7252   return false;
7253 }
7254
7255 /* Scale profiling counters by estimation for LOOP which is vectorized
7256    by factor VF.  */
7257
7258 static void
7259 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7260 {
7261   edge preheader = loop_preheader_edge (loop);
7262   /* Reduce loop iterations by the vectorization factor.  */
7263   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7264   profile_count freq_h = loop->header->count, freq_e = preheader->count;
7265
7266   /* Use frequency only if counts are zero.  */
7267   if (!(freq_h > 0) && !(freq_e > 0))
7268     {
7269       freq_h = profile_count::from_gcov_type (loop->header->frequency);
7270       freq_e = profile_count::from_gcov_type (EDGE_FREQUENCY (preheader));
7271     }
7272   if (freq_h > 0)
7273     {
7274       profile_probability p;
7275
7276       /* Avoid dropping loop body profile counter to 0 because of zero count
7277          in loop's preheader.  */
7278       if (!(freq_e > profile_count::from_gcov_type (1)))
7279        freq_e = profile_count::from_gcov_type (1);
7280       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7281       scale_loop_frequencies (loop, p);
7282     }
7283
7284   basic_block exit_bb = single_pred (loop->latch);
7285   edge exit_e = single_exit (loop);
7286   exit_e->count = loop_preheader_edge (loop)->count;
7287   exit_e->probability = profile_probability::always ()
7288                                  .apply_scale (1, new_est_niter + 1);
7289
7290   edge exit_l = single_pred_edge (loop->latch);
7291   profile_probability prob = exit_l->probability;
7292   exit_l->probability = exit_e->probability.invert ();
7293   exit_l->count = exit_bb->count - exit_e->count;
7294   if (prob.initialized_p () && exit_l->probability.initialized_p ())
7295     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7296 }
7297
7298 /* Function vect_transform_loop.
7299
7300    The analysis phase has determined that the loop is vectorizable.
7301    Vectorize the loop - created vectorized stmts to replace the scalar
7302    stmts in the loop, and update the loop exit condition.
7303    Returns scalar epilogue loop if any.  */
7304
7305 struct loop *
7306 vect_transform_loop (loop_vec_info loop_vinfo)
7307 {
7308   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7309   struct loop *epilogue = NULL;
7310   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7311   int nbbs = loop->num_nodes;
7312   int i;
7313   tree niters_vector = NULL;
7314   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7315   bool grouped_store;
7316   bool slp_scheduled = false;
7317   gimple *stmt, *pattern_stmt;
7318   gimple_seq pattern_def_seq = NULL;
7319   gimple_stmt_iterator pattern_def_si = gsi_none ();
7320   bool transform_pattern_stmt = false;
7321   bool check_profitability = false;
7322   int th;
7323
7324   if (dump_enabled_p ())
7325     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
7326
7327   /* Use the more conservative vectorization threshold.  If the number
7328      of iterations is constant assume the cost check has been performed
7329      by our caller.  If the threshold makes all loops profitable that
7330      run at least the vectorization factor number of times checking
7331      is pointless, too.  */
7332   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7333   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7334       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7335     {
7336       if (dump_enabled_p ())
7337         dump_printf_loc (MSG_NOTE, vect_location,
7338                          "Profitability threshold is %d loop iterations.\n",
7339                          th);
7340       check_profitability = true;
7341     }
7342
7343   /* Make sure there exists a single-predecessor exit bb.  Do this before
7344      versioning.   */
7345   edge e = single_exit (loop);
7346   if (! single_pred_p (e->dest))
7347     {
7348       split_loop_exit_edge (e);
7349       if (dump_enabled_p ())
7350         dump_printf (MSG_NOTE, "split exit edge\n");
7351     }
7352
7353   /* Version the loop first, if required, so the profitability check
7354      comes first.  */
7355
7356   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7357     {
7358       vect_loop_versioning (loop_vinfo, th, check_profitability);
7359       check_profitability = false;
7360     }
7361
7362   /* Make sure there exists a single-predecessor exit bb also on the
7363      scalar loop copy.  Do this after versioning but before peeling
7364      so CFG structure is fine for both scalar and if-converted loop
7365      to make slpeel_duplicate_current_defs_from_edges face matched
7366      loop closed PHI nodes on the exit.  */
7367   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7368     {
7369       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7370       if (! single_pred_p (e->dest))
7371         {
7372           split_loop_exit_edge (e);
7373           if (dump_enabled_p ())
7374             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7375         }
7376     }
7377
7378   tree niters = vect_build_loop_niters (loop_vinfo);
7379   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7380   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7381   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7382   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
7383                               check_profitability, niters_no_overflow);
7384   if (niters_vector == NULL_TREE)
7385     {
7386       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7387         niters_vector
7388           = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7389                            LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
7390       else
7391         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7392                                      niters_no_overflow);
7393     }
7394
7395   /* 1) Make sure the loop header has exactly two entries
7396      2) Make sure we have a preheader basic block.  */
7397
7398   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7399
7400   split_edge (loop_preheader_edge (loop));
7401
7402   /* FORNOW: the vectorizer supports only loops which body consist
7403      of one basic block (header + empty latch). When the vectorizer will
7404      support more involved loop forms, the order by which the BBs are
7405      traversed need to be reconsidered.  */
7406
7407   for (i = 0; i < nbbs; i++)
7408     {
7409       basic_block bb = bbs[i];
7410       stmt_vec_info stmt_info;
7411
7412       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7413            gsi_next (&si))
7414         {
7415           gphi *phi = si.phi ();
7416           if (dump_enabled_p ())
7417             {
7418               dump_printf_loc (MSG_NOTE, vect_location,
7419                                "------>vectorizing phi: ");
7420               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7421             }
7422           stmt_info = vinfo_for_stmt (phi);
7423           if (!stmt_info)
7424             continue;
7425
7426           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7427             vect_loop_kill_debug_uses (loop, phi);
7428
7429           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7430               && !STMT_VINFO_LIVE_P (stmt_info))
7431             continue;
7432
7433           if (STMT_VINFO_VECTYPE (stmt_info)
7434               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7435                   != (unsigned HOST_WIDE_INT) vf)
7436               && dump_enabled_p ())
7437             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7438
7439           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7440                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7441                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7442               && ! PURE_SLP_STMT (stmt_info))
7443             {
7444               if (dump_enabled_p ())
7445                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7446               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7447             }
7448         }
7449
7450       pattern_stmt = NULL;
7451       for (gimple_stmt_iterator si = gsi_start_bb (bb);
7452            !gsi_end_p (si) || transform_pattern_stmt;)
7453         {
7454           bool is_store;
7455
7456           if (transform_pattern_stmt)
7457             stmt = pattern_stmt;
7458           else
7459             {
7460               stmt = gsi_stmt (si);
7461               /* During vectorization remove existing clobber stmts.  */
7462               if (gimple_clobber_p (stmt))
7463                 {
7464                   unlink_stmt_vdef (stmt);
7465                   gsi_remove (&si, true);
7466                   release_defs (stmt);
7467                   continue;
7468                 }
7469             }
7470
7471           if (dump_enabled_p ())
7472             {
7473               dump_printf_loc (MSG_NOTE, vect_location,
7474                                "------>vectorizing statement: ");
7475               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7476             }
7477
7478           stmt_info = vinfo_for_stmt (stmt);
7479
7480           /* vector stmts created in the outer-loop during vectorization of
7481              stmts in an inner-loop may not have a stmt_info, and do not
7482              need to be vectorized.  */
7483           if (!stmt_info)
7484             {
7485               gsi_next (&si);
7486               continue;
7487             }
7488
7489           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7490             vect_loop_kill_debug_uses (loop, stmt);
7491
7492           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7493               && !STMT_VINFO_LIVE_P (stmt_info))
7494             {
7495               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7496                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7497                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7498                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7499                 {
7500                   stmt = pattern_stmt;
7501                   stmt_info = vinfo_for_stmt (stmt);
7502                 }
7503               else
7504                 {
7505                   gsi_next (&si);
7506                   continue;
7507                 }
7508             }
7509           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7510                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7511                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7512                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7513             transform_pattern_stmt = true;
7514
7515           /* If pattern statement has def stmts, vectorize them too.  */
7516           if (is_pattern_stmt_p (stmt_info))
7517             {
7518               if (pattern_def_seq == NULL)
7519                 {
7520                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7521                   pattern_def_si = gsi_start (pattern_def_seq);
7522                 }
7523               else if (!gsi_end_p (pattern_def_si))
7524                 gsi_next (&pattern_def_si);
7525               if (pattern_def_seq != NULL)
7526                 {
7527                   gimple *pattern_def_stmt = NULL;
7528                   stmt_vec_info pattern_def_stmt_info = NULL;
7529
7530                   while (!gsi_end_p (pattern_def_si))
7531                     {
7532                       pattern_def_stmt = gsi_stmt (pattern_def_si);
7533                       pattern_def_stmt_info
7534                         = vinfo_for_stmt (pattern_def_stmt);
7535                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7536                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7537                         break;
7538                       gsi_next (&pattern_def_si);
7539                     }
7540
7541                   if (!gsi_end_p (pattern_def_si))
7542                     {
7543                       if (dump_enabled_p ())
7544                         {
7545                           dump_printf_loc (MSG_NOTE, vect_location,
7546                                            "==> vectorizing pattern def "
7547                                            "stmt: ");
7548                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7549                                             pattern_def_stmt, 0);
7550                         }
7551
7552                       stmt = pattern_def_stmt;
7553                       stmt_info = pattern_def_stmt_info;
7554                     }
7555                   else
7556                     {
7557                       pattern_def_si = gsi_none ();
7558                       transform_pattern_stmt = false;
7559                     }
7560                 }
7561               else
7562                 transform_pattern_stmt = false;
7563             }
7564
7565           if (STMT_VINFO_VECTYPE (stmt_info))
7566             {
7567               unsigned int nunits
7568                 = (unsigned int)
7569                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7570               if (!STMT_SLP_TYPE (stmt_info)
7571                   && nunits != (unsigned int) vf
7572                   && dump_enabled_p ())
7573                   /* For SLP VF is set according to unrolling factor, and not
7574                      to vector size, hence for SLP this print is not valid.  */
7575                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7576             }
7577
7578           /* SLP. Schedule all the SLP instances when the first SLP stmt is
7579              reached.  */
7580           if (STMT_SLP_TYPE (stmt_info))
7581             {
7582               if (!slp_scheduled)
7583                 {
7584                   slp_scheduled = true;
7585
7586                   if (dump_enabled_p ())
7587                     dump_printf_loc (MSG_NOTE, vect_location,
7588                                      "=== scheduling SLP instances ===\n");
7589
7590                   vect_schedule_slp (loop_vinfo);
7591                 }
7592
7593               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
7594               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7595                 {
7596                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7597                     {
7598                       pattern_def_seq = NULL;
7599                       gsi_next (&si);
7600                     }
7601                   continue;
7602                 }
7603             }
7604
7605           /* -------- vectorize statement ------------ */
7606           if (dump_enabled_p ())
7607             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7608
7609           grouped_store = false;
7610           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7611           if (is_store)
7612             {
7613               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7614                 {
7615                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7616                      interleaving chain was completed - free all the stores in
7617                      the chain.  */
7618                   gsi_next (&si);
7619                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7620                 }
7621               else
7622                 {
7623                   /* Free the attached stmt_vec_info and remove the stmt.  */
7624                   gimple *store = gsi_stmt (si);
7625                   free_stmt_vec_info (store);
7626                   unlink_stmt_vdef (store);
7627                   gsi_remove (&si, true);
7628                   release_defs (store);
7629                 }
7630
7631               /* Stores can only appear at the end of pattern statements.  */
7632               gcc_assert (!transform_pattern_stmt);
7633               pattern_def_seq = NULL;
7634             }
7635           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7636             {
7637               pattern_def_seq = NULL;
7638               gsi_next (&si);
7639             }
7640         }                       /* stmts in BB */
7641     }                           /* BBs in loop */
7642
7643   slpeel_make_loop_iterate_ntimes (loop, niters_vector);
7644
7645   scale_profile_for_vect_loop (loop, vf);
7646
7647   /* The minimum number of iterations performed by the epilogue.  This
7648      is 1 when peeling for gaps because we always need a final scalar
7649      iteration.  */
7650   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7651   /* +1 to convert latch counts to loop iteration counts,
7652      -min_epilogue_iters to remove iterations that cannot be performed
7653        by the vector code.  */
7654   int bias = 1 - min_epilogue_iters;
7655   /* In these calculations the "- 1" converts loop iteration counts
7656      back to latch counts.  */
7657   if (loop->any_upper_bound)
7658     loop->nb_iterations_upper_bound
7659       = wi::udiv_floor (loop->nb_iterations_upper_bound + bias, vf) - 1;
7660   if (loop->any_likely_upper_bound)
7661     loop->nb_iterations_likely_upper_bound
7662       = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, vf) - 1;
7663   if (loop->any_estimate)
7664     loop->nb_iterations_estimate
7665       = wi::udiv_floor (loop->nb_iterations_estimate + bias, vf) - 1;
7666
7667   if (dump_enabled_p ())
7668     {
7669       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7670         {
7671           dump_printf_loc (MSG_NOTE, vect_location,
7672                            "LOOP VECTORIZED\n");
7673           if (loop->inner)
7674             dump_printf_loc (MSG_NOTE, vect_location,
7675                              "OUTER LOOP VECTORIZED\n");
7676           dump_printf (MSG_NOTE, "\n");
7677         }
7678       else
7679         dump_printf_loc (MSG_NOTE, vect_location,
7680                          "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
7681                          current_vector_size);
7682     }
7683
7684   /* Free SLP instances here because otherwise stmt reference counting
7685      won't work.  */
7686   slp_instance instance;
7687   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7688     vect_free_slp_instance (instance);
7689   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7690   /* Clear-up safelen field since its value is invalid after vectorization
7691      since vectorized loop can have loop-carried dependencies.  */
7692   loop->safelen = 0;
7693
7694   /* Don't vectorize epilogue for epilogue.  */
7695   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7696     epilogue = NULL;
7697
7698   if (epilogue)
7699     {
7700         unsigned int vector_sizes
7701           = targetm.vectorize.autovectorize_vector_sizes ();
7702         vector_sizes &= current_vector_size - 1;
7703
7704         if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7705           epilogue = NULL;
7706         else if (!vector_sizes)
7707           epilogue = NULL;
7708         else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7709                  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
7710           {
7711             int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
7712             int ratio = current_vector_size / smallest_vec_size;
7713             int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
7714               - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
7715             eiters = eiters % vf;
7716
7717             epilogue->nb_iterations_upper_bound = eiters - 1;
7718
7719             if (eiters < vf / ratio)
7720               epilogue = NULL;
7721             }
7722     }
7723
7724   if (epilogue)
7725     {
7726       epilogue->force_vectorize = loop->force_vectorize;
7727       epilogue->safelen = loop->safelen;
7728       epilogue->dont_vectorize = false;
7729
7730       /* We may need to if-convert epilogue to vectorize it.  */
7731       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7732         tree_if_conversion (epilogue);
7733     }
7734
7735   return epilogue;
7736 }
7737
7738 /* The code below is trying to perform simple optimization - revert
7739    if-conversion for masked stores, i.e. if the mask of a store is zero
7740    do not perform it and all stored value producers also if possible.
7741    For example,
7742      for (i=0; i<n; i++)
7743        if (c[i])
7744         {
7745           p1[i] += 1;
7746           p2[i] = p3[i] +2;
7747         }
7748    this transformation will produce the following semi-hammock:
7749
7750    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7751      {
7752        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7753        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7754        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7755        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7756        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7757        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7758      }
7759 */
7760
7761 void
7762 optimize_mask_stores (struct loop *loop)
7763 {
7764   basic_block *bbs = get_loop_body (loop);
7765   unsigned nbbs = loop->num_nodes;
7766   unsigned i;
7767   basic_block bb;
7768   struct loop *bb_loop;
7769   gimple_stmt_iterator gsi;
7770   gimple *stmt;
7771   auto_vec<gimple *> worklist;
7772
7773   vect_location = find_loop_location (loop);
7774   /* Pick up all masked stores in loop if any.  */
7775   for (i = 0; i < nbbs; i++)
7776     {
7777       bb = bbs[i];
7778       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7779            gsi_next (&gsi))
7780         {
7781           stmt = gsi_stmt (gsi);
7782           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7783             worklist.safe_push (stmt);
7784         }
7785     }
7786
7787   free (bbs);
7788   if (worklist.is_empty ())
7789     return;
7790
7791   /* Loop has masked stores.  */
7792   while (!worklist.is_empty ())
7793     {
7794       gimple *last, *last_store;
7795       edge e, efalse;
7796       tree mask;
7797       basic_block store_bb, join_bb;
7798       gimple_stmt_iterator gsi_to;
7799       tree vdef, new_vdef;
7800       gphi *phi;
7801       tree vectype;
7802       tree zero;
7803
7804       last = worklist.pop ();
7805       mask = gimple_call_arg (last, 2);
7806       bb = gimple_bb (last);
7807       /* Create then_bb and if-then structure in CFG, then_bb belongs to
7808          the same loop as if_bb.  It could be different to LOOP when two
7809          level loop-nest is vectorized and mask_store belongs to the inner
7810          one.  */
7811       e = split_block (bb, last);
7812       bb_loop = bb->loop_father;
7813       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7814       join_bb = e->dest;
7815       store_bb = create_empty_bb (bb);
7816       add_bb_to_loop (store_bb, bb_loop);
7817       e->flags = EDGE_TRUE_VALUE;
7818       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7819       /* Put STORE_BB to likely part.  */
7820       efalse->probability = profile_probability::unlikely ();
7821       store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
7822       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7823       if (dom_info_available_p (CDI_DOMINATORS))
7824         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7825       if (dump_enabled_p ())
7826         dump_printf_loc (MSG_NOTE, vect_location,
7827                          "Create new block %d to sink mask stores.",
7828                          store_bb->index);
7829       /* Create vector comparison with boolean result.  */
7830       vectype = TREE_TYPE (mask);
7831       zero = build_zero_cst (vectype);
7832       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7833       gsi = gsi_last_bb (bb);
7834       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7835       /* Create new PHI node for vdef of the last masked store:
7836          .MEM_2 = VDEF <.MEM_1>
7837          will be converted to
7838          .MEM.3 = VDEF <.MEM_1>
7839          and new PHI node will be created in join bb
7840          .MEM_2 = PHI <.MEM_1, .MEM_3>
7841       */
7842       vdef = gimple_vdef (last);
7843       new_vdef = make_ssa_name (gimple_vop (cfun), last);
7844       gimple_set_vdef (last, new_vdef);
7845       phi = create_phi_node (vdef, join_bb);
7846       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7847
7848       /* Put all masked stores with the same mask to STORE_BB if possible.  */
7849       while (true)
7850         {
7851           gimple_stmt_iterator gsi_from;
7852           gimple *stmt1 = NULL;
7853
7854           /* Move masked store to STORE_BB.  */
7855           last_store = last;
7856           gsi = gsi_for_stmt (last);
7857           gsi_from = gsi;
7858           /* Shift GSI to the previous stmt for further traversal.  */
7859           gsi_prev (&gsi);
7860           gsi_to = gsi_start_bb (store_bb);
7861           gsi_move_before (&gsi_from, &gsi_to);
7862           /* Setup GSI_TO to the non-empty block start.  */
7863           gsi_to = gsi_start_bb (store_bb);
7864           if (dump_enabled_p ())
7865             {
7866               dump_printf_loc (MSG_NOTE, vect_location,
7867                                "Move stmt to created bb\n");
7868               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7869             }
7870           /* Move all stored value producers if possible.  */
7871           while (!gsi_end_p (gsi))
7872             {
7873               tree lhs;
7874               imm_use_iterator imm_iter;
7875               use_operand_p use_p;
7876               bool res;
7877
7878               /* Skip debug statements.  */
7879               if (is_gimple_debug (gsi_stmt (gsi)))
7880                 {
7881                   gsi_prev (&gsi);
7882                   continue;
7883                 }
7884               stmt1 = gsi_stmt (gsi);
7885               /* Do not consider statements writing to memory or having
7886                  volatile operand.  */
7887               if (gimple_vdef (stmt1)
7888                   || gimple_has_volatile_ops (stmt1))
7889                 break;
7890               gsi_from = gsi;
7891               gsi_prev (&gsi);
7892               lhs = gimple_get_lhs (stmt1);
7893               if (!lhs)
7894                 break;
7895
7896               /* LHS of vectorized stmt must be SSA_NAME.  */
7897               if (TREE_CODE (lhs) != SSA_NAME)
7898                 break;
7899
7900               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7901                 {
7902                   /* Remove dead scalar statement.  */
7903                   if (has_zero_uses (lhs))
7904                     {
7905                       gsi_remove (&gsi_from, true);
7906                       continue;
7907                     }
7908                 }
7909
7910               /* Check that LHS does not have uses outside of STORE_BB.  */
7911               res = true;
7912               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7913                 {
7914                   gimple *use_stmt;
7915                   use_stmt = USE_STMT (use_p);
7916                   if (is_gimple_debug (use_stmt))
7917                     continue;
7918                   if (gimple_bb (use_stmt) != store_bb)
7919                     {
7920                       res = false;
7921                       break;
7922                     }
7923                 }
7924               if (!res)
7925                 break;
7926
7927               if (gimple_vuse (stmt1)
7928                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
7929                 break;
7930
7931               /* Can move STMT1 to STORE_BB.  */
7932               if (dump_enabled_p ())
7933                 {
7934                   dump_printf_loc (MSG_NOTE, vect_location,
7935                                    "Move stmt to created bb\n");
7936                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7937                 }
7938               gsi_move_before (&gsi_from, &gsi_to);
7939               /* Shift GSI_TO for further insertion.  */
7940               gsi_prev (&gsi_to);
7941             }
7942           /* Put other masked stores with the same mask to STORE_BB.  */
7943           if (worklist.is_empty ()
7944               || gimple_call_arg (worklist.last (), 2) != mask
7945               || worklist.last () != stmt1)
7946             break;
7947           last = worklist.pop ();
7948         }
7949       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
7950     }
7951 }