gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53
  54 /* Loop Vectorization Pass.
  55
  56    This pass tries to vectorize loops.
  57
  58    For example, the vectorizer transforms the following simple loop:
  59
  60         short a[N]; short b[N]; short c[N]; int i;
  61
  62         for (i=0; i<N; i++){
  63           a[i] = b[i] + c[i];
  64         }
  65
  66    as if it was manually vectorized by rewriting the source code into:
  67
  68         typedef int __attribute__((mode(V8HI))) v8hi;
  69         short a[N];  short b[N]; short c[N];   int i;
  70         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  71         v8hi va, vb, vc;
  72
  73         for (i=0; i<N/8; i++){
  74           vb = pb[i];
  75           vc = pc[i];
  76           va = vb + vc;
  77           pa[i] = va;
  78         }
  79
  80         The main entry to this pass is vectorize_loops(), in which
  81    the vectorizer applies a set of analyses on a given set of loops,
  82    followed by the actual vectorization transformation for the loops that
  83    had successfully passed the analysis phase.
  84         Throughout this pass we make a distinction between two types of
  85    data: scalars (which are represented by SSA_NAMES), and memory references
  86    ("data-refs").  These two types of data require different handling both
  87    during analysis and transformation.  The types of data-refs that the
  88    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  89    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  90    accesses are required to have a simple (consecutive) access pattern.
  91
  92    Analysis phase:
  93    ===============
  94         The driver for the analysis phase is vect_analyze_loop().
  95    It applies a set of analyses, some of which rely on the scalar evolution
  96    analyzer (scev) developed by Sebastian Pop.
  97
  98         During the analysis phase the vectorizer records some information
  99    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 100    loop, as well as general information about the loop as a whole, which is
 101    recorded in a "loop_vec_info" struct attached to each loop.
 102
 103    Transformation phase:
 104    =====================
 105         The loop transformation phase scans all the stmts in the loop, and
 106    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 107    the loop that needs to be vectorized.  It inserts the vector code sequence
 108    just before the scalar stmt S, and records a pointer to the vector code
 109    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 110    attached to S).  This pointer will be used for the vectorization of following
 111    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 112    otherwise, we rely on dead code elimination for removing it.
 113
 114         For example, say stmt S1 was vectorized into stmt VS1:
 115
 116    VS1: vb = px[i];
 117    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 118    S2:  a = b;
 119
 120    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 121    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 122    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 123    resulting sequence would be:
 124
 125    VS1: vb = px[i];
 126    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 127    VS2: va = vb;
 128    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 129
 130         Operands that are not SSA_NAMEs, are data-refs that appear in
 131    load/store operations (like 'x[i]' in S1), and are handled differently.
 132
 133    Target modeling:
 134    =================
 135         Currently the only target specific information that is used is the
 136    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 137    Targets that can support different sizes of vectors, for now will need
 138    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 139    flexibility will be added in the future.
 140
 141         Since we only vectorize operations whose vector form can be
 142    expressed using existing tree codes, to verify that an operation is
 143    supported, the vectorizer checks the relevant optab at the relevant
 144    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 145    the value found is CODE_FOR_nothing, then there's no target support, and
 146    we can't vectorize the stmt.
 147
 148    For additional information on this project see:
 149    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 150 */
 151
 152 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 153
 154 /* Function vect_determine_vectorization_factor
 155
 156    Determine the vectorization factor (VF).  VF is the number of data elements
 157    that are operated upon in parallel in a single iteration of the vectorized
 158    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 159    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 160    elements can fit in a single vector register.
 161
 162    We currently support vectorization of loops in which all types operated upon
 163    are of the same size.  Therefore this function currently sets VF according to
 164    the size of the types operated upon, and fails if there are multiple sizes
 165    in the loop.
 166
 167    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 168    original loop:
 169         for (i=0; i<N; i++){
 170           a[i] = b[i] + c[i];
 171         }
 172
 173    vectorized loop:
 174         for (i=0; i<N; i+=VF){
 175           a[i:VF] = b[i:VF] + c[i:VF];
 176         }
 177 */
 178
 179 static bool
 180 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 181 {
 182   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 183   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 184   unsigned nbbs = loop->num_nodes;
 185   unsigned int vectorization_factor = 0;
 186   tree scalar_type = NULL_TREE;
 187   gphi *phi;
 188   tree vectype;
 189   unsigned int nunits;
 190   stmt_vec_info stmt_info;
 191   unsigned i;
 192   HOST_WIDE_INT dummy;
 193   gimple *stmt, *pattern_stmt = NULL;
 194   gimple_seq pattern_def_seq = NULL;
 195   gimple_stmt_iterator pattern_def_si = gsi_none ();
 196   bool analyze_pattern_stmt = false;
 197   bool bool_result;
 198   auto_vec<stmt_vec_info> mask_producers;
 199
 200   if (dump_enabled_p ())
 201     dump_printf_loc (MSG_NOTE, vect_location,
 202                      "=== vect_determine_vectorization_factor ===\n");
 203
 204   for (i = 0; i < nbbs; i++)
 205     {
 206       basic_block bb = bbs[i];
 207
 208       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 209            gsi_next (&si))
 210         {
 211           phi = si.phi ();
 212           stmt_info = vinfo_for_stmt (phi);
 213           if (dump_enabled_p ())
 214             {
 215               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 216               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 217             }
 218
 219           gcc_assert (stmt_info);
 220
 221           if (STMT_VINFO_RELEVANT_P (stmt_info)
 222               || STMT_VINFO_LIVE_P (stmt_info))
 223             {
 224               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 225               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 226
 227               if (dump_enabled_p ())
 228                 {
 229                   dump_printf_loc (MSG_NOTE, vect_location,
 230                                    "get vectype for scalar type:  ");
 231                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 232                   dump_printf (MSG_NOTE, "\n");
 233                 }
 234
 235               vectype = get_vectype_for_scalar_type (scalar_type);
 236               if (!vectype)
 237                 {
 238                   if (dump_enabled_p ())
 239                     {
 240                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 241                                        "not vectorized: unsupported "
 242                                        "data-type ");
 243                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 244                                          scalar_type);
 245                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 246                     }
 247                   return false;
 248                 }
 249               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 250
 251               if (dump_enabled_p ())
 252                 {
 253                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 254                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 255                   dump_printf (MSG_NOTE, "\n");
 256                 }
 257
 258               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 259               if (dump_enabled_p ())
 260                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 261                                  nunits);
 262
 263               if (!vectorization_factor
 264                   || (nunits > vectorization_factor))
 265                 vectorization_factor = nunits;
 266             }
 267         }
 268
 269       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 270            !gsi_end_p (si) || analyze_pattern_stmt;)
 271         {
 272           tree vf_vectype;
 273
 274           if (analyze_pattern_stmt)
 275             stmt = pattern_stmt;
 276           else
 277             stmt = gsi_stmt (si);
 278
 279           stmt_info = vinfo_for_stmt (stmt);
 280
 281           if (dump_enabled_p ())
 282             {
 283               dump_printf_loc (MSG_NOTE, vect_location,
 284                                "==> examining statement: ");
 285               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 286             }
 287
 288           gcc_assert (stmt_info);
 289
 290           /* Skip stmts which do not need to be vectorized.  */
 291           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 292                && !STMT_VINFO_LIVE_P (stmt_info))
 293               || gimple_clobber_p (stmt))
 294             {
 295               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 296                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 297                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 298                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 299                 {
 300                   stmt = pattern_stmt;
 301                   stmt_info = vinfo_for_stmt (pattern_stmt);
 302                   if (dump_enabled_p ())
 303                     {
 304                       dump_printf_loc (MSG_NOTE, vect_location,
 305                                        "==> examining pattern statement: ");
 306                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 307                     }
 308                 }
 309               else
 310                 {
 311                   if (dump_enabled_p ())
 312                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 313                   gsi_next (&si);
 314                   continue;
 315                 }
 316             }
 317           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 318                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 319                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 320                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 321             analyze_pattern_stmt = true;
 322
 323           /* If a pattern statement has def stmts, analyze them too.  */
 324           if (is_pattern_stmt_p (stmt_info))
 325             {
 326               if (pattern_def_seq == NULL)
 327                 {
 328                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 329                   pattern_def_si = gsi_start (pattern_def_seq);
 330                 }
 331               else if (!gsi_end_p (pattern_def_si))
 332                 gsi_next (&pattern_def_si);
 333               if (pattern_def_seq != NULL)
 334                 {
 335                   gimple *pattern_def_stmt = NULL;
 336                   stmt_vec_info pattern_def_stmt_info = NULL;
 337
 338                   while (!gsi_end_p (pattern_def_si))
 339                     {
 340                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 341                       pattern_def_stmt_info
 342                         = vinfo_for_stmt (pattern_def_stmt);
 343                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 344                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 345                         break;
 346                       gsi_next (&pattern_def_si);
 347                     }
 348
 349                   if (!gsi_end_p (pattern_def_si))
 350                     {
 351                       if (dump_enabled_p ())
 352                         {
 353                           dump_printf_loc (MSG_NOTE, vect_location,
 354                                            "==> examining pattern def stmt: ");
 355                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 356                                             pattern_def_stmt, 0);
 357                         }
 358
 359                       stmt = pattern_def_stmt;
 360                       stmt_info = pattern_def_stmt_info;
 361                     }
 362                   else
 363                     {
 364                       pattern_def_si = gsi_none ();
 365                       analyze_pattern_stmt = false;
 366                     }
 367                 }
 368               else
 369                 analyze_pattern_stmt = false;
 370             }
 371
 372           if (gimple_get_lhs (stmt) == NULL_TREE
 373               /* MASK_STORE has no lhs, but is ok.  */
 374               && (!is_gimple_call (stmt)
 375                   || !gimple_call_internal_p (stmt)
 376                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 377             {
 378               if (is_gimple_call (stmt))
 379                 {
 380                   /* Ignore calls with no lhs.  These must be calls to
 381                      #pragma omp simd functions, and what vectorization factor
 382                      it really needs can't be determined until
 383                      vectorizable_simd_clone_call.  */
 384                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 385                     {
 386                       pattern_def_seq = NULL;
 387                       gsi_next (&si);
 388                     }
 389                   continue;
 390                 }
 391               if (dump_enabled_p ())
 392                 {
 393                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 394                                    "not vectorized: irregular stmt.");
 395                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 396                                     0);
 397                 }
 398               return false;
 399             }
 400
 401           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 402             {
 403               if (dump_enabled_p ())
 404                 {
 405                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 406                                    "not vectorized: vector stmt in loop:");
 407                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 408                 }
 409               return false;
 410             }
 411
 412           bool_result = false;
 413
 414           if (STMT_VINFO_VECTYPE (stmt_info))
 415             {
 416               /* The only case when a vectype had been already set is for stmts
 417                  that contain a dataref, or for "pattern-stmts" (stmts
 418                  generated by the vectorizer to represent/replace a certain
 419                  idiom).  */
 420               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 421                           || is_pattern_stmt_p (stmt_info)
 422                           || !gsi_end_p (pattern_def_si));
 423               vectype = STMT_VINFO_VECTYPE (stmt_info);
 424             }
 425           else
 426             {
 427               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 428               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 429                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 430               else
 431                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 432
 433               /* Bool ops don't participate in vectorization factor
 434                  computation.  For comparison use compared types to
 435                  compute a factor.  */
 436               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
 437                   && is_gimple_assign (stmt)
 438                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 439                 {
 440                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 441                       || STMT_VINFO_LIVE_P (stmt_info))
 442                     mask_producers.safe_push (stmt_info);
 443                   bool_result = true;
 444
 445                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 446                       == tcc_comparison
 447                       && !VECT_SCALAR_BOOLEAN_TYPE_P
 448                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 449                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 450                   else
 451                     {
 452                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 453                         {
 454                           pattern_def_seq = NULL;
 455                           gsi_next (&si);
 456                         }
 457                       continue;
 458                     }
 459                 }
 460
 461               if (dump_enabled_p ())
 462                 {
 463                   dump_printf_loc (MSG_NOTE, vect_location,
 464                                    "get vectype for scalar type:  ");
 465                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 466                   dump_printf (MSG_NOTE, "\n");
 467                 }
 468               vectype = get_vectype_for_scalar_type (scalar_type);
 469               if (!vectype)
 470                 {
 471                   if (dump_enabled_p ())
 472                     {
 473                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 474                                        "not vectorized: unsupported "
 475                                        "data-type ");
 476                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 477                                          scalar_type);
 478                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 479                     }
 480                   return false;
 481                 }
 482
 483               if (!bool_result)
 484                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 485
 486               if (dump_enabled_p ())
 487                 {
 488                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 489                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 490                   dump_printf (MSG_NOTE, "\n");
 491                 }
 492             }
 493
 494           /* Don't try to compute VF out scalar types if we stmt
 495              produces boolean vector.  Use result vectype instead.  */
 496           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 497             vf_vectype = vectype;
 498           else
 499             {
 500               /* The vectorization factor is according to the smallest
 501                  scalar type (or the largest vector size, but we only
 502                  support one vector size per loop).  */
 503               if (!bool_result)
 504                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 505                                                              &dummy);
 506               if (dump_enabled_p ())
 507                 {
 508                   dump_printf_loc (MSG_NOTE, vect_location,
 509                                    "get vectype for scalar type:  ");
 510                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 511                   dump_printf (MSG_NOTE, "\n");
 512                 }
 513               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 514             }
 515           if (!vf_vectype)
 516             {
 517               if (dump_enabled_p ())
 518                 {
 519                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 520                                    "not vectorized: unsupported data-type ");
 521                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 522                                      scalar_type);
 523                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 524                 }
 525               return false;
 526             }
 527
 528           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 529                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 530             {
 531               if (dump_enabled_p ())
 532                 {
 533                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 534                                    "not vectorized: different sized vector "
 535                                    "types in statement, ");
 536                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 537                                      vectype);
 538                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 539                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 540                                      vf_vectype);
 541                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 542                 }
 543               return false;
 544             }
 545
 546           if (dump_enabled_p ())
 547             {
 548               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 549               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 550               dump_printf (MSG_NOTE, "\n");
 551             }
 552
 553           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 554           if (dump_enabled_p ())
 555             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 556           if (!vectorization_factor
 557               || (nunits > vectorization_factor))
 558             vectorization_factor = nunits;
 559
 560           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 561             {
 562               pattern_def_seq = NULL;
 563               gsi_next (&si);
 564             }
 565         }
 566     }
 567
 568   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 569   if (dump_enabled_p ())
 570     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 571                      vectorization_factor);
 572   if (vectorization_factor <= 1)
 573     {
 574       if (dump_enabled_p ())
 575         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 576                          "not vectorized: unsupported data-type\n");
 577       return false;
 578     }
 579   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 580
 581   for (i = 0; i < mask_producers.length (); i++)
 582     {
 583       tree mask_type = NULL;
 584
 585       stmt = STMT_VINFO_STMT (mask_producers[i]);
 586
 587       if (is_gimple_assign (stmt)
 588           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 589           && !VECT_SCALAR_BOOLEAN_TYPE_P
 590                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 591         {
 592           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 593           mask_type = get_mask_type_for_scalar_type (scalar_type);
 594
 595           if (!mask_type)
 596             {
 597               if (dump_enabled_p ())
 598                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 599                                  "not vectorized: unsupported mask\n");
 600               return false;
 601             }
 602         }
 603       else
 604         {
 605           tree rhs;
 606           ssa_op_iter iter;
 607           gimple *def_stmt;
 608           enum vect_def_type dt;
 609
 610           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 611             {
 612               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 613                                        &def_stmt, &dt, &vectype))
 614                 {
 615                   if (dump_enabled_p ())
 616                     {
 617                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 618                                        "not vectorized: can't compute mask type "
 619                                        "for statement, ");
 620                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 621                                         0);
 622                     }
 623                   return false;
 624                 }
 625
 626               /* No vectype probably means external definition.
 627                  Allow it in case there is another operand which
 628                  allows to determine mask type.  */
 629               if (!vectype)
 630                 continue;
 631
 632               if (!mask_type)
 633                 mask_type = vectype;
 634               else if (TYPE_VECTOR_SUBPARTS (mask_type)
 635                        != TYPE_VECTOR_SUBPARTS (vectype))
 636                 {
 637                   if (dump_enabled_p ())
 638                     {
 639                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 640                                        "not vectorized: different sized masks "
 641                                        "types in statement, ");
 642                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 643                                          mask_type);
 644                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 645                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 646                                          vectype);
 647                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 648                     }
 649                   return false;
 650                 }
 651               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 652                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 653                 {
 654                   if (dump_enabled_p ())
 655                     {
 656                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 657                                        "not vectorized: mixed mask and "
 658                                        "nonmask vector types in statement, ");
 659                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 660                                          mask_type);
 661                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 662                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 663                                          vectype);
 664                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 665                     }
 666                   return false;
 667                 }
 668             }
 669
 670           /* We may compare boolean value loaded as vector of integers.
 671              Fix mask_type in such case.  */
 672           if (mask_type
 673               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 674               && gimple_code (stmt) == GIMPLE_ASSIGN
 675               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 676             mask_type = build_same_sized_truth_vector_type (mask_type);
 677         }
 678
 679       /* No mask_type should mean loop invariant predicate.
 680          This is probably a subject for optimization in
 681          if-conversion.  */
 682       if (!mask_type)
 683         {
 684           if (dump_enabled_p ())
 685             {
 686               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 687                                "not vectorized: can't compute mask type "
 688                                "for statement, ");
 689               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 690                                 0);
 691             }
 692           return false;
 693         }
 694
 695       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 696     }
 697
 698   return true;
 699 }
 700
 701
 702 /* Function vect_is_simple_iv_evolution.
 703
 704    FORNOW: A simple evolution of an induction variables in the loop is
 705    considered a polynomial evolution.  */
 706
 707 static bool
 708 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 709                              tree * step)
 710 {
 711   tree init_expr;
 712   tree step_expr;
 713   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 714   basic_block bb;
 715
 716   /* When there is no evolution in this loop, the evolution function
 717      is not "simple".  */
 718   if (evolution_part == NULL_TREE)
 719     return false;
 720
 721   /* When the evolution is a polynomial of degree >= 2
 722      the evolution function is not "simple".  */
 723   if (tree_is_chrec (evolution_part))
 724     return false;
 725
 726   step_expr = evolution_part;
 727   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 728
 729   if (dump_enabled_p ())
 730     {
 731       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 732       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 733       dump_printf (MSG_NOTE, ",  init: ");
 734       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 735       dump_printf (MSG_NOTE, "\n");
 736     }
 737
 738   *init = init_expr;
 739   *step = step_expr;
 740
 741   if (TREE_CODE (step_expr) != INTEGER_CST
 742       && (TREE_CODE (step_expr) != SSA_NAME
 743           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 744               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 745           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 746               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 747                   || !flag_associative_math)))
 748       && (TREE_CODE (step_expr) != REAL_CST
 749           || !flag_associative_math))
 750     {
 751       if (dump_enabled_p ())
 752         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 753                          "step unknown.\n");
 754       return false;
 755     }
 756
 757   return true;
 758 }
 759
 760 /* Function vect_analyze_scalar_cycles_1.
 761
 762    Examine the cross iteration def-use cycles of scalar variables
 763    in LOOP.  LOOP_VINFO represents the loop that is now being
 764    considered for vectorization (can be LOOP, or an outer-loop
 765    enclosing LOOP).  */
 766
 767 static void
 768 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 769 {
 770   basic_block bb = loop->header;
 771   tree init, step;
 772   auto_vec<gimple *, 64> worklist;
 773   gphi_iterator gsi;
 774   bool double_reduc;
 775
 776   if (dump_enabled_p ())
 777     dump_printf_loc (MSG_NOTE, vect_location,
 778                      "=== vect_analyze_scalar_cycles ===\n");
 779
 780   /* First - identify all inductions.  Reduction detection assumes that all the
 781      inductions have been identified, therefore, this order must not be
 782      changed.  */
 783   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 784     {
 785       gphi *phi = gsi.phi ();
 786       tree access_fn = NULL;
 787       tree def = PHI_RESULT (phi);
 788       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 789
 790       if (dump_enabled_p ())
 791         {
 792           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 793           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 794         }
 795
 796       /* Skip virtual phi's.  The data dependences that are associated with
 797          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 798       if (virtual_operand_p (def))
 799         continue;
 800
 801       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 802
 803       /* Analyze the evolution function.  */
 804       access_fn = analyze_scalar_evolution (loop, def);
 805       if (access_fn)
 806         {
 807           STRIP_NOPS (access_fn);
 808           if (dump_enabled_p ())
 809             {
 810               dump_printf_loc (MSG_NOTE, vect_location,
 811                                "Access function of PHI: ");
 812               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 813               dump_printf (MSG_NOTE, "\n");
 814             }
 815           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 816             = initial_condition_in_loop_num (access_fn, loop->num);
 817           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 818             = evolution_part_in_loop_num (access_fn, loop->num);
 819         }
 820
 821       if (!access_fn
 822           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 823           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 824               && TREE_CODE (step) != INTEGER_CST))
 825         {
 826           worklist.safe_push (phi);
 827           continue;
 828         }
 829
 830       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 831                   != NULL_TREE);
 832       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 833
 834       if (dump_enabled_p ())
 835         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 836       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 837     }
 838
 839
 840   /* Second - identify all reductions and nested cycles.  */
 841   while (worklist.length () > 0)
 842     {
 843       gimple *phi = worklist.pop ();
 844       tree def = PHI_RESULT (phi);
 845       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 846       gimple *reduc_stmt;
 847
 848       if (dump_enabled_p ())
 849         {
 850           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 851           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 852         }
 853
 854       gcc_assert (!virtual_operand_p (def)
 855                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 856
 857       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 858                                                 &double_reduc, false);
 859       if (reduc_stmt)
 860         {
 861           if (double_reduc)
 862             {
 863               if (dump_enabled_p ())
 864                 dump_printf_loc (MSG_NOTE, vect_location,
 865                                  "Detected double reduction.\n");
 866
 867               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 868               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 869                                                     vect_double_reduction_def;
 870             }
 871           else
 872             {
 873               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 874                 {
 875                   if (dump_enabled_p ())
 876                     dump_printf_loc (MSG_NOTE, vect_location,
 877                                      "Detected vectorizable nested cycle.\n");
 878
 879                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 880                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 881                                                              vect_nested_cycle;
 882                 }
 883               else
 884                 {
 885                   if (dump_enabled_p ())
 886                     dump_printf_loc (MSG_NOTE, vect_location,
 887                                      "Detected reduction.\n");
 888
 889                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 890                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 891                                                            vect_reduction_def;
 892                   /* Store the reduction cycles for possible vectorization in
 893                      loop-aware SLP if it was not detected as reduction
 894                      chain.  */
 895                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 896                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 897                 }
 898             }
 899         }
 900       else
 901         if (dump_enabled_p ())
 902           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 903                            "Unknown def-use cycle pattern.\n");
 904     }
 905 }
 906
 907
 908 /* Function vect_analyze_scalar_cycles.
 909
 910    Examine the cross iteration def-use cycles of scalar variables, by
 911    analyzing the loop-header PHIs of scalar variables.  Classify each
 912    cycle as one of the following: invariant, induction, reduction, unknown.
 913    We do that for the loop represented by LOOP_VINFO, and also to its
 914    inner-loop, if exists.
 915    Examples for scalar cycles:
 916
 917    Example1: reduction:
 918
 919               loop1:
 920               for (i=0; i<N; i++)
 921                  sum += a[i];
 922
 923    Example2: induction:
 924
 925               loop2:
 926               for (i=0; i<N; i++)
 927                  a[i] = i;  */
 928
 929 static void
 930 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 931 {
 932   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 933
 934   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 935
 936   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 937      Reductions in such inner-loop therefore have different properties than
 938      the reductions in the nest that gets vectorized:
 939      1. When vectorized, they are executed in the same order as in the original
 940         scalar loop, so we can't change the order of computation when
 941         vectorizing them.
 942      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 943         current checks are too strict.  */
 944
 945   if (loop->inner)
 946     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 947 }
 948
 949 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 950
 951 static void
 952 vect_fixup_reduc_chain (gimple *stmt)
 953 {
 954   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 955   gimple *stmtp;
 956   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 957               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 958   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 959   do
 960     {
 961       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 962       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 963       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 964       if (stmt)
 965         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 966           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 967     }
 968   while (stmt);
 969   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 970 }
 971
 972 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 973
 974 static void
 975 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 976 {
 977   gimple *first;
 978   unsigned i;
 979
 980   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 981     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 982       {
 983         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 984         while (next)
 985           {
 986             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 987               break;
 988             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 989           }
 990         /* If not all stmt in the chain are patterns try to handle
 991            the chain without patterns.  */
 992         if (! next)
 993           {
 994             vect_fixup_reduc_chain (first);
 995             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 996               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 997           }
 998       }
 999 }
1000
1001 /* Function vect_get_loop_niters.
1002
1003    Determine how many iterations the loop is executed and place it
1004    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1005    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1006    niter information holds in ASSUMPTIONS.
1007
1008    Return the loop exit condition.  */
1009
1010
1011 static gcond *
1012 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1013                       tree *number_of_iterations, tree *number_of_iterationsm1)
1014 {
1015   edge exit = single_exit (loop);
1016   struct tree_niter_desc niter_desc;
1017   tree niter_assumptions, niter, may_be_zero;
1018   gcond *cond = get_loop_exit_condition (loop);
1019
1020   *assumptions = boolean_true_node;
1021   *number_of_iterationsm1 = chrec_dont_know;
1022   *number_of_iterations = chrec_dont_know;
1023   if (dump_enabled_p ())
1024     dump_printf_loc (MSG_NOTE, vect_location,
1025                      "=== get_loop_niters ===\n");
1026
1027   if (!exit)
1028     return cond;
1029
1030   niter = chrec_dont_know;
1031   may_be_zero = NULL_TREE;
1032   niter_assumptions = boolean_true_node;
1033   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1034       || chrec_contains_undetermined (niter_desc.niter))
1035     return cond;
1036
1037   niter_assumptions = niter_desc.assumptions;
1038   may_be_zero = niter_desc.may_be_zero;
1039   niter = niter_desc.niter;
1040
1041   if (may_be_zero && integer_zerop (may_be_zero))
1042     may_be_zero = NULL_TREE;
1043
1044   if (may_be_zero)
1045     {
1046       if (COMPARISON_CLASS_P (may_be_zero))
1047         {
1048           /* Try to combine may_be_zero with assumptions, this can simplify
1049              computation of niter expression.  */
1050           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1051             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1052                                              niter_assumptions,
1053                                              fold_build1 (TRUTH_NOT_EXPR,
1054                                                           boolean_type_node,
1055                                                           may_be_zero));
1056           else
1057             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1058                                  build_int_cst (TREE_TYPE (niter), 0), niter);
1059
1060           may_be_zero = NULL_TREE;
1061         }
1062       else if (integer_nonzerop (may_be_zero))
1063         {
1064           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1065           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1066           return cond;
1067         }
1068       else
1069         return cond;
1070     }
1071
1072   *assumptions = niter_assumptions;
1073   *number_of_iterationsm1 = niter;
1074
1075   /* We want the number of loop header executions which is the number
1076      of latch executions plus one.
1077      ???  For UINT_MAX latch executions this number overflows to zero
1078      for loops like do { n++; } while (n != 0);  */
1079   if (niter && !chrec_contains_undetermined (niter))
1080     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1081                           build_int_cst (TREE_TYPE (niter), 1));
1082   *number_of_iterations = niter;
1083
1084   return cond;
1085 }
1086
1087 /* Function bb_in_loop_p
1088
1089    Used as predicate for dfs order traversal of the loop bbs.  */
1090
1091 static bool
1092 bb_in_loop_p (const_basic_block bb, const void *data)
1093 {
1094   const struct loop *const loop = (const struct loop *)data;
1095   if (flow_bb_inside_loop_p (loop, bb))
1096     return true;
1097   return false;
1098 }
1099
1100
1101 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1102    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1103
1104 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1105   : vec_info (vec_info::loop, init_cost (loop_in)),
1106     loop (loop_in),
1107     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1108     num_itersm1 (NULL_TREE),
1109     num_iters (NULL_TREE),
1110     num_iters_unchanged (NULL_TREE),
1111     num_iters_assumptions (NULL_TREE),
1112     th (0),
1113     vectorization_factor (0),
1114     unaligned_dr (NULL),
1115     peeling_for_alignment (0),
1116     ptr_mask (0),
1117     slp_unrolling_factor (1),
1118     single_scalar_iteration_cost (0),
1119     vectorizable (false),
1120     peeling_for_gaps (false),
1121     peeling_for_niter (false),
1122     operands_swapped (false),
1123     no_data_dependencies (false),
1124     has_mask_store (false),
1125     scalar_loop (NULL),
1126     orig_loop_info (NULL)
1127 {
1128   /* Create/Update stmt_info for all stmts in the loop.  */
1129   basic_block *body = get_loop_body (loop);
1130   for (unsigned int i = 0; i < loop->num_nodes; i++)
1131     {
1132       basic_block bb = body[i];
1133       gimple_stmt_iterator si;
1134
1135       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1136         {
1137           gimple *phi = gsi_stmt (si);
1138           gimple_set_uid (phi, 0);
1139           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1140         }
1141
1142       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1143         {
1144           gimple *stmt = gsi_stmt (si);
1145           gimple_set_uid (stmt, 0);
1146           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1147         }
1148     }
1149   free (body);
1150
1151   /* CHECKME: We want to visit all BBs before their successors (except for
1152      latch blocks, for which this assertion wouldn't hold).  In the simple
1153      case of the loop forms we allow, a dfs order of the BBs would the same
1154      as reversed postorder traversal, so we are safe.  */
1155
1156   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1157                                           bbs, loop->num_nodes, loop);
1158   gcc_assert (nbbs == loop->num_nodes);
1159 }
1160
1161
1162 /* Free all memory used by the _loop_vec_info, as well as all the
1163    stmt_vec_info structs of all the stmts in the loop.  */
1164
1165 _loop_vec_info::~_loop_vec_info ()
1166 {
1167   int nbbs;
1168   gimple_stmt_iterator si;
1169   int j;
1170
1171   nbbs = loop->num_nodes;
1172   for (j = 0; j < nbbs; j++)
1173     {
1174       basic_block bb = bbs[j];
1175       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1176         free_stmt_vec_info (gsi_stmt (si));
1177
1178       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1179         {
1180           gimple *stmt = gsi_stmt (si);
1181
1182           /* We may have broken canonical form by moving a constant
1183              into RHS1 of a commutative op.  Fix such occurrences.  */
1184           if (operands_swapped && is_gimple_assign (stmt))
1185             {
1186               enum tree_code code = gimple_assign_rhs_code (stmt);
1187
1188               if ((code == PLUS_EXPR
1189                    || code == POINTER_PLUS_EXPR
1190                    || code == MULT_EXPR)
1191                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1192                 swap_ssa_operands (stmt,
1193                                    gimple_assign_rhs1_ptr (stmt),
1194                                    gimple_assign_rhs2_ptr (stmt));
1195               else if (code == COND_EXPR
1196                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1197                 {
1198                   tree cond_expr = gimple_assign_rhs1 (stmt);
1199                   enum tree_code cond_code = TREE_CODE (cond_expr);
1200
1201                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1202                     {
1203                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1204                                                                   0));
1205                       cond_code = invert_tree_comparison (cond_code,
1206                                                           honor_nans);
1207                       if (cond_code != ERROR_MARK)
1208                         {
1209                           TREE_SET_CODE (cond_expr, cond_code);
1210                           swap_ssa_operands (stmt,
1211                                              gimple_assign_rhs2_ptr (stmt),
1212                                              gimple_assign_rhs3_ptr (stmt));
1213                         }
1214                     }
1215                 }
1216             }
1217
1218           /* Free stmt_vec_info.  */
1219           free_stmt_vec_info (stmt);
1220           gsi_next (&si);
1221         }
1222     }
1223
1224   free (bbs);
1225
1226   loop->aux = NULL;
1227 }
1228
1229
1230 /* Calculate the cost of one scalar iteration of the loop.  */
1231 static void
1232 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1233 {
1234   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1235   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1236   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1237   int innerloop_iters, i;
1238
1239   /* Count statements in scalar loop.  Using this as scalar cost for a single
1240      iteration for now.
1241
1242      TODO: Add outer loop support.
1243
1244      TODO: Consider assigning different costs to different scalar
1245      statements.  */
1246
1247   /* FORNOW.  */
1248   innerloop_iters = 1;
1249   if (loop->inner)
1250     innerloop_iters = 50; /* FIXME */
1251
1252   for (i = 0; i < nbbs; i++)
1253     {
1254       gimple_stmt_iterator si;
1255       basic_block bb = bbs[i];
1256
1257       if (bb->loop_father == loop->inner)
1258         factor = innerloop_iters;
1259       else
1260         factor = 1;
1261
1262       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1263         {
1264           gimple *stmt = gsi_stmt (si);
1265           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1266
1267           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1268             continue;
1269
1270           /* Skip stmts that are not vectorized inside the loop.  */
1271           if (stmt_info
1272               && !STMT_VINFO_RELEVANT_P (stmt_info)
1273               && (!STMT_VINFO_LIVE_P (stmt_info)
1274                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1275               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1276             continue;
1277
1278           vect_cost_for_stmt kind;
1279           if (STMT_VINFO_DATA_REF (stmt_info))
1280             {
1281               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1282                kind = scalar_load;
1283              else
1284                kind = scalar_store;
1285             }
1286           else
1287             kind = scalar_stmt;
1288
1289           scalar_single_iter_cost
1290             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1291                                  factor, kind, stmt_info, 0, vect_prologue);
1292         }
1293     }
1294   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1295     = scalar_single_iter_cost;
1296 }
1297
1298
1299 /* Function vect_analyze_loop_form_1.
1300
1301    Verify that certain CFG restrictions hold, including:
1302    - the loop has a pre-header
1303    - the loop has a single entry and exit
1304    - the loop exit condition is simple enough
1305    - the number of iterations can be analyzed, i.e, a countable loop.  The
1306      niter could be analyzed under some assumptions.  */
1307
1308 bool
1309 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1310                           tree *assumptions, tree *number_of_iterationsm1,
1311                           tree *number_of_iterations, gcond **inner_loop_cond)
1312 {
1313   if (dump_enabled_p ())
1314     dump_printf_loc (MSG_NOTE, vect_location,
1315                      "=== vect_analyze_loop_form ===\n");
1316
1317   /* Different restrictions apply when we are considering an inner-most loop,
1318      vs. an outer (nested) loop.
1319      (FORNOW. May want to relax some of these restrictions in the future).  */
1320
1321   if (!loop->inner)
1322     {
1323       /* Inner-most loop.  We currently require that the number of BBs is
1324          exactly 2 (the header and latch).  Vectorizable inner-most loops
1325          look like this:
1326
1327                         (pre-header)
1328                            |
1329                           header <--------+
1330                            | |            |
1331                            | +--> latch --+
1332                            |
1333                         (exit-bb)  */
1334
1335       if (loop->num_nodes != 2)
1336         {
1337           if (dump_enabled_p ())
1338             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1339                              "not vectorized: control flow in loop.\n");
1340           return false;
1341         }
1342
1343       if (empty_block_p (loop->header))
1344         {
1345           if (dump_enabled_p ())
1346             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1347                              "not vectorized: empty loop.\n");
1348           return false;
1349         }
1350     }
1351   else
1352     {
1353       struct loop *innerloop = loop->inner;
1354       edge entryedge;
1355
1356       /* Nested loop. We currently require that the loop is doubly-nested,
1357          contains a single inner loop, and the number of BBs is exactly 5.
1358          Vectorizable outer-loops look like this:
1359
1360                         (pre-header)
1361                            |
1362                           header <---+
1363                            |         |
1364                           inner-loop |
1365                            |         |
1366                           tail ------+
1367                            |
1368                         (exit-bb)
1369
1370          The inner-loop has the properties expected of inner-most loops
1371          as described above.  */
1372
1373       if ((loop->inner)->inner || (loop->inner)->next)
1374         {
1375           if (dump_enabled_p ())
1376             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1377                              "not vectorized: multiple nested loops.\n");
1378           return false;
1379         }
1380
1381       if (loop->num_nodes != 5)
1382         {
1383           if (dump_enabled_p ())
1384             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1385                              "not vectorized: control flow in loop.\n");
1386           return false;
1387         }
1388
1389       entryedge = loop_preheader_edge (innerloop);
1390       if (entryedge->src != loop->header
1391           || !single_exit (innerloop)
1392           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1393         {
1394           if (dump_enabled_p ())
1395             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1396                              "not vectorized: unsupported outerloop form.\n");
1397           return false;
1398         }
1399
1400       /* Analyze the inner-loop.  */
1401       tree inner_niterm1, inner_niter, inner_assumptions;
1402       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1403                                       &inner_assumptions, &inner_niterm1,
1404                                       &inner_niter, NULL)
1405           /* Don't support analyzing niter under assumptions for inner
1406              loop.  */
1407           || !integer_onep (inner_assumptions))
1408         {
1409           if (dump_enabled_p ())
1410             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1411                              "not vectorized: Bad inner loop.\n");
1412           return false;
1413         }
1414
1415       if (!expr_invariant_in_loop_p (loop, inner_niter))
1416         {
1417           if (dump_enabled_p ())
1418             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1419                              "not vectorized: inner-loop count not"
1420                              " invariant.\n");
1421           return false;
1422         }
1423
1424       if (dump_enabled_p ())
1425         dump_printf_loc (MSG_NOTE, vect_location,
1426                          "Considering outer-loop vectorization.\n");
1427     }
1428
1429   if (!single_exit (loop)
1430       || EDGE_COUNT (loop->header->preds) != 2)
1431     {
1432       if (dump_enabled_p ())
1433         {
1434           if (!single_exit (loop))
1435             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1436                              "not vectorized: multiple exits.\n");
1437           else if (EDGE_COUNT (loop->header->preds) != 2)
1438             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1439                              "not vectorized: too many incoming edges.\n");
1440         }
1441       return false;
1442     }
1443
1444   /* We assume that the loop exit condition is at the end of the loop. i.e,
1445      that the loop is represented as a do-while (with a proper if-guard
1446      before the loop if needed), where the loop header contains all the
1447      executable statements, and the latch is empty.  */
1448   if (!empty_block_p (loop->latch)
1449       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1450     {
1451       if (dump_enabled_p ())
1452         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1453                          "not vectorized: latch block not empty.\n");
1454       return false;
1455     }
1456
1457   /* Make sure the exit is not abnormal.  */
1458   edge e = single_exit (loop);
1459   if (e->flags & EDGE_ABNORMAL)
1460     {
1461       if (dump_enabled_p ())
1462         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1463                          "not vectorized: abnormal loop exit edge.\n");
1464       return false;
1465     }
1466
1467   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1468                                      number_of_iterationsm1);
1469   if (!*loop_cond)
1470     {
1471       if (dump_enabled_p ())
1472         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1473                          "not vectorized: complicated exit condition.\n");
1474       return false;
1475     }
1476
1477   if (integer_zerop (*assumptions)
1478       || !*number_of_iterations
1479       || chrec_contains_undetermined (*number_of_iterations))
1480     {
1481       if (dump_enabled_p ())
1482         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1483                          "not vectorized: number of iterations cannot be "
1484                          "computed.\n");
1485       return false;
1486     }
1487
1488   if (integer_zerop (*number_of_iterations))
1489     {
1490       if (dump_enabled_p ())
1491         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1492                          "not vectorized: number of iterations = 0.\n");
1493       return false;
1494     }
1495
1496   return true;
1497 }
1498
1499 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1500
1501 loop_vec_info
1502 vect_analyze_loop_form (struct loop *loop)
1503 {
1504   tree assumptions, number_of_iterations, number_of_iterationsm1;
1505   gcond *loop_cond, *inner_loop_cond = NULL;
1506
1507   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1508                                   &assumptions, &number_of_iterationsm1,
1509                                   &number_of_iterations, &inner_loop_cond))
1510     return NULL;
1511
1512   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1513   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1514   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1515   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1516   if (!integer_onep (assumptions))
1517     {
1518       /* We consider to vectorize this loop by versioning it under
1519          some assumptions.  In order to do this, we need to clear
1520          existing information computed by scev and niter analyzer.  */
1521       scev_reset_htab ();
1522       free_numbers_of_iterations_estimates (loop);
1523       /* Also set flag for this loop so that following scev and niter
1524          analysis are done under the assumptions.  */
1525       loop_constraint_set (loop, LOOP_C_FINITE);
1526       /* Also record the assumptions for versioning.  */
1527       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1528     }
1529
1530   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1531     {
1532       if (dump_enabled_p ())
1533         {
1534           dump_printf_loc (MSG_NOTE, vect_location,
1535                            "Symbolic number of iterations is ");
1536           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1537           dump_printf (MSG_NOTE, "\n");
1538         }
1539     }
1540
1541   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1542   if (inner_loop_cond)
1543     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1544       = loop_exit_ctrl_vec_info_type;
1545
1546   gcc_assert (!loop->aux);
1547   loop->aux = loop_vinfo;
1548   return loop_vinfo;
1549 }
1550
1551
1552
1553 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1554    statements update the vectorization factor.  */
1555
1556 static void
1557 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1558 {
1559   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1560   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1561   int nbbs = loop->num_nodes;
1562   unsigned int vectorization_factor;
1563   int i;
1564
1565   if (dump_enabled_p ())
1566     dump_printf_loc (MSG_NOTE, vect_location,
1567                      "=== vect_update_vf_for_slp ===\n");
1568
1569   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1570   gcc_assert (vectorization_factor != 0);
1571
1572   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1573      vectorization factor of the loop is the unrolling factor required by
1574      the SLP instances.  If that unrolling factor is 1, we say, that we
1575      perform pure SLP on loop - cross iteration parallelism is not
1576      exploited.  */
1577   bool only_slp_in_loop = true;
1578   for (i = 0; i < nbbs; i++)
1579     {
1580       basic_block bb = bbs[i];
1581       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1582            gsi_next (&si))
1583         {
1584           gimple *stmt = gsi_stmt (si);
1585           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1586           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1587               && STMT_VINFO_RELATED_STMT (stmt_info))
1588             {
1589               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1590               stmt_info = vinfo_for_stmt (stmt);
1591             }
1592           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1593                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1594               && !PURE_SLP_STMT (stmt_info))
1595             /* STMT needs both SLP and loop-based vectorization.  */
1596             only_slp_in_loop = false;
1597         }
1598     }
1599
1600   if (only_slp_in_loop)
1601     {
1602       dump_printf_loc (MSG_NOTE, vect_location,
1603                        "Loop contains only SLP stmts\n");
1604       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1605     }
1606   else
1607     {
1608       dump_printf_loc (MSG_NOTE, vect_location,
1609                        "Loop contains SLP and non-SLP stmts\n");
1610       vectorization_factor
1611         = least_common_multiple (vectorization_factor,
1612                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1613     }
1614
1615   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1616   if (dump_enabled_p ())
1617     dump_printf_loc (MSG_NOTE, vect_location,
1618                      "Updating vectorization factor to %d\n",
1619                      vectorization_factor);
1620 }
1621
1622 /* Function vect_analyze_loop_operations.
1623
1624    Scan the loop stmts and make sure they are all vectorizable.  */
1625
1626 static bool
1627 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1628 {
1629   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1630   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1631   int nbbs = loop->num_nodes;
1632   int i;
1633   stmt_vec_info stmt_info;
1634   bool need_to_vectorize = false;
1635   bool ok;
1636
1637   if (dump_enabled_p ())
1638     dump_printf_loc (MSG_NOTE, vect_location,
1639                      "=== vect_analyze_loop_operations ===\n");
1640
1641   for (i = 0; i < nbbs; i++)
1642     {
1643       basic_block bb = bbs[i];
1644
1645       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1646            gsi_next (&si))
1647         {
1648           gphi *phi = si.phi ();
1649           ok = true;
1650
1651           stmt_info = vinfo_for_stmt (phi);
1652           if (dump_enabled_p ())
1653             {
1654               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1655               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1656             }
1657           if (virtual_operand_p (gimple_phi_result (phi)))
1658             continue;
1659
1660           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1661              (i.e., a phi in the tail of the outer-loop).  */
1662           if (! is_loop_header_bb_p (bb))
1663             {
1664               /* FORNOW: we currently don't support the case that these phis
1665                  are not used in the outerloop (unless it is double reduction,
1666                  i.e., this phi is vect_reduction_def), cause this case
1667                  requires to actually do something here.  */
1668               if (STMT_VINFO_LIVE_P (stmt_info)
1669                   && STMT_VINFO_DEF_TYPE (stmt_info)
1670                      != vect_double_reduction_def)
1671                 {
1672                   if (dump_enabled_p ())
1673                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1674                                      "Unsupported loop-closed phi in "
1675                                      "outer-loop.\n");
1676                   return false;
1677                 }
1678
1679               /* If PHI is used in the outer loop, we check that its operand
1680                  is defined in the inner loop.  */
1681               if (STMT_VINFO_RELEVANT_P (stmt_info))
1682                 {
1683                   tree phi_op;
1684                   gimple *op_def_stmt;
1685
1686                   if (gimple_phi_num_args (phi) != 1)
1687                     return false;
1688
1689                   phi_op = PHI_ARG_DEF (phi, 0);
1690                   if (TREE_CODE (phi_op) != SSA_NAME)
1691                     return false;
1692
1693                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1694                   if (gimple_nop_p (op_def_stmt)
1695                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1696                       || !vinfo_for_stmt (op_def_stmt))
1697                     return false;
1698
1699                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1700                         != vect_used_in_outer
1701                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1702                            != vect_used_in_outer_by_reduction)
1703                     return false;
1704                 }
1705
1706               continue;
1707             }
1708
1709           gcc_assert (stmt_info);
1710
1711           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1712                || STMT_VINFO_LIVE_P (stmt_info))
1713               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1714             {
1715               /* A scalar-dependence cycle that we don't support.  */
1716               if (dump_enabled_p ())
1717                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1718                                  "not vectorized: scalar dependence cycle.\n");
1719               return false;
1720             }
1721
1722           if (STMT_VINFO_RELEVANT_P (stmt_info))
1723             {
1724               need_to_vectorize = true;
1725               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1726                   && ! PURE_SLP_STMT (stmt_info))
1727                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1728               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1729                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1730                        && ! PURE_SLP_STMT (stmt_info))
1731                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1732             }
1733
1734           if (ok && STMT_VINFO_LIVE_P (stmt_info))
1735             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1736
1737           if (!ok)
1738             {
1739               if (dump_enabled_p ())
1740                 {
1741                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1742                                    "not vectorized: relevant phi not "
1743                                    "supported: ");
1744                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1745                 }
1746               return false;
1747             }
1748         }
1749
1750       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1751            gsi_next (&si))
1752         {
1753           gimple *stmt = gsi_stmt (si);
1754           if (!gimple_clobber_p (stmt)
1755               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1756             return false;
1757         }
1758     } /* bbs */
1759
1760   /* All operations in the loop are either irrelevant (deal with loop
1761      control, or dead), or only used outside the loop and can be moved
1762      out of the loop (e.g. invariants, inductions).  The loop can be
1763      optimized away by scalar optimizations.  We're better off not
1764      touching this loop.  */
1765   if (!need_to_vectorize)
1766     {
1767       if (dump_enabled_p ())
1768         dump_printf_loc (MSG_NOTE, vect_location,
1769                          "All the computation can be taken out of the loop.\n");
1770       if (dump_enabled_p ())
1771         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1772                          "not vectorized: redundant loop. no profit to "
1773                          "vectorize.\n");
1774       return false;
1775     }
1776
1777   return true;
1778 }
1779
1780
1781 /* Function vect_analyze_loop_2.
1782
1783    Run the vectorization analyses on the loop described by LOOP_VINFO
1784    (which the caller has already created) and record the results in
1785    LOOP_VINFO.  Return true if the loop can be vectorized.

        FATAL is set to true while the checks that do not depend on the
        vector size are running and to false once they have all passed, so
        on a false return it tells the caller which group of checks failed.

        If SLP was used and certain later checks fail, the SLP state is
        rolled back and the analysis from the `start_over' label onwards is
        repeated once with SLP disabled (see the `again' label).  */
1786 static bool
1787 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1788 {
1789   bool ok;
       /* Bounds on the vectorization factor: MAX_VF is updated by the data
          dependence analysis, MIN_VF by the data reference analysis.  */
1790   int max_vf = MAX_VECTORIZATION_FACTOR;
1791   int min_vf = 2;
       /* Number of non-debug statements in the loop, passed to
          vect_analyze_slp.  */
1792   unsigned int n_stmts = 0;
1793
1794   /* The first group of checks is independent of the vector size.
          NOTE(review): presumably the caller uses FATAL to decide whether
          retrying with another vector size makes sense -- confirm in
          vect_analyze_loop.  */
1795   fatal = true;
1796
1797   /* Find all data references in the loop (which correspond to vdefs/vuses)
1798      and analyze their evolution in the loop.  */
1799
1800   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1801
1802   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1803   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1804     {
1805       if (dump_enabled_p ())
1806         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1807                          "not vectorized: loop nest containing two "
1808                          "or more consecutive inner loops cannot be "
1809                          "vectorized\n");
1810       return false;
1811     }
1812
       /* Collect the data references of every non-debug statement and count
          those statements in N_STMTS.  */
1813   for (unsigned i = 0; i < loop->num_nodes; i++)
1814     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1815          !gsi_end_p (gsi); gsi_next (&gsi))
1816       {
1817         gimple *stmt = gsi_stmt (gsi);
1818         if (is_gimple_debug (stmt))
1819           continue;
1820         ++n_stmts;
1821         if (!find_data_references_in_stmt (loop, stmt,
1822                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1823           {
                 /* A call whose data references could not be analyzed is
                    still acceptable when the loop has a safelen, the callee
                    has simd clones and neither the arguments nor the lhs
                    are memory references.  */
1824             if (is_gimple_call (stmt) && loop->safelen)
1825               {
1826                 tree fndecl = gimple_call_fndecl (stmt), op;
1827                 if (fndecl != NULL_TREE)
1828                   {
1829                     cgraph_node *node = cgraph_node::get (fndecl);
1830                     if (node != NULL && node->simd_clones != NULL)
1831                       {
1832                         unsigned int j, n = gimple_call_num_args (stmt);
                             /* Stop at the first argument that is a decl or
                                a reference with a base address; J == N
                                afterwards means there was none.  */
1833                         for (j = 0; j < n; j++)
1834                           {
1835                             op = gimple_call_arg (stmt, j);
1836                             if (DECL_P (op)
1837                                 || (REFERENCE_CLASS_P (op)
1838                                     && get_base_address (op)))
1839                               break;
1840                           }
1841                         op = gimple_call_lhs (stmt);
1842                         /* Ignore #pragma omp declare simd functions
1843                            if they don't have data references in the
1844                            call stmt itself.  */
1845                         if (j == n
1846                             && !(op
1847                                  && (DECL_P (op)
1848                                      || (REFERENCE_CLASS_P (op)
1849                                          && get_base_address (op)))))
1850                           continue;
1851                       }
1852                   }
1853               }
1854             if (dump_enabled_p ())
1855               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1856                                "not vectorized: loop contains function "
1857                                "calls or data references that cannot "
1858                                "be analyzed\n");
1859             return false;
1860           }
1861       }
1862
1863   /* Analyze the data references and also adjust the minimal
1864      vectorization factor according to the loads and stores.  */
1865
1866   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1867   if (!ok)
1868     {
1869       if (dump_enabled_p ())
1870         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1871                          "bad data references.\n");
1872       return false;
1873     }
1874
1875   /* Classify all cross-iteration scalar data-flow cycles.
1876      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1877   vect_analyze_scalar_cycles (loop_vinfo);
1878
1879   vect_pattern_recog (loop_vinfo);
1880
1881   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1882
1883   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1884      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1885
1886   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1887   if (!ok)
1888     {
1889       if (dump_enabled_p ())
1890         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1891                          "bad data access.\n");
1892       return false;
1893     }
1894
1895   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1896
1897   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1898   if (!ok)
1899     {
1900       if (dump_enabled_p ())
1901         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1902                          "unexpected pattern.\n");
1903       return false;
1904     }
1905
1906   /* The rest of the analysis below depends on the vector size in some
          way, so failures from here on are no longer flagged as fatal.  */
1907   fatal = false;
1908
1909   /* Analyze data dependences between the data-refs in the loop
1910      and adjust the maximum vectorization factor according to
1911      the dependences.
1912      FORNOW: fail at the first data dependence that we encounter.  */
1913
1914   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1915   if (!ok
1916       || max_vf < min_vf)
1917     {
1918       if (dump_enabled_p ())
1919             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1920                              "bad data dependence.\n");
1921       return false;
1922     }
1923
1924   ok = vect_determine_vectorization_factor (loop_vinfo);
1925   if (!ok)
1926     {
1927       if (dump_enabled_p ())
1928         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929                          "can't determine vectorization factor.\n");
1930       return false;
1931     }
       /* The chosen factor must not exceed what the dependences allow.  */
1932   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1933     {
1934       if (dump_enabled_p ())
1935         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1936                          "bad data dependence.\n");
1937       return false;
1938     }
1939
1940   /* Compute the scalar iteration cost.  */
1941   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1942
       /* Remember the vectorization factor as it is before any SLP update,
          so that the `again' path below can restore it.  */
1943   int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1944   HOST_WIDE_INT estimated_niter;
1945   unsigned th;
1946   int min_scalar_loop_bound;
1947
1948   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1949   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1950   if (!ok)
1951     return false;
1952
1953   /* If there are any SLP instances mark them as pure_slp.  */
1954   bool slp = vect_make_slp_decision (loop_vinfo);
1955   if (slp)
1956     {
1957       /* Find stmts that need to be both vectorized and SLPed.  */
1958       vect_detect_hybrid_slp (loop_vinfo);
1959
1960       /* Update the vectorization factor based on the SLP decision.  */
1961       vect_update_vf_for_slp (loop_vinfo);
1962     }
1963
1964   /* This is the point where we can re-start analysis with SLP forced off.  */
1965 start_over:
1966
1967   /* Now the vectorization factor is final.  */
1968   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1969   gcc_assert (vectorization_factor != 0);
1970
1971   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1972     dump_printf_loc (MSG_NOTE, vect_location,
1973                      "vectorization_factor = %d, niters = "
1974                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1975                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1976
       /* Give up if the known iteration count, or the likely upper bound on
          it (-1 means no bound is available), is below the vectorization
          factor.  */
1977   HOST_WIDE_INT max_niter
1978     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1979   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1980        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1981       || (max_niter != -1
1982           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1983     {
1984       if (dump_enabled_p ())
1985         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1986                          "not vectorized: iteration count smaller than "
1987                          "vectorization factor.\n");
1988       return false;
1989     }
1990
1991   /* Analyze the alignment of the data-refs in the loop.
1992      Fail if a data reference is found that cannot be vectorized.  */
1993
1994   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1995   if (!ok)
1996     {
1997       if (dump_enabled_p ())
1998         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1999                          "bad data alignment.\n");
2000       return false;
2001     }
2002
2003   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2004      It is important to call pruning after vect_analyze_data_ref_accesses,
2005      since we use grouping information gathered by interleaving analysis.  */
2006   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2007   if (!ok)
2008     return false;
2009
2010   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2011      vectorization.  */
2012   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2013     {
2014     /* This pass will decide on using loop versioning and/or loop peeling in
2015        order to enhance the alignment of data references in the loop.  */
2016     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2017     if (!ok)
2018       {
2019         if (dump_enabled_p ())
2020           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2021                            "bad data alignment.\n");
2022         return false;
2023       }
2024     }
2025
2026   if (slp)
2027     {
2028       /* Analyze operations in the SLP instances.  Note this may
2029          remove unsupported SLP instances which makes the above
2030          SLP kind detection invalid.  */
2031       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2032       vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
2033                                    LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
           /* An instance was dropped: retry without SLP.  */
2034       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2035         goto again;
2036     }
2037
2038   /* Scan all the remaining operations in the loop that are not subject
2039      to SLP and make sure they are vectorizable.  */
2040   ok = vect_analyze_loop_operations (loop_vinfo);
2041   if (!ok)
2042     {
2043       if (dump_enabled_p ())
2044         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2045                          "bad operation or unsupported loop bound.\n");
2046       return false;
2047     }
2048
2049   /* If epilog loop is required because of data accesses with gaps,
2050      one additional iteration needs to be peeled.  Check if there is
2051      enough iterations for vectorization.  */
2052   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2053       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2054     {
2055       int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2056       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2057
           /* Note the comparison uses the NITERSM1 value, not NITERS.  */
2058       if (wi::to_widest (scalar_niters) < vf)
2059         {
2060           if (dump_enabled_p ())
2061             dump_printf_loc (MSG_NOTE, vect_location,
2062                              "loop has no enough iterations to support"
2063                              " peeling for gaps.\n");
2064           return false;
2065         }
2066     }
2067
2068   /* Analyze cost.  Decide if worth while to vectorize.  */
2069   int min_profitable_estimate, min_profitable_iters;
2070   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2071                                       &min_profitable_estimate);
2072
       /* A negative result is treated as "never profitable".  */
2073   if (min_profitable_iters < 0)
2074     {
2075       if (dump_enabled_p ())
2076         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2077                          "not vectorized: vectorization not profitable.\n");
2078       if (dump_enabled_p ())
2079         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2080                          "not vectorized: vector version will never be "
2081                          "profitable.\n");
2082       goto again;
2083     }
2084
2085   min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2086                            * vectorization_factor);
2087
2088   /* Use the cost model only if it is more conservative than user specified
2089      threshold.  */
2090   th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2091
2092   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2093
2094   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2095       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2096     {
2097       if (dump_enabled_p ())
2098         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2099                          "not vectorized: vectorization not profitable.\n");
2100       if (dump_enabled_p ())
2101         dump_printf_loc (MSG_NOTE, vect_location,
2102                          "not vectorized: iteration count smaller than user "
2103                          "specified loop bound parameter or minimum profitable "
2104                          "iterations (whichever is more conservative).\n");
2105       goto again;
2106     }
2107
       /* Compare the estimated iteration count against the threshold too,
          falling back to MAX_NITER when no estimate is available (-1).  */
2108   estimated_niter
2109     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2110   if (estimated_niter == -1)
2111     estimated_niter = max_niter;
2112   if (estimated_niter != -1
2113       && ((unsigned HOST_WIDE_INT) estimated_niter
2114           < MAX (th, (unsigned) min_profitable_estimate)))
2115     {
2116       if (dump_enabled_p ())
2117         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2118                          "not vectorized: estimated iteration count too "
2119                          "small.\n");
2120       if (dump_enabled_p ())
2121         dump_printf_loc (MSG_NOTE, vect_location,
2122                          "not vectorized: estimated iteration count smaller "
2123                          "than specified loop bound parameter or minimum "
2124                          "profitable iterations (whichever is more "
2125                          "conservative).\n");
2126       goto again;
2127     }
2128
2129   /* Decide whether we need to create an epilogue loop to handle
2130      remaining scalar iterations.  TH becomes the cost model threshold
          rounded down to a multiple of the vectorization factor.  */
2131   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo)
2132          / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2133         * LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2134
       /* With a known iteration count and a positive peel count, peeling for
          niters is needed when the iterations left after the alignment
          peel have fewer trailing zero bits than log2 of the vectorization
          factor, i.e. are not a multiple of it (assuming the factor is a
          power of two).  */
2135   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2136       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2137     {
2138       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2139                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2140           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2141         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2142     }
2143   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2144            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2145                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2146                /* In case of versioning, check if the maximum number of
2147                   iterations is greater than th.  If they are identical,
2148                   the epilogue is unnecessary.  */
2149                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2150                    || (unsigned HOST_WIDE_INT) max_niter > th)))
2151     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2152
2153   /* If an epilogue loop is required make sure we can create one.  */
2154   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2155       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2156     {
2157       if (dump_enabled_p ())
2158         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2159       if (!vect_can_advance_ivs_p (loop_vinfo)
2160           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2161                                            single_exit (LOOP_VINFO_LOOP
2162                                                          (loop_vinfo))))
2163         {
2164           if (dump_enabled_p ())
2165             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2166                              "not vectorized: can't create required "
2167                              "epilog loop\n");
2168           goto again;
2169         }
2170     }
2171
2172   /* During peeling, we need to check if number of loop iterations is
2173      enough for both peeled prolog loop and vector loop.  This check
2174      can be merged along with threshold check of loop versioning, so
2175      increase threshold for this case if necessary.  */
2176   if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
2177       && (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2178           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2179     {
2180       unsigned niters_th;
2181
2182       /* Niters for peeled prolog loop.
              NOTE(review): a negative peeling amount presumably means the
              peel count is not known at compile time, in which case the
              number of vector elements minus one is used -- confirm.  */
2183       if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2184         {
2185           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2186           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2187
2188           niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2189         }
2190       else
2191         niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2192
2193       /* Niters for at least one iteration of vectorized loop.  */
2194       niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2195       /* One additional iteration because of peeling for gap.  */
2196       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2197         niters_th++;
           /* Only ever raise the recorded threshold, never lower it.  */
2198       if (LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) < niters_th)
2199         LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = niters_th;
2200     }
2201
       /* Nothing since `start_over' may have changed the factor.  */
2202   gcc_assert (vectorization_factor
2203               == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2204
2205   /* Ok to vectorize!  */
2206   return true;
2207
2208 again:
2209   /* Try again with SLP forced off but if we didn't do any SLP there is
2210      no point in re-trying.  Since SLP is cleared below, this also limits
          the retry to a single attempt.  */
2211   if (!slp)
2212     return false;
2213
2214   /* If there are reduction chains re-trying will fail anyway.  */
2215   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2216     return false;
2217
2218   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2219      via interleaving or lane instructions.  */
2220   slp_instance instance;
2221   slp_tree node;
2222   unsigned i, j;
2223   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2224     {
2225       stmt_vec_info vinfo;
           /* Only instances whose root statement is a grouped access need
              checking.  */
2226       vinfo = vinfo_for_stmt
2227           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2228       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2229         continue;
2230       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2231       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2232       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2233       if (! vect_store_lanes_supported (vectype, size)
2234           && ! vect_grouped_store_supported (vectype, size))
2235         return false;
           /* Apply the corresponding check to each load node of the
              instance.  */
2236       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2237         {
2238           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2239           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2240           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2241           size = STMT_VINFO_GROUP_SIZE (vinfo);
2242           vectype = STMT_VINFO_VECTYPE (vinfo);
2243           if (! vect_load_lanes_supported (vectype, size)
2244               && ! vect_grouped_load_supported (vectype, single_element_p,
2245                                                 size))
2246             return false;
2247         }
2248     }
2249
2250   if (dump_enabled_p ())
2251     dump_printf_loc (MSG_NOTE, vect_location,
2252                      "re-trying with SLP disabled\n");
2253
2254   /* Roll back state appropriately.  No SLP this time.  */
2255   slp = false;
2256   /* Restore vectorization factor as it were without SLP.  */
2257   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2258   /* Free the SLP instances.  */
2259   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2260     vect_free_slp_instance (instance);
2261   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2262   /* Reset SLP type to loop_vect on all stmts: phis, ordinary stmts, and
          for stmts in a pattern also the related stmt and the stmts of its
          pattern def sequence.  */
2263   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2264     {
2265       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2266       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2267            !gsi_end_p (si); gsi_next (&si))
2268         {
2269           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2270           STMT_SLP_TYPE (stmt_info) = loop_vect;
2271         }
2272       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2273            !gsi_end_p (si); gsi_next (&si))
2274         {
2275           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2276           STMT_SLP_TYPE (stmt_info) = loop_vect;
2277           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2278             {
2279               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2280               STMT_SLP_TYPE (stmt_info) = loop_vect;
2281               for (gimple_stmt_iterator pi
2282                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2283                    !gsi_end_p (pi); gsi_next (&pi))
2284                 {
2285                   gimple *pstmt = gsi_stmt (pi);
2286                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2287                 }
2288             }
2289         }
2290     }
2291   /* Free optimized alias test DDRS.  */
2292   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2293   LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2294   /* Reset target cost data.  */
2295   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2296   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2297     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2298   /* Reset assorted flags.  */
2299   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2300   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2301   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2302
2303   goto start_over;
2304 }
2305
2306 /* Function vect_analyze_loop.
2307
2308    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2309    for it.  The different analyses will record information in the
2310    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2311    be vectorized.  */
2312 loop_vec_info
2313 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2314 {
2315   loop_vec_info loop_vinfo;
2316   unsigned int vector_sizes;
2317
2318   /* Autodetect first vector size we try.  */
2319   current_vector_size = 0;
2320   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2321
2322   if (dump_enabled_p ())
2323     dump_printf_loc (MSG_NOTE, vect_location,
2324                      "===== analyze_loop_nest =====\n");
2325
2326   if (loop_outer (loop)
2327       && loop_vec_info_for_loop (loop_outer (loop))
2328       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2329     {
2330       if (dump_enabled_p ())
2331         dump_printf_loc (MSG_NOTE, vect_location,
2332                          "outer-loop already vectorized.\n");
2333       return NULL;
2334     }
2335
2336   while (1)
2337     {
2338       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2339       loop_vinfo = vect_analyze_loop_form (loop);
2340       if (!loop_vinfo)
2341         {
2342           if (dump_enabled_p ())
2343             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2344                              "bad loop form.\n");
2345           return NULL;
2346         }
2347
2348       bool fatal = false;
2349
2350       if (orig_loop_vinfo)
2351         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2352
2353       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2354         {
2355           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2356
2357           return loop_vinfo;
2358         }
2359
2360       delete loop_vinfo;
2361
2362       vector_sizes &= ~current_vector_size;
2363       if (fatal
2364           || vector_sizes == 0
2365           || current_vector_size == 0)
2366         return NULL;
2367
2368       /* Try the next biggest vector size.  */
2369       current_vector_size = 1 << floor_log2 (vector_sizes);
2370       if (dump_enabled_p ())
2371         dump_printf_loc (MSG_NOTE, vect_location,
2372                          "***** Re-trying analysis with "
2373                          "vector size %d\n", current_vector_size);
2374     }
2375 }
2376
2377
2378 /* Function reduction_code_for_scalar_code
2379
2380    Input:
2381    CODE - tree_code of a reduction operations.
2382
2383    Output:
2384    REDUC_CODE - the corresponding tree-code to be used to reduce the
2385       vector of partial results into a single scalar result, or ERROR_MARK
2386       if the operation is a supported reduction operation, but does not have
2387       such a tree-code.
2388
2389    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2390
2391 static bool
2392 reduction_code_for_scalar_code (enum tree_code code,
2393                                 enum tree_code *reduc_code)
2394 {
2395   switch (code)
2396     {
2397       case MAX_EXPR:
2398         *reduc_code = REDUC_MAX_EXPR;
2399         return true;
2400
2401       case MIN_EXPR:
2402         *reduc_code = REDUC_MIN_EXPR;
2403         return true;
2404
2405       case PLUS_EXPR:
2406         *reduc_code = REDUC_PLUS_EXPR;
2407         return true;
2408
2409       case MULT_EXPR:
2410       case MINUS_EXPR:
2411       case BIT_IOR_EXPR:
2412       case BIT_XOR_EXPR:
2413       case BIT_AND_EXPR:
2414         *reduc_code = ERROR_MARK;
2415         return true;
2416
2417       default:
2418        return false;
2419     }
2420 }
2421
2422
2423 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2424    STMT is printed with a message MSG. */
2425
2426 static void
2427 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2428 {
2429   dump_printf_loc (msg_type, vect_location, "%s", msg);
2430   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2431 }
2432
2433
2434 /* Detect SLP reduction of the form:
2435
2436    #a1 = phi <a5, a0>
2437    a2 = operation (a1)
2438    a3 = operation (a2)
2439    a4 = operation (a3)
2440    a5 = operation (a4)
2441
2442    #a = phi <a5>
2443
2444    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2445    FIRST_STMT is the first reduction stmt in the chain
2446    (a2 = operation (a1)).
2447
2448    Return TRUE if a reduction chain was detected.  */
2449
2450 static bool
2451 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2452                        gimple *first_stmt)
2453 {
2454   struct loop *loop = (gimple_bb (phi))->loop_father;
2455   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2456   enum tree_code code;
2457   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2458   stmt_vec_info use_stmt_info, current_stmt_info;
2459   tree lhs;
2460   imm_use_iterator imm_iter;
2461   use_operand_p use_p;
2462   int nloop_uses, size = 0, n_out_of_loop_uses;
2463   bool found = false;
2464
2465   if (loop != vect_loop)
2466     return false;
2467
2468   lhs = PHI_RESULT (phi);
2469   code = gimple_assign_rhs_code (first_stmt);
2470   while (1)
2471     {
2472       nloop_uses = 0;
2473       n_out_of_loop_uses = 0;
2474       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2475         {
2476           gimple *use_stmt = USE_STMT (use_p);
2477           if (is_gimple_debug (use_stmt))
2478             continue;
2479
2480           /* Check if we got back to the reduction phi.  */
2481           if (use_stmt == phi)
2482             {
2483               loop_use_stmt = use_stmt;
2484               found = true;
2485               break;
2486             }
2487
2488           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2489             {
2490               loop_use_stmt = use_stmt;
2491               nloop_uses++;
2492             }
2493            else
2494              n_out_of_loop_uses++;
2495
2496            /* There are can be either a single use in the loop or two uses in
2497               phi nodes.  */
2498            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2499              return false;
2500         }
2501
2502       if (found)
2503         break;
2504
2505       /* We reached a statement with no loop uses.  */
2506       if (nloop_uses == 0)
2507         return false;
2508
2509       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2510       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2511         return false;
2512
2513       if (!is_gimple_assign (loop_use_stmt)
2514           || code != gimple_assign_rhs_code (loop_use_stmt)
2515           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2516         return false;
2517
2518       /* Insert USE_STMT into reduction chain.  */
2519       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2520       if (current_stmt)
2521         {
2522           current_stmt_info = vinfo_for_stmt (current_stmt);
2523           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2524           GROUP_FIRST_ELEMENT (use_stmt_info)
2525             = GROUP_FIRST_ELEMENT (current_stmt_info);
2526         }
2527       else
2528         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2529
2530       lhs = gimple_assign_lhs (loop_use_stmt);
2531       current_stmt = loop_use_stmt;
2532       size++;
2533    }
2534
2535   if (!found || loop_use_stmt != phi || size < 2)
2536     return false;
2537
2538   /* Swap the operands, if needed, to make the reduction operand be the second
2539      operand.  */
2540   lhs = PHI_RESULT (phi);
2541   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2542   while (next_stmt)
2543     {
2544       if (gimple_assign_rhs2 (next_stmt) == lhs)
2545         {
2546           tree op = gimple_assign_rhs1 (next_stmt);
2547           gimple *def_stmt = NULL;
2548
2549           if (TREE_CODE (op) == SSA_NAME)
2550             def_stmt = SSA_NAME_DEF_STMT (op);
2551
2552           /* Check that the other def is either defined in the loop
2553              ("vect_internal_def"), or it's an induction (defined by a
2554              loop-header phi-node).  */
2555           if (def_stmt
2556               && gimple_bb (def_stmt)
2557               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2558               && (is_gimple_assign (def_stmt)
2559                   || is_gimple_call (def_stmt)
2560                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2561                            == vect_induction_def
2562                   || (gimple_code (def_stmt) == GIMPLE_PHI
2563                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2564                                   == vect_internal_def
2565                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2566             {
2567               lhs = gimple_assign_lhs (next_stmt);
2568               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2569               continue;
2570             }
2571
2572           return false;
2573         }
2574       else
2575         {
2576           tree op = gimple_assign_rhs2 (next_stmt);
2577           gimple *def_stmt = NULL;
2578
2579           if (TREE_CODE (op) == SSA_NAME)
2580             def_stmt = SSA_NAME_DEF_STMT (op);
2581
2582           /* Check that the other def is either defined in the loop
2583             ("vect_internal_def"), or it's an induction (defined by a
2584             loop-header phi-node).  */
2585           if (def_stmt
2586               && gimple_bb (def_stmt)
2587               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2588               && (is_gimple_assign (def_stmt)
2589                   || is_gimple_call (def_stmt)
2590                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2591                               == vect_induction_def
2592                   || (gimple_code (def_stmt) == GIMPLE_PHI
2593                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2594                                   == vect_internal_def
2595                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2596             {
2597               if (dump_enabled_p ())
2598                 {
2599                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2600                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2601                 }
2602
2603               swap_ssa_operands (next_stmt,
2604                                  gimple_assign_rhs1_ptr (next_stmt),
2605                                  gimple_assign_rhs2_ptr (next_stmt));
2606               update_stmt (next_stmt);
2607
2608               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2609                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2610             }
2611           else
2612             return false;
2613         }
2614
2615       lhs = gimple_assign_lhs (next_stmt);
2616       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2617     }
2618
2619   /* Save the chain for further analysis in SLP detection.  */
2620   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2621   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2622   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2623
2624   return true;
2625 }
2626
2627
2628 /* Function vect_is_simple_reduction
2629
2630    (1) Detect a cross-iteration def-use cycle that represents a simple
2631    reduction computation.  We look for the following pattern:
2632
2633    loop_header:
2634      a1 = phi < a0, a2 >
2635      a3 = ...
2636      a2 = operation (a3, a1)
2637
2638    or
2639
2640    a3 = ...
2641    loop_header:
2642      a1 = phi < a0, a2 >
2643      a2 = operation (a3, a1)
2644
2645    such that:
2646    1. operation is commutative and associative and it is safe to
2647       change the order of the computation
2648    2. no uses for a2 in the loop (a2 is used out of the loop)
2649    3. no uses of a1 in the loop besides the reduction operation
2650    4. no uses of a1 outside the loop.
2651
2652    Conditions 1,4 are tested here.
2653    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2654
2655    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2656    nested cycles.
2657
2658    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2659    reductions:
2660
2661      a1 = phi < a0, a2 >
2662      inner loop (def of a3)
2663      a2 = phi < a3 >
2664
2665    (4) Detect condition expressions, ie:
2666      for (int i = 0; i < N; i++)
2667        if (a[i] < val)
2668         ret_val = a[i];
2669
2670 */
2671
2672 static gimple *
2673 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2674                           bool *double_reduc,
2675                           bool need_wrapping_integral_overflow,
2676                           enum vect_reduction_type *v_reduc_type)
2677 {
2678   struct loop *loop = (gimple_bb (phi))->loop_father;
2679   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2680   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2681   enum tree_code orig_code, code;
2682   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2683   tree type;
2684   int nloop_uses;
2685   tree name;
2686   imm_use_iterator imm_iter;
2687   use_operand_p use_p;
2688   bool phi_def;
2689
2690   *double_reduc = false;
2691   *v_reduc_type = TREE_CODE_REDUCTION;
2692
2693   tree phi_name = PHI_RESULT (phi);
2694   /* ???  If there are no uses of the PHI result the inner loop reduction
2695      won't be detected as possibly double-reduction by vectorizable_reduction
2696      because that tries to walk the PHI arg from the preheader edge which
2697      can be constant.  See PR60382.  */
2698   if (has_zero_uses (phi_name))
2699     return NULL;
2700   nloop_uses = 0;
2701   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2702     {
2703       gimple *use_stmt = USE_STMT (use_p);
2704       if (is_gimple_debug (use_stmt))
2705         continue;
2706
2707       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2708         {
2709           if (dump_enabled_p ())
2710             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2711                              "intermediate value used outside loop.\n");
2712
2713           return NULL;
2714         }
2715
2716       nloop_uses++;
2717       if (nloop_uses > 1)
2718         {
2719           if (dump_enabled_p ())
2720             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2721                              "reduction value used in loop.\n");
2722           return NULL;
2723         }
2724
2725       phi_use_stmt = use_stmt;
2726     }
2727
2728   edge latch_e = loop_latch_edge (loop);
2729   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2730   if (TREE_CODE (loop_arg) != SSA_NAME)
2731     {
2732       if (dump_enabled_p ())
2733         {
2734           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2735                            "reduction: not ssa_name: ");
2736           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2737           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2738         }
2739       return NULL;
2740     }
2741
2742   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2743   if (is_gimple_assign (def_stmt))
2744     {
2745       name = gimple_assign_lhs (def_stmt);
2746       phi_def = false;
2747     }
2748   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2749     {
2750       name = PHI_RESULT (def_stmt);
2751       phi_def = true;
2752     }
2753   else
2754     {
2755       if (dump_enabled_p ())
2756         {
2757           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2758                            "reduction: unhandled reduction operation: ");
2759           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2760         }
2761       return NULL;
2762     }
2763
2764   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2765     return NULL;
2766
2767   nloop_uses = 0;
2768   auto_vec<gphi *, 3> lcphis;
2769   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2770     {
2771       gimple *use_stmt = USE_STMT (use_p);
2772       if (is_gimple_debug (use_stmt))
2773         continue;
2774       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2775         nloop_uses++;
2776       else
2777         /* We can have more than one loop-closed PHI.  */
2778         lcphis.safe_push (as_a <gphi *> (use_stmt));
2779       if (nloop_uses > 1)
2780         {
2781           if (dump_enabled_p ())
2782             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2783                              "reduction used in loop.\n");
2784           return NULL;
2785         }
2786     }
2787
2788   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2789      defined in the inner loop.  */
2790   if (phi_def)
2791     {
2792       op1 = PHI_ARG_DEF (def_stmt, 0);
2793
2794       if (gimple_phi_num_args (def_stmt) != 1
2795           || TREE_CODE (op1) != SSA_NAME)
2796         {
2797           if (dump_enabled_p ())
2798             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2799                              "unsupported phi node definition.\n");
2800
2801           return NULL;
2802         }
2803
2804       def1 = SSA_NAME_DEF_STMT (op1);
2805       if (gimple_bb (def1)
2806           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2807           && loop->inner
2808           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2809           && is_gimple_assign (def1)
2810           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2811         {
2812           if (dump_enabled_p ())
2813             report_vect_op (MSG_NOTE, def_stmt,
2814                             "detected double reduction: ");
2815
2816           *double_reduc = true;
2817           return def_stmt;
2818         }
2819
2820       return NULL;
2821     }
2822
2823   /* If we are vectorizing an inner reduction we are executing that
2824      in the original order only in case we are not dealing with a
2825      double reduction.  */
2826   bool check_reduction = true;
2827   if (flow_loop_nested_p (vect_loop, loop))
2828     {
2829       gphi *lcphi;
2830       unsigned i;
2831       check_reduction = false;
2832       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2833         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2834           {
2835             gimple *use_stmt = USE_STMT (use_p);
2836             if (is_gimple_debug (use_stmt))
2837               continue;
2838             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2839               check_reduction = true;
2840           }
2841     }
2842
2843   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2844   code = orig_code = gimple_assign_rhs_code (def_stmt);
2845
2846   /* We can handle "res -= x[i]", which is non-associative by
2847      simply rewriting this into "res += -x[i]".  Avoid changing
2848      gimple instruction for the first simple tests and only do this
2849      if we're allowed to change code at all.  */
2850   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2851     code = PLUS_EXPR;
2852
2853   if (code == COND_EXPR)
2854     {
2855       if (! nested_in_vect_loop)
2856         *v_reduc_type = COND_REDUCTION;
2857
2858       op3 = gimple_assign_rhs1 (def_stmt);
2859       if (COMPARISON_CLASS_P (op3))
2860         {
2861           op4 = TREE_OPERAND (op3, 1);
2862           op3 = TREE_OPERAND (op3, 0);
2863         }
2864       if (op3 == phi_name || op4 == phi_name)
2865         {
2866           if (dump_enabled_p ())
2867             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2868                             "reduction: condition depends on previous"
2869                             " iteration: ");
2870           return NULL;
2871         }
2872
2873       op1 = gimple_assign_rhs2 (def_stmt);
2874       op2 = gimple_assign_rhs3 (def_stmt);
2875     }
2876   else if (!commutative_tree_code (code) || !associative_tree_code (code))
2877     {
2878       if (dump_enabled_p ())
2879         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2880                         "reduction: not commutative/associative: ");
2881       return NULL;
2882     }
2883   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2884     {
2885       op1 = gimple_assign_rhs1 (def_stmt);
2886       op2 = gimple_assign_rhs2 (def_stmt);
2887     }
2888   else
2889     {
2890       if (dump_enabled_p ())
2891         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2892                         "reduction: not handled operation: ");
2893       return NULL;
2894     }
2895
2896   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2897     {
2898       if (dump_enabled_p ())
2899         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2900                         "reduction: both uses not ssa_names: ");
2901
2902       return NULL;
2903     }
2904
2905   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2906   if ((TREE_CODE (op1) == SSA_NAME
2907        && !types_compatible_p (type,TREE_TYPE (op1)))
2908       || (TREE_CODE (op2) == SSA_NAME
2909           && !types_compatible_p (type, TREE_TYPE (op2)))
2910       || (op3 && TREE_CODE (op3) == SSA_NAME
2911           && !types_compatible_p (type, TREE_TYPE (op3)))
2912       || (op4 && TREE_CODE (op4) == SSA_NAME
2913           && !types_compatible_p (type, TREE_TYPE (op4))))
2914     {
2915       if (dump_enabled_p ())
2916         {
2917           dump_printf_loc (MSG_NOTE, vect_location,
2918                            "reduction: multiple types: operation type: ");
2919           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2920           dump_printf (MSG_NOTE, ", operands types: ");
2921           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2922                              TREE_TYPE (op1));
2923           dump_printf (MSG_NOTE, ",");
2924           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2925                              TREE_TYPE (op2));
2926           if (op3)
2927             {
2928               dump_printf (MSG_NOTE, ",");
2929               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2930                                  TREE_TYPE (op3));
2931             }
2932
2933           if (op4)
2934             {
2935               dump_printf (MSG_NOTE, ",");
2936               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2937                                  TREE_TYPE (op4));
2938             }
2939           dump_printf (MSG_NOTE, "\n");
2940         }
2941
2942       return NULL;
2943     }
2944
2945   /* Check that it's ok to change the order of the computation.
2946      Generally, when vectorizing a reduction we change the order of the
2947      computation.  This may change the behavior of the program in some
2948      cases, so we need to check that this is ok.  One exception is when
2949      vectorizing an outer-loop: the inner-loop is executed sequentially,
2950      and therefore vectorizing reductions in the inner-loop during
2951      outer-loop vectorization is safe.  */
2952
2953   if (*v_reduc_type != COND_REDUCTION
2954       && check_reduction)
2955     {
2956       /* CHECKME: check for !flag_finite_math_only too?  */
2957       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2958         {
2959           /* Changing the order of operations changes the semantics.  */
2960           if (dump_enabled_p ())
2961             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2962                         "reduction: unsafe fp math optimization: ");
2963           return NULL;
2964         }
2965       else if (INTEGRAL_TYPE_P (type))
2966         {
2967           if (!operation_no_trapping_overflow (type, code))
2968             {
2969               /* Changing the order of operations changes the semantics.  */
2970               if (dump_enabled_p ())
2971                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2972                                 "reduction: unsafe int math optimization"
2973                                 " (overflow traps): ");
2974               return NULL;
2975             }
2976           if (need_wrapping_integral_overflow
2977               && !TYPE_OVERFLOW_WRAPS (type)
2978               && operation_can_overflow (code))
2979             {
2980               /* Changing the order of operations changes the semantics.  */
2981               if (dump_enabled_p ())
2982                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2983                                 "reduction: unsafe int math optimization"
2984                                 " (overflow doesn't wrap): ");
2985               return NULL;
2986             }
2987         }
2988       else if (SAT_FIXED_POINT_TYPE_P (type))
2989         {
2990           /* Changing the order of operations changes the semantics.  */
2991           if (dump_enabled_p ())
2992           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2993                           "reduction: unsafe fixed-point math optimization: ");
2994           return NULL;
2995         }
2996     }
2997
2998   /* Reduction is safe. We're dealing with one of the following:
2999      1) integer arithmetic and no trapv
3000      2) floating point arithmetic, and special flags permit this optimization
3001      3) nested cycle (i.e., outer loop vectorization).  */
3002   if (TREE_CODE (op1) == SSA_NAME)
3003     def1 = SSA_NAME_DEF_STMT (op1);
3004
3005   if (TREE_CODE (op2) == SSA_NAME)
3006     def2 = SSA_NAME_DEF_STMT (op2);
3007
3008   if (code != COND_EXPR
3009       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3010     {
3011       if (dump_enabled_p ())
3012         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3013       return NULL;
3014     }
3015
3016   /* Check that one def is the reduction def, defined by PHI,
3017      the other def is either defined in the loop ("vect_internal_def"),
3018      or it's an induction (defined by a loop-header phi-node).  */
3019
3020   if (def2 && def2 == phi
3021       && (code == COND_EXPR
3022           || !def1 || gimple_nop_p (def1)
3023           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3024           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3025               && (is_gimple_assign (def1)
3026                   || is_gimple_call (def1)
3027                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3028                       == vect_induction_def
3029                   || (gimple_code (def1) == GIMPLE_PHI
3030                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3031                           == vect_internal_def
3032                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3033     {
3034       if (dump_enabled_p ())
3035         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3036       return def_stmt;
3037     }
3038
3039   if (def1 && def1 == phi
3040       && (code == COND_EXPR
3041           || !def2 || gimple_nop_p (def2)
3042           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3043           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3044               && (is_gimple_assign (def2)
3045                   || is_gimple_call (def2)
3046                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3047                        == vect_induction_def
3048                   || (gimple_code (def2) == GIMPLE_PHI
3049                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3050                            == vect_internal_def
3051                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3052     {
3053       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3054         {
3055           /* Check if we can swap operands (just for simplicity - so that
3056              the rest of the code can assume that the reduction variable
3057              is always the last (second) argument).  */
3058           if (code == COND_EXPR)
3059             {
3060               /* Swap cond_expr by inverting the condition.  */
3061               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3062               enum tree_code invert_code = ERROR_MARK;
3063               enum tree_code cond_code = TREE_CODE (cond_expr);
3064
3065               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3066                 {
3067                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3068                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3069                 }
3070               if (invert_code != ERROR_MARK)
3071                 {
3072                   TREE_SET_CODE (cond_expr, invert_code);
3073                   swap_ssa_operands (def_stmt,
3074                                      gimple_assign_rhs2_ptr (def_stmt),
3075                                      gimple_assign_rhs3_ptr (def_stmt));
3076                 }
3077               else
3078                 {
3079                   if (dump_enabled_p ())
3080                     report_vect_op (MSG_NOTE, def_stmt,
3081                                     "detected reduction: cannot swap operands "
3082                                     "for cond_expr");
3083                   return NULL;
3084                 }
3085             }
3086           else
3087             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3088                                gimple_assign_rhs2_ptr (def_stmt));
3089
3090           if (dump_enabled_p ())
3091             report_vect_op (MSG_NOTE, def_stmt,
3092                             "detected reduction: need to swap operands: ");
3093
3094           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3095             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3096         }
3097       else
3098         {
3099           if (dump_enabled_p ())
3100             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3101         }
3102
3103       return def_stmt;
3104     }
3105
3106   /* Try to find SLP reduction chain.  */
3107   if (! nested_in_vect_loop
3108       && code != COND_EXPR
3109       && orig_code != MINUS_EXPR
3110       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3111     {
3112       if (dump_enabled_p ())
3113         report_vect_op (MSG_NOTE, def_stmt,
3114                         "reduction: detected reduction chain: ");
3115
3116       return def_stmt;
3117     }
3118
3119   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3120   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3121   while (first)
3122     {
3123       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3124       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3125       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3126       first = next;
3127     }
3128
3129   /* Look for the expression computing loop_arg from loop PHI result.  */
3130   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3131   auto_bitmap visited;
3132   tree lookfor = PHI_RESULT (phi);
3133   ssa_op_iter curri;
3134   use_operand_p curr = op_iter_init_phiuse (&curri, as_a <gphi *>(phi),
3135                                             SSA_OP_USE);
3136   while (USE_FROM_PTR (curr) != loop_arg)
3137     curr = op_iter_next_use (&curri);
3138   curri.i = curri.numops;
3139   do
3140     {
3141       path.safe_push (std::make_pair (curri, curr));
3142       tree use = USE_FROM_PTR (curr);
3143       if (use == lookfor)
3144         break;
3145       gimple *def = SSA_NAME_DEF_STMT (use);
3146       if (gimple_nop_p (def)
3147           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3148         {
3149 pop:
3150           do
3151             {
3152               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3153               curri = x.first;
3154               curr = x.second;
3155               do
3156                 curr = op_iter_next_use (&curri);
3157               /* Skip already visited or non-SSA operands (from iterating
3158                  over PHI args).  */
3159               while (curr != NULL_USE_OPERAND_P
3160                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3161                          || ! bitmap_set_bit (visited,
3162                                               SSA_NAME_VERSION
3163                                                 (USE_FROM_PTR (curr)))));
3164             }
3165           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3166           if (curr == NULL_USE_OPERAND_P)
3167             break;
3168         }
3169       else
3170         {
3171           if (gimple_code (def) == GIMPLE_PHI)
3172             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3173           else
3174             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3175           while (curr != NULL_USE_OPERAND_P
3176                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3177                      || ! bitmap_set_bit (visited,
3178                                           SSA_NAME_VERSION
3179                                             (USE_FROM_PTR (curr)))))
3180             curr = op_iter_next_use (&curri);
3181           if (curr == NULL_USE_OPERAND_P)
3182             goto pop;
3183         }
3184     }
3185   while (1);
3186   if (dump_file && (dump_flags & TDF_DETAILS))
3187     {
3188       dump_printf_loc (MSG_NOTE, vect_location,
3189                        "reduction path: ");
3190       unsigned i;
3191       std::pair<ssa_op_iter, use_operand_p> *x;
3192       FOR_EACH_VEC_ELT (path, i, x)
3193         {
3194           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3195           dump_printf (MSG_NOTE, " ");
3196         }
3197       dump_printf (MSG_NOTE, "\n");
3198     }
3199
3200   /* Check whether the reduction path detected is valid.  */
3201   bool fail = path.length () == 0;
3202   bool neg = false;
3203   for (unsigned i = 1; i < path.length (); ++i)
3204     {
3205       gimple *use_stmt = USE_STMT (path[i].second);
3206       tree op = USE_FROM_PTR (path[i].second);
3207       if (! has_single_use (op)
3208           || ! is_gimple_assign (use_stmt))
3209         {
3210           fail = true;
3211           break;
3212         }
3213       if (gimple_assign_rhs_code (use_stmt) != code)
3214         {
3215           if (code == PLUS_EXPR
3216               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3217             {
3218               /* Track whether we negate the reduction value each iteration.  */
3219               if (gimple_assign_rhs2 (use_stmt) == op)
3220                 neg = ! neg;
3221             }
3222           else
3223             {
3224               fail = true;
3225               break;
3226             }
3227         }
3228     }
3229   if (! fail && ! neg)
3230     return def_stmt;
3231
3232   if (dump_enabled_p ())
3233     {
3234       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3235                       "reduction: unknown pattern: ");
3236     }
3237
3238   return NULL;
3239 }
3240
3241 /* Wrapper around vect_is_simple_reduction, which will modify code
3242    in-place if it enables detection of more reductions.  Arguments
3243    as there.

        If vect_is_simple_reduction hands back a statement, the detected
        reduction type and that statement are stored in PHI's stmt_vec_info,
        and PHI is stored as the reduction def in the statement's
        stmt_vec_info.  The value returned by vect_is_simple_reduction is
        passed back unchanged (nothing is recorded when it is NULL).  */
3244
3245 gimple *
3246 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3247                              bool *double_reduc,
3248                              bool need_wrapping_integral_overflow)
3249 {
       /* Only read below when DEF is non-NULL.  */
3250   enum vect_reduction_type v_reduc_type;
3251   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3252                                           need_wrapping_integral_overflow,
3253                                           &v_reduc_type);
3254   if (def)
3255     {
           /* Cross-link PHI and the reduction statement: PHI records the
              reduction type and points at DEF, DEF points back at PHI.  */
3256       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3257       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3258       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3259       reduc_def_info = vinfo_for_stmt (def);
3260       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3261     }
3262   return def;
3263 }
3264
3265 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.

        LOOP_VINFO describes the loop.  *PEEL_ITERS_EPILOGUE is set to the
        number of epilogue iterations assumed by the cost model:
          - vf/2 when the number of loop iterations is not known;
          - otherwise (niters - prologue iterations) % vf, where the prologue
            count is first clamped to niters, and bumped to vf when peeling
            for gaps is required but the remainder is zero.

        For every entry of SCALAR_COST_VEC a cost scaled by the prologue
        (resp. epilogue) iteration count is recorded in PROLOGUE_COST_VEC
        (resp. EPILOGUE_COST_VEC).  When the iteration count is unknown one
        taken branch is additionally recorded for each of the two peeled
        loops.  The return value is the sum of the values returned by
        record_stmt_cost for everything recorded here.  */
3266 int
3267 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3268                              int *peel_iters_epilogue,
3269                              stmt_vector_for_cost *scalar_cost_vec,
3270                              stmt_vector_for_cost *prologue_cost_vec,
3271                              stmt_vector_for_cost *epilogue_cost_vec)
3272 {
3273   int retval = 0;
3274   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3275
3276   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3277     {
3278       *peel_iters_epilogue = vf/2;
3279       if (dump_enabled_p ())
3280         dump_printf_loc (MSG_NOTE, vect_location,
3281                          "cost model: epilogue peel iters set to vf/2 "
3282                          "because loop iterations are unknown .\n");
3283
3284       /* If peeled iterations are known but number of scalar loop
3285          iterations are unknown, count a taken branch per peeled loop.
              The epilogue branch goes into EPILOGUE_COST_VEC and is added
              to (not assigned over) the prologue branch cost.  */
3286       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3287                                  NULL, 0, vect_prologue);
3288       retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3289                                   NULL, 0, vect_epilogue);
3290     }
3291   else
3292     {
3293       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
           /* Cannot peel more iterations than the loop executes.  */
3294       peel_iters_prologue = niters < peel_iters_prologue ?
3295                             niters : peel_iters_prologue;
3296       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3297       /* If we need to peel for gaps, but no peeling is required, we have to
3298          peel VF iterations.  */
3299       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3300         *peel_iters_epilogue = vf;
3301     }
3302
       /* Replicate the scalar iteration cost once per peeled iteration.  */
3303   stmt_info_for_cost *si;
3304   int j;
3305   if (peel_iters_prologue)
3306     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3307         {
3308           stmt_vec_info stmt_info
3309             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3310           retval += record_stmt_cost (prologue_cost_vec,
3311                                       si->count * peel_iters_prologue,
3312                                       si->kind, stmt_info, si->misalign,
3313                                       vect_prologue);
3314         }
3315   if (*peel_iters_epilogue)
3316     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3317         {
3318           stmt_vec_info stmt_info
3319             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3320           retval += record_stmt_cost (epilogue_cost_vec,
3321                                       si->count * *peel_iters_epilogue,
3322                                       si->kind, stmt_info, si->misalign,
3323                                       vect_epilogue);
3324         }
3325
3326   return retval;
3327 }
3328
3329 /* Function vect_estimate_min_profitable_iters
3330
3331    Return the number of iterations required for the vector version of the
3332    loop to be profitable relative to the cost of the scalar version of the
3333    loop.
3334
3335    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3336    of iterations for vectorization.  -1 value means loop vectorization
3337    is not profitable.  This returned value may be used for dynamic
3338    profitability check.
3339
3340    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3341    for static check against estimated number of iterations.

        Both outputs are set to 0 when the cost model is disabled for the
        loop.  As a side effect, versioning-check, branch and peeling costs
        are added to the loop's target cost data before finish_cost is
        called on it.

        The vector inside cost is kept as unsigned because finish_cost
        reports it that way; it is converted to int wherever it takes part
        in the threshold arithmetic so that a negative intermediate result
        is not turned into a huge unsigned value before the division.  */
3342
3343 static void
3344 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3345                                     int *ret_min_profitable_niters,
3346                                     int *ret_min_profitable_estimate)
3347 {
3348   int min_profitable_iters;
3349   int min_profitable_estimate;
3350   int peel_iters_prologue;
3351   int peel_iters_epilogue;
3352   unsigned vec_inside_cost = 0;
3353   int vec_outside_cost = 0;
3354   unsigned vec_prologue_cost = 0;
3355   unsigned vec_epilogue_cost = 0;
3356   int scalar_single_iter_cost = 0;
3357   int scalar_outside_cost = 0;
3358   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3359   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3360   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3361
3362   /* Cost model disabled.  */
3363   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3364     {
3365       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3366       *ret_min_profitable_niters = 0;
3367       *ret_min_profitable_estimate = 0;
3368       return;
3369     }
3370
3371   /* Requires loop versioning tests to handle misalignment.  */
3372   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3373     {
3374       /*  FIXME: Make cost depend on complexity of individual check.  */
3375       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3376       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3377                             vect_prologue);
3378       dump_printf (MSG_NOTE,
3379                    "cost model: Adding cost of checks for loop "
3380                    "versioning to treat misalignment.\n");
3381     }
3382
3383   /* Requires loop versioning with alias checks.  */
3384   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3385     {
3386       /*  FIXME: Make cost depend on complexity of individual check.  */
3387       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3388       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3389                             vect_prologue);
3390       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3391       if (len)
3392         /* Count LEN - 1 ANDs and LEN comparisons.  */
3393         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3394                               NULL, 0, vect_prologue);
3395       dump_printf (MSG_NOTE,
3396                    "cost model: Adding cost of checks for loop "
3397                    "versioning aliasing.\n");
3398     }
3399
3400   /* Requires loop versioning with niter checks.  */
3401   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3402     {
3403       /*  FIXME: Make cost depend on complexity of individual check.  */
3404       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3405                             vect_prologue);
3406       dump_printf (MSG_NOTE,
3407                    "cost model: Adding cost of checks for loop "
3408                    "versioning niters.\n");
3409     }
3410
       /* Any kind of versioning adds one taken branch to the vector path.  */
3411   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3412     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3413                           vect_prologue);
3414
3415   /* Count statements in scalar loop.  Using this as scalar cost for a single
3416      iteration for now.
3417
3418      TODO: Add outer loop support.
3419
3420      TODO: Consider assigning different costs to different scalar
3421      statements.  */
3422
3423   scalar_single_iter_cost
3424     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3425
3426   /* Add additional cost for the peeled instructions in prologue and epilogue
3427      loop.
3428
3429      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3430      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3431
3432      TODO: Build an expression that represents peel_iters for prologue and
3433      epilogue to be used in a run-time test.  */
3434
3435   if (npeel  < 0)
3436     {
3437       peel_iters_prologue = vf/2;
3438       dump_printf (MSG_NOTE, "cost model: "
3439                    "prologue peel iters set to vf/2.\n");
3440
3441       /* If peeling for alignment is unknown, loop bound of main loop becomes
3442          unknown.  */
3443       peel_iters_epilogue = vf/2;
3444       dump_printf (MSG_NOTE, "cost model: "
3445                    "epilogue peel iters set to vf/2 because "
3446                    "peeling for alignment is unknown.\n");
3447
3448       /* If peeled iterations are unknown, count a taken branch and a not taken
3449          branch per peeled loop. Even if scalar loop iterations are known,
3450          vector iterations are not known since peeled prologue iterations are
3451          not known. Hence guards remain the same.  */
3452       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3453                             NULL, 0, vect_prologue);
3454       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3455                             NULL, 0, vect_prologue);
3456       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3457                             NULL, 0, vect_epilogue);
3458       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3459                             NULL, 0, vect_epilogue);
3460       stmt_info_for_cost *si;
3461       int j;
3462       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3463         {
3464           struct _stmt_vec_info *stmt_info
3465             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3466           (void) add_stmt_cost (target_cost_data,
3467                                 si->count * peel_iters_prologue,
3468                                 si->kind, stmt_info, si->misalign,
3469                                 vect_prologue);
3470           (void) add_stmt_cost (target_cost_data,
3471                                 si->count * peel_iters_epilogue,
3472                                 si->kind, stmt_info, si->misalign,
3473                                 vect_epilogue);
3474         }
3475     }
3476   else
3477     {
3478       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3479       stmt_info_for_cost *si;
3480       int j;
3481       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3482
3483       prologue_cost_vec.create (2);
3484       epilogue_cost_vec.create (2);
3485       peel_iters_prologue = npeel;
3486
           /* Only the recorded vectors and *PEEL_ITERS_EPILOGUE are used
              here; the summed return value is deliberately ignored.  */
3487       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3488                                           &peel_iters_epilogue,
3489                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3490                                             (loop_vinfo),
3491                                           &prologue_cost_vec,
3492                                           &epilogue_cost_vec);
3493
3494       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3495         {
3496           struct _stmt_vec_info *stmt_info
3497             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3498           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3499                                 si->misalign, vect_prologue);
3500         }
3501
3502       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3503         {
3504           struct _stmt_vec_info *stmt_info
3505             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3506           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3507                                 si->misalign, vect_epilogue);
3508         }
3509
3510       prologue_cost_vec.release ();
3511       epilogue_cost_vec.release ();
3512     }
3513
3514   /* FORNOW: The scalar outside cost is incremented in one of the
3515      following ways:
3516
3517      1. The vectorizer checks for alignment and aliasing and generates
3518      a condition that allows dynamic vectorization.  A cost model
3519      check is ANDED with the versioning condition.  Hence scalar code
3520      path now has the added cost of the versioning check.
3521
3522        if (cost > th & versioning_check)
3523          jmp to vector code
3524
3525      Hence run-time scalar is incremented by not-taken branch cost.
3526
3527      2. The vectorizer then checks if a prologue is required.  If the
3528      cost model check was not done before during versioning, it has to
3529      be done before the prologue check.
3530
3531        if (cost <= th)
3532          prologue = scalar_iters
3533        if (prologue == 0)
3534          jmp to vector code
3535        else
3536          execute prologue
3537        if (prologue == num_iters)
3538          go to exit
3539
3540      Hence the run-time scalar cost is incremented by a taken branch,
3541      plus a not-taken branch, plus a taken branch cost.
3542
3543      3. The vectorizer then checks if an epilogue is required.  If the
3544      cost model check was not done before during prologue check, it
3545      has to be done with the epilogue check.
3546
3547        if (prologue == 0)
3548          jmp to vector code
3549        else
3550          execute prologue
3551        if (prologue == num_iters)
3552          go to exit
3553        vector code:
3554          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3555            jmp to epilogue
3556
3557      Hence the run-time scalar cost should be incremented by 2 taken
3558      branches.
3559
3560      TODO: The back end may reorder the BBS's differently and reverse
3561      conditions/branch directions.  Change the estimates below to
3562      something more reasonable.  */
3563
3564   /* If the number of iterations is known and we do not do versioning, we can
3565      decide whether to vectorize at compile time.  Hence the scalar version
3566      do not carry cost model guard costs.  */
3567   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3568       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3569     {
3570       /* Cost model check occurs at versioning.  */
3571       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3572         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3573       else
3574         {
3575           /* Cost model check occurs at prologue generation.  */
3576           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3577             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3578               + vect_get_stmt_cost (cond_branch_not_taken);
3579           /* Cost model check occurs at epilogue generation.  */
3580           else
3581             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3582         }
3583     }
3584
3585   /* Complete the target-specific cost calculations.  */
3586   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3587                &vec_inside_cost, &vec_epilogue_cost);
3588
3589   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3590
3591   if (dump_enabled_p ())
3592     {
3593       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3594       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3595                    vec_inside_cost);
3596       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3597                    vec_prologue_cost);
3598       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3599                    vec_epilogue_cost);
3600       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3601                    scalar_single_iter_cost);
3602       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3603                    scalar_outside_cost);
3604       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3605                    vec_outside_cost);
3606       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3607                    peel_iters_prologue);
3608       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3609                    peel_iters_epilogue);
3610     }
3611
3612   /* Calculate number of iterations required to make the vector version
3613      profitable, relative to the loop bodies only.  The following condition
3614      must hold true:
3615      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3616      where
3617      SIC = scalar iteration cost, VIC = vector iteration cost,
3618      VOC = vector outside cost, VF = vectorization factor,
3619      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3620      SOC = scalar outside cost for run time cost model check.  */
3621
3622   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3623     {
3624       if (vec_outside_cost <= 0)
3625         min_profitable_iters = 0;
3626       else
3627         {
               /* Evaluate in signed arithmetic: the numerator is negative
                  whenever the peeled iterations outweigh the outside-cost
                  difference, and must not wrap around as unsigned.  The
                  divisor is positive because of the enclosing test.  */
3628           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3629                                   - (int) vec_inside_cost * peel_iters_prologue
3630                                   - (int) vec_inside_cost * peel_iters_epilogue)
3631                                  / ((scalar_single_iter_cost * vf)
3632                                     - (int) vec_inside_cost);
3633
               /* The division truncates; bump the result if the scalar side
                  is not yet strictly cheaper at that iteration count.  */
3634           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3635               <= (((int) vec_inside_cost * min_profitable_iters)
3636                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3637             min_profitable_iters++;
3638         }
3639     }
3640   /* vector version will never be profitable.  */
3641   else
3642     {
3643       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3644         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3645                     "did not happen for a simd loop");
3646
3647       if (dump_enabled_p ())
3648         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3649                          "cost model: the vector iteration cost = %d "
3650                          "divided by the scalar iteration cost = %d "
3651                          "is greater or equal to the vectorization factor = %d"
3652                          ".\n",
3653                          vec_inside_cost, scalar_single_iter_cost, vf);
3654       *ret_min_profitable_niters = -1;
3655       *ret_min_profitable_estimate = -1;
3656       return;
3657     }
3658
3659   dump_printf (MSG_NOTE,
3660                "  Calculated minimum iters for profitability: %d\n",
3661                min_profitable_iters);
3662
3663   /* We want the vectorized loop to execute at least once.  */
3664   if (min_profitable_iters < (vf + peel_iters_prologue + peel_iters_epilogue))
3665     min_profitable_iters = vf + peel_iters_prologue + peel_iters_epilogue;
3666
3667   if (dump_enabled_p ())
3668     dump_printf_loc (MSG_NOTE, vect_location,
3669                      "  Runtime profitability threshold = %d\n",
3670                      min_profitable_iters);
3671
3672   *ret_min_profitable_niters = min_profitable_iters;
3673
3674   /* Calculate number of iterations required to make the vector version
3675      profitable, relative to the loop bodies only.
3676
3677      Non-vectorized variant is SIC * niters and it must win over vector
3678      variant on the expected loop trip count.  The following condition must hold true:
3679      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3680
3681   if (vec_outside_cost <= 0)
3682     min_profitable_estimate = 0;
3683   else
3684     {
           /* Signed arithmetic for the same reason as above; the divisor is
              positive here because the unprofitable case returned early.  */
3685       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3686                                  - (int) vec_inside_cost * peel_iters_prologue
3687                                  - (int) vec_inside_cost * peel_iters_epilogue)
3688                                  / ((scalar_single_iter_cost * vf)
3689                                    - (int) vec_inside_cost);
3690     }
       /* The static threshold is never below the runtime threshold.  */
3691   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3692   if (dump_enabled_p ())
3693     dump_printf_loc (MSG_NOTE, vect_location,
3694                      "  Static estimate profitability threshold = %d\n",
3695                      min_profitable_estimate);
3696
3697   *ret_min_profitable_estimate = min_profitable_estimate;
3698 }
3699
3700 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3701    vector elements (not bits) for a vector of mode MODE.

        Exactly GET_MODE_NUNITS (MODE) entries of SEL are written, so the
        caller must provide at least that many.  Entry I is I + OFFSET
        reduced with a bit mask of 2*nelt - 1; that acts as a modulo only
        when 2*nelt is a power of two -- presumably always true for the
        modes passed here, TODO confirm.  */
3702 static void
3703 calc_vec_perm_mask_for_shift (machine_mode mode, unsigned int offset,
3704                               unsigned char *sel)
3705 {
3706   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3707
3708   for (i = 0; i < nelt; i++)
3709     sel[i] = (i + offset) & (2*nelt - 1);
3710 }
3711
3712 /* Checks whether the target supports whole-vector shifts for vectors of mode
3713    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3714    it supports vec_perm_const with masks for all necessary shift amounts.

        The shift amounts probed are nelt/2, nelt/4, ..., 1 elements, where
        nelt is GET_MODE_NUNITS (MODE).  Returns true as soon as vec_shr is
        available, false as soon as one probed permutation is rejected.  */
3715 static bool
3716 have_whole_vector_shift (machine_mode mode)
3717 {
3718   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3719     return true;
3720
       /* Without constant permutes there is no fallback.  */
3721   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3722     return false;
3723
3724   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3725   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3726
       /* Every halving step must be expressible as a permutation; the
          loop body does not run at all when nelt is 1.  */
3727   for (i = nelt/2; i >= 1; i/=2)
3728     {
3729       calc_vec_perm_mask_for_shift (mode, i, sel);
3730       if (!can_vec_perm_p (mode, false, sel))
3731         return false;
3732     }
3733   return true;
3734 }
3735
3736 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3737    functions. Design better to avoid maintenance issues.  */
3738
3739 /* Function vect_model_reduction_cost.
3740
3741    Models cost for a reduction operation, including the vector ops
3742    generated within the strip-mine loop, the initial definition before
3743    the loop, and the epilogue code that must be generated.

        STMT_INFO describes the reduction statement.  REDUC_CODE is tested
        only against ERROR_MARK: any other value selects the costing for a
        reduction done in one statement, ERROR_MARK selects one of the
        fallback epilogue costings.  NCOPIES is the number of vector
        statements costed inside the loop (doubled for COND_REDUCTION).

        All costs are added to the target cost data of the loop or basic
        block that STMT_INFO belongs to; nothing is returned, the three
        partial sums are only printed to the dump file.  */
3744
3745 static void
3746 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3747                            int ncopies)
3748 {
3749   int prologue_cost = 0, epilogue_cost = 0;
3750   enum tree_code code;
3751   optab optab;
3752   tree vectype;
3753   gimple *orig_stmt;
3754   machine_mode mode;
3755   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3756   struct loop *loop = NULL;
3757   void *target_cost_data;
3758
       /* LOOP stays NULL when STMT_INFO has no loop_vec_info; the cost data
          is then taken from its bb_vec_info.  */
3759   if (loop_vinfo)
3760     {
3761       loop = LOOP_VINFO_LOOP (loop_vinfo);
3762       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3763     }
3764   else
3765     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3766
3767   /* Condition reductions generate two reductions in the loop.  */
3768   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3769     ncopies *= 2;
3770
3771   /* Cost of reduction op inside loop.  */
3772   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3773                                         stmt_info, 0, vect_body);
3774
3775   vectype = STMT_VINFO_VECTYPE (stmt_info);
3776   mode = TYPE_MODE (vectype);
3777   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3778
       /* No related statement recorded: use the statement itself.  */
3779   if (!orig_stmt)
3780     orig_stmt = STMT_VINFO_STMT (stmt_info);
3781
3782   code = gimple_assign_rhs_code (orig_stmt);
3783
3784   /* Add in cost for initial definition.
3785      For cond reduction we have four vectors: initial index, step, initial
3786      result of the data reduction, initial value of the index reduction.  */
3787   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3788                        == COND_REDUCTION ? 4 : 1;
3789   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3790                                   scalar_to_vec, stmt_info, 0,
3791                                   vect_prologue);
3792
3793   /* Determine cost of epilogue code.
3794
3795      We have a reduction operator that will reduce the vector in one statement.
3796      Also requires scalar extract.  */
3797
       /* No epilogue cost is added when ORIG_STMT is nested in LOOP as
          determined by nested_in_vect_loop_p.  */
3798   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3799     {
3800       if (reduc_code != ERROR_MARK)
3801         {
3802           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3803             {
3804               /* An EQ stmt and an COND_EXPR stmt.  */
3805               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3806                                               vector_stmt, stmt_info, 0,
3807                                               vect_epilogue);
3808               /* Reduction of the max index and a reduction of the found
3809                  values.  */
3810               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3811                                               vec_to_scalar, stmt_info, 0,
3812                                               vect_epilogue);
3813               /* A broadcast of the max value.  */
3814               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3815                                               scalar_to_vec, stmt_info, 0,
3816                                               vect_epilogue);
3817             }
3818           else
3819             {
3820               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3821                                               stmt_info, 0, vect_epilogue);
3822               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3823                                               vec_to_scalar, stmt_info, 0,
3824                                               vect_epilogue);
3825             }
3826         }
3827       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3828         {
3829           unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
3830           /* Extraction of scalar elements.  */
3831           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits,
3832                                           vec_to_scalar, stmt_info, 0,
3833                                           vect_epilogue);
3834           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
               /* NOTE(review): 2 * nunits - 3 is computed in unsigned and
                  wraps if nunits is 1 -- confirm that cannot occur.  */
3835           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits - 3,
3836                                           scalar_stmt, stmt_info, 0,
3837                                           vect_epilogue);
3838         }
3839       else
3840         {
3841           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3842           tree bitsize =
3843             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3844           int element_bitsize = tree_to_uhwi (bitsize);
3845           int nelements = vec_size_in_bits / element_bitsize;
3846
               /* A COND_EXPR is costed as if it were a MAX_EXPR when looking
                  up the optab below.  */
3847           if (code == COND_EXPR)
3848             code = MAX_EXPR;
3849
3850           optab = optab_for_tree_code (code, vectype, optab_default);
3851
3852           /* We have a whole vector shift available.  */
3853           if (optab != unknown_optab
3854               && VECTOR_MODE_P (mode)
3855               && optab_handler (optab, mode) != CODE_FOR_nothing
3856               && have_whole_vector_shift (mode))
3857             {
3858               /* Final reduction via vector shifts and the reduction operator.
3859                  Also requires scalar extract.  */
3860               epilogue_cost += add_stmt_cost (target_cost_data,
3861                                               exact_log2 (nelements) * 2,
3862                                               vector_stmt, stmt_info, 0,
3863                                               vect_epilogue);
3864               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3865                                               vec_to_scalar, stmt_info, 0,
3866                                               vect_epilogue);
3867             }
3868           else
3869             /* Use extracts and reduction op for final reduction.  For N
3870                elements, we have N extracts and N-1 reduction ops.  */
3871             epilogue_cost += add_stmt_cost (target_cost_data,
3872                                             nelements + nelements - 1,
3873                                             vector_stmt, stmt_info, 0,
3874                                             vect_epilogue);
3875         }
3876     }
3877
3878   if (dump_enabled_p ())
3879     dump_printf (MSG_NOTE,
3880                  "vect_model_reduction_cost: inside_cost = %d, "
3881                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3882                  prologue_cost, epilogue_cost);
3883 }
3884
3885
3886 /* Function vect_model_induction_cost.
3887
3888    Models cost for induction operations.

        STMT_INFO is the stmt_vec_info of the induction statement and NCOPIES
        is the count recorded for the vector statement in the loop body.
        Nothing is recorded when STMT_INFO is a pure SLP statement.  The
        costs are registered through add_stmt_cost on the loop's target cost
        data; the values it returns are used here only for the dump
        message.  */
3889
3890 static void
3891 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3892 {
3893   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3894   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3895   unsigned inside_cost, prologue_cost;
3896
       /* No cost is recorded by this function for pure SLP statements.  */
3897   if (PURE_SLP_STMT (stmt_info))
3898     return;
3899
3900   /* Loop cost for vec_loop: NCOPIES vector stmts in the loop body.  */
3901   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3902                                stmt_info, 0, vect_body);
3903
3904   /* Prologue cost for vec_init and vec_step: two scalar_to_vec stmts.  */
3905   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3906                                  stmt_info, 0, vect_prologue);
3907
3908   if (dump_enabled_p ())
3909     dump_printf_loc (MSG_NOTE, vect_location,
3910                      "vect_model_induction_cost: inside_cost = %d, "
3911                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3912 }
3913
3914
3915
3916 /* Function get_initial_def_for_reduction
3917
3918    Input:
3919    STMT - a stmt that performs a reduction operation in the loop.
3920    INIT_VAL - the initial value of the reduction variable
3921
3922    Output:
3923    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3924         of the reduction (used for adjusting the epilog - see below).
             May be NULL, in which case Option2 below is used.  When it is
             not NULL it is set to INIT_VAL for the add/mult/bitwise group
             and to NULL for min/max/cond_expr and for nested reductions.
3925    Return a vector variable, initialized according to the operation that STMT
3926         performs. This vector will be used as the initial value of the
3927         vector of partial results.
3928
3929    Option1 (adjust in epilog): Initialize the vector as follows:
3930      add/bit or/xor:    [0,0,...,0,0]
3931      mult:              [1,1,...,1,1]
          bit and:           [-1,-1,...,-1,-1]
3932      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3933    and when necessary (e.g. add/mult case) let the caller know
3934    that it needs to adjust the result by init_val.
3935
3936    Option2: Initialize the vector as follows:
3937      add/bit or/xor:    [init_val,0,0,...,0]
3938      mult:              [init_val,1,1,...,1]
          bit and:           [init_val,-1,-1,...,-1]
3939      min/max/cond_expr: [init_val,init_val,...,init_val]
3940    and no adjustments are needed.
3941
3942    For example, for the following code:
3943
3944    s = init_val;
3945    for (i=0;i<n;i++)
3946      s = s + a[i];
3947
3948    STMT is 's = s + a[i]', and the reduction variable is 's'.
3949    For a vector of 4 units, we want to return either [init_val,0,0,0],
3950    or [0,0,0,0] and let the caller know that it needs to adjust
3951    the result at the end by 'init_val'.
3952
3953    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3954    initialization vector is simpler (same element in all entries), if
3955    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3956
3957    A cost model should help decide between these two schemes.  */
3958
3959 tree
3960 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3961                                tree *adjustment_def)
3962 {
3963   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3964   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3965   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3966   tree scalar_type = TREE_TYPE (init_val);
3967   tree vectype = get_vectype_for_scalar_type (scalar_type);
3968   int nunits;
3969   enum tree_code code = gimple_assign_rhs_code (stmt);
3970   tree def_for_init;
3971   tree init_def;
3972   tree *elts;
3973   int i;
3974   bool nested_in_vect_loop = false;
3975   REAL_VALUE_TYPE real_init_val = dconst0;
3976   int int_init_val = 0;
3977   gimple *def_stmt = NULL;
3978   gimple_seq stmts = NULL;
3979
3980   gcc_assert (vectype);
3981   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3982
3983   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3984               || SCALAR_FLOAT_TYPE_P (scalar_type));
3985
       /* Unless STMT is nested in the loop being vectorized, it must sit
          directly in LOOP.  */
3986   if (nested_in_vect_loop_p (loop, stmt))
3987     nested_in_vect_loop = true;
3988   else
3989     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3990
3991   /* In case of double reduction we only create a vector variable to be put
3992      in the reduction phi node.  The actual statement creation is done in
3993      vect_create_epilog_for_reduction.  */
3994   if (adjustment_def && nested_in_vect_loop
3995       && TREE_CODE (init_val) == SSA_NAME
3996       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3997       && gimple_code (def_stmt) == GIMPLE_PHI
3998       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3999       && vinfo_for_stmt (def_stmt)
4000       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4001           == vect_double_reduction_def)
4002     {
4003       *adjustment_def = NULL;
4004       return vect_create_destination_var (init_val, vectype);
4005     }
4006
4007   /* In case of a nested reduction do not use an adjustment def as
4008      that case is not supported by the epilogue generation correctly
4009      if ncopies is not one.  */
4010   if (adjustment_def && nested_in_vect_loop)
4011     {
4012       *adjustment_def = NULL;
4013       return vect_get_vec_def_for_operand (init_val, stmt);
4014     }
4015
4016   switch (code)
4017     {
4018       case WIDEN_SUM_EXPR:
4019       case DOT_PROD_EXPR:
4020       case SAD_EXPR:
4021       case PLUS_EXPR:
4022       case MINUS_EXPR:
4023       case BIT_IOR_EXPR:
4024       case BIT_XOR_EXPR:
4025       case MULT_EXPR:
4026       case BIT_AND_EXPR:
4027         /* ADJUSTMENT_DEF is NULL when called from
4028            vect_create_epilog_for_reduction to vectorize double reduction.  */
4029         if (adjustment_def)
4030           *adjustment_def = init_val;
4031
             /* The filler value defaults to zero; MULT_EXPR uses one.  */
4032         if (code == MULT_EXPR)
4033           {
4034             real_init_val = dconst1;
4035             int_init_val = 1;
4036           }
4037
             /* The filler value for BIT_AND_EXPR is the integer -1.  */
4038         if (code == BIT_AND_EXPR)
4039           int_init_val = -1;
4040
4041         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4042           def_for_init = build_real (scalar_type, real_init_val);
4043         else
4044           def_for_init = build_int_cst (scalar_type, int_init_val);
4045
4046         /* Create a vector of '0', '1' or '-1' except the first element.  */
4047         elts = XALLOCAVEC (tree, nunits);
4048         for (i = nunits - 2; i >= 0; --i)
4049           elts[i + 1] = def_for_init;
4050
4051         /* Option1: the first element is '0', '1' or '-1' as well.  */
4052         if (adjustment_def)
4053           {
4054             elts[0] = def_for_init;
4055             init_def = build_vector (vectype, elts);
4056             break;
4057           }
4058
4059         /* Option2: the first element is INIT_VAL.  */
4060         elts[0] = init_val;
             /* A constant INIT_VAL goes through build_vector; otherwise the
                vector is assembled with build_constructor, INIT_VAL first.  */
4061         if (TREE_CONSTANT (init_val))
4062           init_def = build_vector (vectype, elts);
4063         else
4064           {
4065             vec<constructor_elt, va_gc> *v;
4066             vec_alloc (v, nunits);
4067             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4068             for (i = 1; i < nunits; ++i)
4069               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4070             init_def = build_constructor (vectype, v);
4071           }
4072
4073         break;
4074
4075       case MIN_EXPR:
4076       case MAX_EXPR:
4077       case COND_EXPR:
             /* No epilog adjustment for these codes.  With ADJUSTMENT_DEF
                given and a reduction type other than COND_REDUCTION the
                vector def of INIT_VAL comes from
                vect_get_vec_def_for_operand.  */
4078         if (adjustment_def)
4079           {
4080             *adjustment_def = NULL_TREE;
4081             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4082               {
4083                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4084                 break;
4085               }
4086           }
             /* Otherwise convert INIT_VAL to the element type of VECTYPE,
                insert any statements the conversion produced on the loop
                preheader edge, and build the vector from that value.  */
4087         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4088         if (! gimple_seq_empty_p (stmts))
4089           gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4090         init_def = build_vector_from_val (vectype, init_val);
4091         break;
4092
4093       default:
4094         gcc_unreachable ();
4095     }
4096
4097   return init_def;
4098 }
4099
4100 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4101    NUMBER_OF_VECTORS is the number of vector defs to create.

        The defs are appended to VEC_OPRNDS with quick_push
        (NOTE(review): presumably the caller must have reserved room for
        NUMBER_OF_VECTORS entries - confirm).  CODE is the reduction
        operation and selects the neutral element used for padding: zero
        for the sum-like, IOR and XOR codes, one for MULT_EXPR and all-ones
        for BIT_AND_EXPR.  For MIN_EXPR/MAX_EXPR a neutral element (the
        preheader argument of the first statement in SLP_NODE) is only used
        when REDUC_CHAIN is true; any other code is asserted not to be a
        reduction chain and gets no neutral element.  */
4102
4103 static void
4104 get_initial_defs_for_reduction (slp_tree slp_node,
4105                                 vec<tree> *vec_oprnds,
4106                                 unsigned int number_of_vectors,
4107                                 enum tree_code code, bool reduc_chain)
4108 {
4109   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4110   gimple *stmt = stmts[0];
4111   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4112   unsigned nunits;
4113   tree vec_cst;
4114   tree *elts;
4115   unsigned j, number_of_places_left_in_vector;
4116   tree vector_type, scalar_type;
4117   tree vop;
4118   int group_size = stmts.length ();
4119   unsigned int vec_num, i;
4120   unsigned number_of_copies = 1;
4121   vec<tree> voprnds;
4122   voprnds.create (number_of_vectors);
4123   bool constant_p;
4124   tree neutral_op = NULL;
4125   struct loop *loop;
       /* NOTE(review): CTOR_SEQ is never assigned a non-NULL value in this
          function, so the insertion guarded by it below is not reached as
          written.  */
4126   gimple_seq ctor_seq = NULL;
4127
4128   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4129   scalar_type = TREE_TYPE (vector_type);
4130   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4131
4132   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4133
4134   loop = (gimple_bb (stmt))->loop_father;
4135   gcc_assert (loop);
4136
4137   /* op is the reduction operand of the first stmt already.  */
4138   /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4139      we need either neutral operands or the original operands.  See
4140      get_initial_def_for_reduction() for details.  */
4141   switch (code)
4142     {
4143     case WIDEN_SUM_EXPR:
4144     case DOT_PROD_EXPR:
4145     case SAD_EXPR:
4146     case PLUS_EXPR:
4147     case MINUS_EXPR:
4148     case BIT_IOR_EXPR:
4149     case BIT_XOR_EXPR:
4150       neutral_op = build_zero_cst (scalar_type);
4151       break;
4152
4153     case MULT_EXPR:
4154       neutral_op = build_one_cst (scalar_type);
4155       break;
4156
4157     case BIT_AND_EXPR:
4158       neutral_op = build_all_ones_cst (scalar_type);
4159       break;
4160
4161     /* For MIN/MAX we don't have an easy neutral operand but
4162        the initial values can be used fine here.  Only for
4163        a reduction chain we have to force a neutral element.  */
4164     case MAX_EXPR:
4165     case MIN_EXPR:
4166       if (! reduc_chain)
4167         neutral_op = NULL;
4168       else
4169         neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt,
4170                                             loop_preheader_edge (loop));
4171       break;
4172
4173     default:
4174       gcc_assert (! reduc_chain);
4175       neutral_op = NULL;
4176     }
4177
4178   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4179      created vectors. It is greater than 1 if unrolling is performed.
4180
4181      For example, we have two scalar operands, s1 and s2 (e.g., group of
4182      strided accesses of size two), while NUNITS is four (i.e., four scalars
4183      of this type can be packed in a vector).  The output vector will contain
4184      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4185      will be 2).
4186
4187      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4188      containing the operands.
4189
4190      For example, NUNITS is four as before, and the group size is 8
4191      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4192      {s5, s6, s7, s8}.  */
4193
4194   number_of_copies = nunits * number_of_vectors / group_size;
4195
4196   number_of_places_left_in_vector = nunits;
4197   constant_p = true;
4198   elts = XALLOCAVEC (tree, nunits);
4199   for (j = 0; j < number_of_copies; j++)
4200     {
           /* The statements are visited last-to-first and ELTS is filled from
              its last slot backwards.  NOTE(review): I is unsigned, so this
              loop relies on iterate () failing once I-- wraps past zero -
              confirm.  */
4201       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4202         {
4203           tree op;
4204           /* Get the def before the loop.  In reduction chain we have only
4205              one initial value.  */
               /* The neutral element, when there is one, is used for every
                  copy but the last and, in a reduction chain, for every
                  statement but the first.  */
4206           if ((j != (number_of_copies - 1)
4207                || (reduc_chain && i != 0))
4208               && neutral_op)
4209             op = neutral_op;
4210           else
4211             op = PHI_ARG_DEF_FROM_EDGE (stmt,
4212                                         loop_preheader_edge (loop));
4213
4214           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4215           number_of_places_left_in_vector--;
4216           elts[number_of_places_left_in_vector] = op;
4217           if (!CONSTANT_CLASS_P (op))
4218             constant_p = false;
4219
               /* ELTS is full: materialize it, via build_vector when every
                  element is constant and via build_constructor otherwise,
                  then start a fresh vector.  */
4220           if (number_of_places_left_in_vector == 0)
4221             {
4222               if (constant_p)
4223                 vec_cst = build_vector (vector_type, elts);
4224               else
4225                 {
4226                   vec<constructor_elt, va_gc> *v;
4227                   unsigned k;
4228                   vec_alloc (v, nunits);
4229                   for (k = 0; k < nunits; ++k)
4230                     CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
4231                   vec_cst = build_constructor (vector_type, v);
4232                 }
4233               tree init;
4234               gimple_stmt_iterator gsi;
4235               init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
4236               if (ctor_seq != NULL)
4237                 {
4238                   gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
4239                   gsi_insert_seq_before_without_update (&gsi, ctor_seq,
4240                                                         GSI_SAME_STMT);
4241                   ctor_seq = NULL;
4242                 }
4243               voprnds.quick_push (init);
4244
4245               number_of_places_left_in_vector = nunits;
4246               constant_p = true;
4247             }
4248         }
4249     }
4250
4251   /* Since the vectors are created in the reverse order, we should invert
4252      them.  */
4253   vec_num = voprnds.length ();
4254   for (j = vec_num; j != 0; j--)
4255     {
4256       vop = voprnds[j - 1];
4257       vec_oprnds->quick_push (vop);
4258     }
4259
4260   voprnds.release ();
4261
4262   /* In case that VF is greater than the unrolling factor needed for the SLP
4263      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4264      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4265      to replicate the vectors.  */
4266   while (number_of_vectors > vec_oprnds->length ())
4267     {
           /* NOTE(review): NEUTRAL_VEC is declared inside the loop body, so
              it is NULL again on every iteration and the !NEUTRAL_VEC check
              below always succeeds.  */
4268       tree neutral_vec = NULL;
4269
4270       if (neutral_op)
4271         {
4272           if (!neutral_vec)
4273             neutral_vec = build_vector_from_val (vector_type, neutral_op);
4274
4275           vec_oprnds->quick_push (neutral_vec);
4276         }
4277       else
4278         {
               /* Without a neutral element, repeat the first VEC_NUM vectors
                  already pushed.  */
4279           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4280             vec_oprnds->quick_push (vop);
4281         }
4282     }
4283 }
4284
4285
4286 /* Function vect_create_epilog_for_reduction
4287
4288    Create code at the loop-epilog to finalize the result of a reduction
4289    computation.
4290
4291    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4292      reduction statements.
4293    STMT is the scalar reduction stmt that is being vectorized.
4294    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4295      number of elements that we can fit in a vectype (nunits).  In this case
4296      we have to generate more than one vector stmt - i.e - we need to "unroll"
4297      the vector stmt by a factor VF/nunits.  For more details see documentation
4298      in vectorizable_operation.
4299    REDUC_CODE is the tree-code for the epilog reduction.
4300    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4301      computation.
4302    REDUC_INDEX is the index of the operand in the right hand side of the
4303      statement that is defined by REDUCTION_PHI.
4304    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4305    SLP_NODE is an SLP node containing a group of reduction statements. The
4306      first one in this group is STMT.
4307
4308    This function:
4309    1. Creates the reduction def-use cycles: sets the arguments for
4310       REDUCTION_PHIS:
4311       The loop-entry argument is the vectorized initial-value of the reduction.
4312       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4313       sums.
4314    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4315       by applying the operation specified by REDUC_CODE if available, or by
4316       other means (whole-vector shifts or a scalar loop).
4317       The function also creates a new phi node at the loop exit to preserve
4318       loop-closed form, as illustrated below.
4319
4320      The flow at the entry to this function:
4321
4322         loop:
4323           vec_def = phi <null, null>            # REDUCTION_PHI
4324           VECT_DEF = vector_stmt                # vectorized form of STMT
4325           s_loop = scalar_stmt                  # (scalar) STMT
4326         loop_exit:
4327           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4328           use <s_out0>
4329           use <s_out0>
4330
4331      The above is transformed by this function into:
4332
4333         loop:
4334           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4335           VECT_DEF = vector_stmt                # vectorized form of STMT
4336           s_loop = scalar_stmt                  # (scalar) STMT
4337         loop_exit:
4338           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4339           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4340           v_out2 = reduce <v_out1>
4341           s_out3 = extract_field <v_out2, 0>
4342           s_out4 = adjust_result <s_out3>
4343           use <s_out4>
4344           use <s_out4>
4345 */
4346
4347 static void
4348 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4349                                   gimple *reduc_def_stmt,
4350                                   int ncopies, enum tree_code reduc_code,
4351                                   vec<gimple *> reduction_phis,
4352                                   bool double_reduc,
4353                                   slp_tree slp_node,
4354                                   slp_instance slp_node_instance)
4355 {
4356   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4357   stmt_vec_info prev_phi_info;
4358   tree vectype;
4359   machine_mode mode;
4360   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4361   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4362   basic_block exit_bb;
4363   tree scalar_dest;
4364   tree scalar_type;
4365   gimple *new_phi = NULL, *phi;
4366   gimple_stmt_iterator exit_gsi;
4367   tree vec_dest;
4368   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4369   gimple *epilog_stmt = NULL;
4370   enum tree_code code = gimple_assign_rhs_code (stmt);
4371   gimple *exit_phi;
4372   tree bitsize;
4373   tree adjustment_def = NULL;
4374   tree vec_initial_def = NULL;
4375   tree expr, def, initial_def = NULL;
4376   tree orig_name, scalar_result;
4377   imm_use_iterator imm_iter, phi_imm_iter;
4378   use_operand_p use_p, phi_use_p;
4379   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4380   bool nested_in_vect_loop = false;
4381   auto_vec<gimple *> new_phis;
4382   auto_vec<gimple *> inner_phis;
4383   enum vect_def_type dt = vect_unknown_def_type;
4384   int j, i;
4385   auto_vec<tree> scalar_results;
4386   unsigned int group_size = 1, k, ratio;
4387   auto_vec<tree> vec_initial_defs;
4388   auto_vec<gimple *> phis;
4389   bool slp_reduc = false;
4390   tree new_phi_result;
4391   gimple *inner_phi = NULL;
4392   tree induction_index = NULL_TREE;
4393
4394   if (slp_node)
4395     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4396
4397   if (nested_in_vect_loop_p (loop, stmt))
4398     {
4399       outer_loop = loop;
4400       loop = loop->inner;
4401       nested_in_vect_loop = true;
4402       gcc_assert (!slp_node);
4403     }
4404
4405   vectype = STMT_VINFO_VECTYPE (stmt_info);
4406   gcc_assert (vectype);
4407   mode = TYPE_MODE (vectype);
4408
4409   /* 1. Create the reduction def-use cycle:
4410      Set the arguments of REDUCTION_PHIS, i.e., transform
4411
4412         loop:
4413           vec_def = phi <null, null>            # REDUCTION_PHI
4414           VECT_DEF = vector_stmt                # vectorized form of STMT
4415           ...
4416
4417      into:
4418
4419         loop:
4420           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4421           VECT_DEF = vector_stmt                # vectorized form of STMT
4422           ...
4423
4424      (in case of SLP, do it for all the phis). */
4425
4426   /* Get the loop-entry arguments.  */
4427   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4428   if (slp_node)
4429     {
4430       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4431       vec_initial_defs.reserve (vec_num);
4432       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4433                                       &vec_initial_defs, vec_num, code,
4434                                       GROUP_FIRST_ELEMENT (stmt_info));
4435     }
4436   else
4437     {
4438       /* Get at the scalar def before the loop, that defines the initial value
4439          of the reduction variable.  */
4440       gimple *def_stmt;
4441       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4442                                            loop_preheader_edge (loop));
4443       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4444       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4445                                                        &adjustment_def);
4446       vec_initial_defs.create (1);
4447       vec_initial_defs.quick_push (vec_initial_def);
4448     }
4449
4450   /* Set phi nodes arguments.  */
4451   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4452     {
4453       tree vec_init_def, def;
4454       gimple_seq stmts;
4455       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4456                                            true, NULL_TREE);
4457       if (stmts)
4458         gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4459
4460       def = vect_defs[i];
4461       for (j = 0; j < ncopies; j++)
4462         {
4463           if (j != 0)
4464             {
4465               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4466               if (nested_in_vect_loop)
4467                 vec_init_def
4468                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4469                                                     vec_init_def);
4470             }
4471
4472           /* Set the loop-entry arg of the reduction-phi.  */
4473
4474           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4475               == INTEGER_INDUC_COND_REDUCTION)
4476             {
4477               /* Initialise the reduction phi to zero.  This prevents initial
4478                  values of non-zero interfering with the reduction op.  */
4479               gcc_assert (ncopies == 1);
4480               gcc_assert (i == 0);
4481
4482               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4483               tree zero_vec = build_zero_cst (vec_init_def_type);
4484
4485               add_phi_arg (as_a <gphi *> (phi), zero_vec,
4486                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4487             }
4488           else
4489             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4490                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4491
4492           /* Set the loop-latch arg for the reduction-phi.  */
4493           if (j > 0)
4494             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4495
4496           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4497                        UNKNOWN_LOCATION);
4498
4499           if (dump_enabled_p ())
4500             {
4501               dump_printf_loc (MSG_NOTE, vect_location,
4502                                "transform reduction: created def-use cycle: ");
4503               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4504               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4505             }
4506         }
4507     }
4508
4509   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4510      which is updated with the current index of the loop for every match of
4511      the original loop's cond_expr (VEC_STMT).  This results in a vector
4512      containing the last time the condition passed for that vector lane.
4513      The first match will be a 1 to allow 0 to be used for non-matching
4514      indexes.  If there are no matches at all then the vector will be all
4515      zeroes.  */
4516   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4517     {
4518       tree indx_before_incr, indx_after_incr;
4519       int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4520       int k;
4521
4522       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4523       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4524
4525       int scalar_precision
4526         = GET_MODE_PRECISION (TYPE_MODE (TREE_TYPE (vectype)));
4527       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4528       tree cr_index_vector_type = build_vector_type
4529         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4530
4531       /* First we create a simple vector induction variable which starts
4532          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4533          vector size (STEP).  */
4534
4535       /* Create a {1,2,3,...} vector.  */
4536       tree *vtemp = XALLOCAVEC (tree, nunits_out);
4537       for (k = 0; k < nunits_out; ++k)
4538         vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
4539       tree series_vect = build_vector (cr_index_vector_type, vtemp);
4540
4541       /* Create a vector of the step value.  */
4542       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4543       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4544
4545       /* Create an induction variable.  */
4546       gimple_stmt_iterator incr_gsi;
4547       bool insert_after;
4548       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4549       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4550                  insert_after, &indx_before_incr, &indx_after_incr);
4551
4552       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4553          filled with zeros (VEC_ZERO).  */
4554
4555       /* Create a vector of 0s.  */
4556       tree zero = build_zero_cst (cr_index_scalar_type);
4557       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4558
4559       /* Create a vector phi node.  */
4560       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4561       new_phi = create_phi_node (new_phi_tree, loop->header);
4562       set_vinfo_for_stmt (new_phi,
4563                           new_stmt_vec_info (new_phi, loop_vinfo));
4564       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4565                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4566
4567       /* Now take the condition from the loops original cond_expr
4568          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4569          every match uses values from the induction variable
4570          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4571          (NEW_PHI_TREE).
4572          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4573          the new cond_expr (INDEX_COND_EXPR).  */
4574
4575       /* Duplicate the condition from vec_stmt.  */
4576       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4577
4578       /* Create a conditional, where the condition is taken from vec_stmt
4579          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4580          else is the phi (NEW_PHI_TREE).  */
4581       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4582                                      ccompare, indx_before_incr,
4583                                      new_phi_tree);
4584       induction_index = make_ssa_name (cr_index_vector_type);
4585       gimple *index_condition = gimple_build_assign (induction_index,
4586                                                      index_cond_expr);
4587       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4588       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4589                                                         loop_vinfo);
4590       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4591       set_vinfo_for_stmt (index_condition, index_vec_info);
4592
4593       /* Update the phi with the vec cond.  */
4594       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4595                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4596     }
4597
4598   /* 2. Create epilog code.
4599         The reduction epilog code operates across the elements of the vector
4600         of partial results computed by the vectorized loop.
4601         The reduction epilog code consists of:
4602
4603         step 1: compute the scalar result in a vector (v_out2)
4604         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4605         step 3: adjust the scalar result (s_out3) if needed.
4606
4607         Step 1 can be accomplished using one of the following three schemes:
4608           (scheme 1) using reduc_code, if available.
4609           (scheme 2) using whole-vector shifts, if available.
4610           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4611                      combined.
4612
4613           The overall epilog code looks like this:
4614
4615           s_out0 = phi <s_loop>         # original EXIT_PHI
4616           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4617           v_out2 = reduce <v_out1>              # step 1
4618           s_out3 = extract_field <v_out2, 0>    # step 2
4619           s_out4 = adjust_result <s_out3>       # step 3
4620
4621           (step 3 is optional, and steps 1 and 2 may be combined).
4622           Lastly, the uses of s_out0 are replaced by s_out4.  */
4623
4624
4625   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4626          v_out1 = phi <VECT_DEF>
4627          Store them in NEW_PHIS.  */
4628
4629   exit_bb = single_exit (loop)->dest;
4630   prev_phi_info = NULL;
4631   new_phis.create (vect_defs.length ());
4632   FOR_EACH_VEC_ELT (vect_defs, i, def)
4633     {
4634       for (j = 0; j < ncopies; j++)
4635         {
4636           tree new_def = copy_ssa_name (def);
4637           phi = create_phi_node (new_def, exit_bb);
4638           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4639           if (j == 0)
4640             new_phis.quick_push (phi);
4641           else
4642             {
4643               def = vect_get_vec_def_for_stmt_copy (dt, def);
4644               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4645             }
4646
4647           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4648           prev_phi_info = vinfo_for_stmt (phi);
4649         }
4650     }
4651
4652   /* The epilogue is created for the outer-loop, i.e., for the loop being
4653      vectorized.  Create exit phis for the outer loop.  */
4654   if (double_reduc)
4655     {
4656       loop = outer_loop;
4657       exit_bb = single_exit (loop)->dest;
4658       inner_phis.create (vect_defs.length ());
4659       FOR_EACH_VEC_ELT (new_phis, i, phi)
4660         {
4661           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4662           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4663           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4664                            PHI_RESULT (phi));
4665           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4666                                                             loop_vinfo));
4667           inner_phis.quick_push (phi);
4668           new_phis[i] = outer_phi;
4669           prev_phi_info = vinfo_for_stmt (outer_phi);
4670           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4671             {
4672               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4673               new_result = copy_ssa_name (PHI_RESULT (phi));
4674               outer_phi = create_phi_node (new_result, exit_bb);
4675               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4676                                PHI_RESULT (phi));
4677               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4678                                                                 loop_vinfo));
4679               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4680               prev_phi_info = vinfo_for_stmt (outer_phi);
4681             }
4682         }
4683     }
4684
4685   exit_gsi = gsi_after_labels (exit_bb);
4686
4687   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4688          (i.e. when reduc_code is not available) and in the final adjustment
4689          code (if needed).  Also get the original scalar reduction variable as
4690          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4691          represents a reduction pattern), the tree-code and scalar-def are
4692          taken from the original stmt that the pattern-stmt (STMT) replaces.
4693          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4694          are taken from STMT.  */
4695
4696   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4697   if (!orig_stmt)
4698     {
4699       /* Regular reduction  */
4700       orig_stmt = stmt;
4701     }
4702   else
4703     {
4704       /* Reduction pattern  */
4705       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4706       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4707       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4708     }
4709
4710   code = gimple_assign_rhs_code (orig_stmt);
4711   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4712      partial results are added and not subtracted.  */
4713   if (code == MINUS_EXPR)
4714     code = PLUS_EXPR;
4715
4716   scalar_dest = gimple_assign_lhs (orig_stmt);
4717   scalar_type = TREE_TYPE (scalar_dest);
4718   scalar_results.create (group_size);
4719   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4720   bitsize = TYPE_SIZE (scalar_type);
4721
4722   /* In case this is a reduction in an inner-loop while vectorizing an outer
4723      loop - we don't need to extract a single scalar result at the end of the
4724      inner-loop (unless it is double reduction, i.e., the use of reduction is
4725      outside the outer-loop).  The final vector of partial results will be used
4726      in the vectorized outer-loop, or reduced to a scalar result at the end of
4727      the outer-loop.  */
4728   if (nested_in_vect_loop && !double_reduc)
4729     goto vect_finalize_reduction;
4730
4731   /* SLP reduction without reduction chain, e.g.,
4732      # a1 = phi <a2, a0>
4733      # b1 = phi <b2, b0>
4734      a2 = operation (a1)
4735      b2 = operation (b1)  */
4736   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4737
4738   /* In case of reduction chain, e.g.,
4739      # a1 = phi <a3, a0>
4740      a2 = operation (a1)
4741      a3 = operation (a2),
4742
4743      we may end up with more than one vector result.  Here we reduce them to
4744      one vector.  */
4745   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4746     {
4747       tree first_vect = PHI_RESULT (new_phis[0]);
4748       gassign *new_vec_stmt = NULL;
4749       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4750       for (k = 1; k < new_phis.length (); k++)
4751         {
4752           gimple *next_phi = new_phis[k];
4753           tree second_vect = PHI_RESULT (next_phi);
4754           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4755           new_vec_stmt = gimple_build_assign (tem, code,
4756                                               first_vect, second_vect);
4757           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4758           first_vect = tem;
4759         }
4760
4761       new_phi_result = first_vect;
4762       if (new_vec_stmt)
4763         {
4764           new_phis.truncate (0);
4765           new_phis.safe_push (new_vec_stmt);
4766         }
4767     }
4768   /* Likewise if we couldn't use a single defuse cycle.  */
4769   else if (ncopies > 1)
4770     {
4771       gcc_assert (new_phis.length () == 1);
4772       tree first_vect = PHI_RESULT (new_phis[0]);
4773       gassign *new_vec_stmt = NULL;
4774       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4775       gimple *next_phi = new_phis[0];
4776       for (int k = 1; k < ncopies; ++k)
4777         {
4778           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4779           tree second_vect = PHI_RESULT (next_phi);
4780           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4781           new_vec_stmt = gimple_build_assign (tem, code,
4782                                               first_vect, second_vect);
4783           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4784           first_vect = tem;
4785         }
4786       new_phi_result = first_vect;
4787       new_phis.truncate (0);
4788       new_phis.safe_push (new_vec_stmt);
4789     }
4790   else
4791     new_phi_result = PHI_RESULT (new_phis[0]);
4792
4793   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4794       && reduc_code != ERROR_MARK)
4795     {
4796       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4797          various data values where the condition matched and another vector
4798          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4799          need to extract the last matching index (which will be the index with
4800          highest value) and use this to index into the data vector.
4801          For the case where there were no matches, the data vector will contain
4802          all default values and the index vector will be all zeros.  */
4803
4804       /* Get various versions of the type of the vector of indexes.  */
4805       tree index_vec_type = TREE_TYPE (induction_index);
4806       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4807       tree index_scalar_type = TREE_TYPE (index_vec_type);
4808       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4809         (index_vec_type);
4810
4811       /* Get an unsigned integer version of the type of the data vector.  */
4812       int scalar_precision
4813         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4814       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4815       tree vectype_unsigned = build_vector_type
4816         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4817
4818       /* First we need to create a vector (ZERO_VEC) of zeros and another
4819          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4820          can create using a MAX reduction and then expanding.
4821          In the case where the loop never made any matches, the max index will
4822          be zero.  */
4823
4824       /* Vector of {0, 0, 0,...}.  */
4825       tree zero_vec = make_ssa_name (vectype);
4826       tree zero_vec_rhs = build_zero_cst (vectype);
4827       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4828       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4829
4830       /* Find maximum value from the vector of found indexes.  */
4831       tree max_index = make_ssa_name (index_scalar_type);
4832       gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4833                                                     induction_index);
4834       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4835
4836       /* Vector of {max_index, max_index, max_index,...}.  */
4837       tree max_index_vec = make_ssa_name (index_vec_type);
4838       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4839                                                       max_index);
4840       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4841                                                         max_index_vec_rhs);
4842       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4843
4844       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4845          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4846          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4847          otherwise.  Only one value should match, resulting in a vector
4848          (VEC_COND) with one data value and the rest zeros.
4849          In the case where the loop never made any matches, every index will
4850          match, resulting in a vector with all data values (which will all be
4851          the default value).  */
4852
4853       /* Compare the max index vector to the vector of found indexes to find
4854          the position of the max value.  */
4855       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4856       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4857                                                       induction_index,
4858                                                       max_index_vec);
4859       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4860
4861       /* Use the compare to choose either values from the data vector or
4862          zero.  */
4863       tree vec_cond = make_ssa_name (vectype);
4864       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4865                                                    vec_compare, new_phi_result,
4866                                                    zero_vec);
4867       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4868
4869       /* Finally we need to extract the data value from the vector (VEC_COND)
4870          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4871          reduction, but because this doesn't exist, we can use a MAX reduction
4872          instead.  The data value might be signed or a float so we need to cast
4873          it first.
4874          In the case where the loop never made any matches, the data values are
4875          all identical, and so will reduce down correctly.  */
4876
4877       /* Make the matched data values unsigned.  */
4878       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4879       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4880                                        vec_cond);
4881       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4882                                                         VIEW_CONVERT_EXPR,
4883                                                         vec_cond_cast_rhs);
4884       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4885
4886       /* Reduce down to a scalar value.  */
4887       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4888       optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4889                                       optab_default);
4890       gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4891                   != CODE_FOR_nothing);
4892       gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4893                                                      REDUC_MAX_EXPR,
4894                                                      vec_cond_cast);
4895       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4896
4897       /* Convert the reduced value back to the result type and set as the
4898          result.  */
4899       gimple_seq stmts = NULL;
4900       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4901                                data_reduc);
4902       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4903       scalar_results.safe_push (new_temp);
4904     }
4905   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4906            && reduc_code == ERROR_MARK)
4907     {
4908       /* Condition reduction without supported REDUC_MAX_EXPR.  Generate
4909          idx = 0;
4910          idx_val = induction_index[0];
4911          val = data_reduc[0];
4912          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4913            if (induction_index[i] > idx_val)
4914              val = data_reduc[i], idx_val = induction_index[i];
4915          return val;  */
4916
4917       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4918       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4919       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4920       unsigned HOST_WIDE_INT v_size
4921         = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4922       tree idx_val = NULL_TREE, val = NULL_TREE;
4923       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4924         {
4925           tree old_idx_val = idx_val;
4926           tree old_val = val;
4927           idx_val = make_ssa_name (idx_eltype);
4928           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4929                                              build3 (BIT_FIELD_REF, idx_eltype,
4930                                                      induction_index,
4931                                                      bitsize_int (el_size),
4932                                                      bitsize_int (off)));
4933           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4934           val = make_ssa_name (data_eltype);
4935           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4936                                              build3 (BIT_FIELD_REF,
4937                                                      data_eltype,
4938                                                      new_phi_result,
4939                                                      bitsize_int (el_size),
4940                                                      bitsize_int (off)));
4941           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4942           if (off != 0)
4943             {
4944               tree new_idx_val = idx_val;
4945               tree new_val = val;
4946               if (off != v_size - el_size)
4947                 {
4948                   new_idx_val = make_ssa_name (idx_eltype);
4949                   epilog_stmt = gimple_build_assign (new_idx_val,
4950                                                      MAX_EXPR, idx_val,
4951                                                      old_idx_val);
4952                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4953                 }
4954               new_val = make_ssa_name (data_eltype);
4955               epilog_stmt = gimple_build_assign (new_val,
4956                                                  COND_EXPR,
4957                                                  build2 (GT_EXPR,
4958                                                          boolean_type_node,
4959                                                          idx_val,
4960                                                          old_idx_val),
4961                                                  val, old_val);
4962               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4963               idx_val = new_idx_val;
4964               val = new_val;
4965             }
4966         }
4967       /* Convert the reduced value back to the result type and set as the
4968          result.  */
4969       gimple_seq stmts = NULL;
4970       val = gimple_convert (&stmts, scalar_type, val);
4971       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4972       scalar_results.safe_push (val);
4973     }
4974
4975   /* 2.3 Create the reduction code, using one of the three schemes described
4976          above. In SLP we simply need to extract all the elements from the
4977          vector (without reducing them), so we use scalar shifts.  */
4978   else if (reduc_code != ERROR_MARK && !slp_reduc)
4979     {
4980       tree tmp;
4981       tree vec_elem_type;
4982
4983       /* Case 1:  Create:
4984          v_out2 = reduc_expr <v_out1>  */
4985
4986       if (dump_enabled_p ())
4987         dump_printf_loc (MSG_NOTE, vect_location,
4988                          "Reduce using direct vector reduction.\n");
4989
4990       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4991       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4992         {
4993           tree tmp_dest =
4994               vect_create_destination_var (scalar_dest, vec_elem_type);
4995           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4996           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4997           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4998           gimple_assign_set_lhs (epilog_stmt, new_temp);
4999           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5000
5001           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
5002         }
5003       else
5004         tmp = build1 (reduc_code, scalar_type, new_phi_result);
5005
5006       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
5007       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5008       gimple_assign_set_lhs (epilog_stmt, new_temp);
5009       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5010
5011       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5012           == INTEGER_INDUC_COND_REDUCTION)
5013         {
5014           /* Earlier we set the initial value to be zero.  Check the result
5015              and if it is zero then replace with the original initial
5016              value.  */
5017           tree zero = build_zero_cst (scalar_type);
5018           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
5019
5020           tmp = make_ssa_name (new_scalar_dest);
5021           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5022                                              initial_def, new_temp);
5023           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5024           new_temp = tmp;
5025         }
5026
5027       scalar_results.safe_push (new_temp);
5028     }
5029   else
5030     {
5031       bool reduce_with_shift = have_whole_vector_shift (mode);
5032       int element_bitsize = tree_to_uhwi (bitsize);
5033       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5034       tree vec_temp;
5035
5036       /* COND reductions all do the final reduction with MAX_EXPR.  */
5037       if (code == COND_EXPR)
5038         code = MAX_EXPR;
5039
5040       /* Regardless of whether we have a whole vector shift, if we're
5041          emulating the operation via tree-vect-generic, we don't want
5042          to use it.  Only the first round of the reduction is likely
5043          to still be profitable via emulation.  */
5044       /* ??? It might be better to emit a reduction tree code here, so that
5045          tree-vect-generic can expand the first round via bit tricks.  */
5046       if (!VECTOR_MODE_P (mode))
5047         reduce_with_shift = false;
5048       else
5049         {
5050           optab optab = optab_for_tree_code (code, vectype, optab_default);
5051           if (optab_handler (optab, mode) == CODE_FOR_nothing)
5052             reduce_with_shift = false;
5053         }
5054
5055       if (reduce_with_shift && !slp_reduc)
5056         {
5057           int nelements = vec_size_in_bits / element_bitsize;
5058           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
5059
5060           int elt_offset;
5061
5062           tree zero_vec = build_zero_cst (vectype);
5063           /* Case 2: Create:
5064              for (offset = nelements/2; offset >= 1; offset/=2)
5065                 {
5066                   Create:  va' = vec_shift <va, offset>
5067                   Create:  va = vop <va, va'>
5068                 }  */
5069
5070           tree rhs;
5071
5072           if (dump_enabled_p ())
5073             dump_printf_loc (MSG_NOTE, vect_location,
5074                              "Reduce using vector shifts\n");
5075
5076           vec_dest = vect_create_destination_var (scalar_dest, vectype);
5077           new_temp = new_phi_result;
5078           for (elt_offset = nelements / 2;
5079                elt_offset >= 1;
5080                elt_offset /= 2)
5081             {
5082               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
5083               tree mask = vect_gen_perm_mask_any (vectype, sel);
5084               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5085                                                  new_temp, zero_vec, mask);
5086               new_name = make_ssa_name (vec_dest, epilog_stmt);
5087               gimple_assign_set_lhs (epilog_stmt, new_name);
5088               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5089
5090               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5091                                                  new_temp);
5092               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5093               gimple_assign_set_lhs (epilog_stmt, new_temp);
5094               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5095             }
5096
5097           /* 2.4  Extract the final scalar result.  Create:
5098              s_out3 = extract_field <v_out2, bitpos>  */
5099
5100           if (dump_enabled_p ())
5101             dump_printf_loc (MSG_NOTE, vect_location,
5102                              "extract scalar result\n");
5103
5104           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5105                         bitsize, bitsize_zero_node);
5106           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5107           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5108           gimple_assign_set_lhs (epilog_stmt, new_temp);
5109           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5110           scalar_results.safe_push (new_temp);
5111         }
5112       else
5113         {
5114           /* Case 3: Create:
5115              s = extract_field <v_out2, 0>
5116              for (offset = element_size;
5117                   offset < vector_size;
5118                   offset += element_size;)
5119                {
5120                  Create:  s' = extract_field <v_out2, offset>
5121                  Create:  s = op <s, s'>  // For non SLP cases
5122                }  */
5123
5124           if (dump_enabled_p ())
5125             dump_printf_loc (MSG_NOTE, vect_location,
5126                              "Reduce using scalar code.\n");
5127
5128           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5129           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5130             {
5131               int bit_offset;
5132               if (gimple_code (new_phi) == GIMPLE_PHI)
5133                 vec_temp = PHI_RESULT (new_phi);
5134               else
5135                 vec_temp = gimple_assign_lhs (new_phi);
5136               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5137                             bitsize_zero_node);
5138               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5139               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5140               gimple_assign_set_lhs (epilog_stmt, new_temp);
5141               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5142
5143               /* In SLP we don't need to apply reduction operation, so we just
5144                  collect s' values in SCALAR_RESULTS.  */
5145               if (slp_reduc)
5146                 scalar_results.safe_push (new_temp);
5147
5148               for (bit_offset = element_bitsize;
5149                    bit_offset < vec_size_in_bits;
5150                    bit_offset += element_bitsize)
5151                 {
5152                   tree bitpos = bitsize_int (bit_offset);
5153                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5154                                      bitsize, bitpos);
5155
5156                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5157                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5158                   gimple_assign_set_lhs (epilog_stmt, new_name);
5159                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5160
5161                   if (slp_reduc)
5162                     {
5163                       /* In SLP we don't need to apply reduction operation, so
5164                          we just collect s' values in SCALAR_RESULTS.  */
5165                       new_temp = new_name;
5166                       scalar_results.safe_push (new_name);
5167                     }
5168                   else
5169                     {
5170                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5171                                                          new_name, new_temp);
5172                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5173                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5174                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5175                     }
5176                 }
5177             }
5178
5179           /* The only case where we need to reduce scalar results in SLP, is
5180              unrolling.  If the size of SCALAR_RESULTS is greater than
5181              GROUP_SIZE, we reduce them combining elements modulo
5182              GROUP_SIZE.  */
5183           if (slp_reduc)
5184             {
5185               tree res, first_res, new_res;
5186               gimple *new_stmt;
5187
5188               /* Reduce multiple scalar results in case of SLP unrolling.  */
5189               for (j = group_size; scalar_results.iterate (j, &res);
5190                    j++)
5191                 {
5192                   first_res = scalar_results[j % group_size];
5193                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5194                                                   first_res, res);
5195                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5196                   gimple_assign_set_lhs (new_stmt, new_res);
5197                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5198                   scalar_results[j % group_size] = new_res;
5199                 }
5200             }
5201           else
5202             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5203             scalar_results.safe_push (new_temp);
5204         }
5205
5206       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5207           == INTEGER_INDUC_COND_REDUCTION)
5208         {
5209           /* Earlier we set the initial value to be zero.  Check the result
5210              and if it is zero then replace with the original initial
5211              value.  */
5212           tree zero = build_zero_cst (scalar_type);
5213           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
5214
5215           tree tmp = make_ssa_name (new_scalar_dest);
5216           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5217                                              initial_def, new_temp);
5218           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5219           scalar_results[0] = tmp;
5220         }
5221     }
5222
5223 vect_finalize_reduction:
5224
5225   if (double_reduc)
5226     loop = loop->inner;
5227
5228   /* 2.5 Adjust the final result by the initial value of the reduction
5229          variable. (When such adjustment is not needed, then
5230          'adjustment_def' is zero).  For example, if code is PLUS we create:
5231          new_temp = loop_exit_def + adjustment_def  */
5232
5233   if (adjustment_def)
5234     {
5235       gcc_assert (!slp_reduc);
5236       if (nested_in_vect_loop)
5237         {
5238           new_phi = new_phis[0];
5239           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5240           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5241           new_dest = vect_create_destination_var (scalar_dest, vectype);
5242         }
5243       else
5244         {
5245           new_temp = scalar_results[0];
5246           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5247           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5248           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5249         }
5250
5251       epilog_stmt = gimple_build_assign (new_dest, expr);
5252       new_temp = make_ssa_name (new_dest, epilog_stmt);
5253       gimple_assign_set_lhs (epilog_stmt, new_temp);
5254       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5255       if (nested_in_vect_loop)
5256         {
5257           set_vinfo_for_stmt (epilog_stmt,
5258                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5259           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5260                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5261
5262           if (!double_reduc)
5263             scalar_results.quick_push (new_temp);
5264           else
5265             scalar_results[0] = new_temp;
5266         }
5267       else
5268         scalar_results[0] = new_temp;
5269
5270       new_phis[0] = epilog_stmt;
5271     }
5272
5273   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5274           phis with new adjusted scalar results, i.e., replace use <s_out0>
5275           with use <s_out4>.
5276
5277      Transform:
5278         loop_exit:
5279           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5280           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5281           v_out2 = reduce <v_out1>
5282           s_out3 = extract_field <v_out2, 0>
5283           s_out4 = adjust_result <s_out3>
5284           use <s_out0>
5285           use <s_out0>
5286
5287      into:
5288
5289         loop_exit:
5290           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5291           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5292           v_out2 = reduce <v_out1>
5293           s_out3 = extract_field <v_out2, 0>
5294           s_out4 = adjust_result <s_out3>
5295           use <s_out4>
5296           use <s_out4> */
5297
5298
5299   /* In SLP reduction chain we reduce vector results into one vector if
5300      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5301      the last stmt in the reduction chain, since we are looking for the loop
5302      exit phi node.  */
5303   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5304     {
5305       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5306       /* Handle reduction patterns.  */
5307       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5308         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5309
5310       scalar_dest = gimple_assign_lhs (dest_stmt);
5311       group_size = 1;
5312     }
5313
5314   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5315      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5316      need to match SCALAR_RESULTS with corresponding statements.  The first
5317      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5318      the first vector stmt, etc.
5319      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5320   if (group_size > new_phis.length ())
5321     {
5322       ratio = group_size / new_phis.length ();
5323       gcc_assert (!(group_size % new_phis.length ()));
5324     }
5325   else
5326     ratio = 1;
5327
5328   for (k = 0; k < group_size; k++)
5329     {
5330       if (k % ratio == 0)
5331         {
5332           epilog_stmt = new_phis[k / ratio];
5333           reduction_phi = reduction_phis[k / ratio];
5334           if (double_reduc)
5335             inner_phi = inner_phis[k / ratio];
5336         }
5337
5338       if (slp_reduc)
5339         {
5340           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5341
5342           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5343           /* SLP statements can't participate in patterns.  */
5344           gcc_assert (!orig_stmt);
5345           scalar_dest = gimple_assign_lhs (current_stmt);
5346         }
5347
5348       phis.create (3);
5349       /* Find the loop-closed-use at the loop exit of the original scalar
5350          result.  (The reduction result is expected to have two immediate uses -
5351          one at the latch block, and one at the loop exit).  */
5352       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5353         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5354             && !is_gimple_debug (USE_STMT (use_p)))
5355           phis.safe_push (USE_STMT (use_p));
5356
5357       /* While we expect to have found an exit_phi because of loop-closed-ssa
5358          form we can end up without one if the scalar cycle is dead.  */
5359
5360       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5361         {
5362           if (outer_loop)
5363             {
5364               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5365               gphi *vect_phi;
5366
5367               /* FORNOW. Currently not supporting the case that an inner-loop
5368                  reduction is not used in the outer-loop (but only outside the
5369                  outer-loop), unless it is double reduction.  */
5370               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5371                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5372                           || double_reduc);
5373
5374               if (double_reduc)
5375                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5376               else
5377                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5378               if (!double_reduc
5379                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5380                       != vect_double_reduction_def)
5381                 continue;
5382
5383               /* Handle double reduction:
5384
5385                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5386                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5387                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5388                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5389
5390                  At that point the regular reduction (stmt2 and stmt3) is
5391                  already vectorized, as well as the exit phi node, stmt4.
5392                  Here we vectorize the phi node of double reduction, stmt1, and
5393                  update all relevant statements.  */
5394
5395               /* Go through all the uses of s2 to find double reduction phi
5396                  node, i.e., stmt1 above.  */
5397               orig_name = PHI_RESULT (exit_phi);
5398               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5399                 {
5400                   stmt_vec_info use_stmt_vinfo;
5401                   stmt_vec_info new_phi_vinfo;
5402                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5403                   basic_block bb = gimple_bb (use_stmt);
5404                   gimple *use;
5405
5406                   /* Check that USE_STMT is really double reduction phi
5407                      node.  */
5408                   if (gimple_code (use_stmt) != GIMPLE_PHI
5409                       || gimple_phi_num_args (use_stmt) != 2
5410                       || bb->loop_father != outer_loop)
5411                     continue;
5412                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5413                   if (!use_stmt_vinfo
5414                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5415                           != vect_double_reduction_def)
5416                     continue;
5417
5418                   /* Create vector phi node for double reduction:
5419                      vs1 = phi <vs0, vs2>
5420                      vs1 was created previously in this function by a call to
5421                        vect_get_vec_def_for_operand and is stored in
5422                        vec_initial_def;
5423                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5424                      vs0 is created here.  */
5425
5426                   /* Create vector phi node.  */
5427                   vect_phi = create_phi_node (vec_initial_def, bb);
5428                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5429                                     loop_vec_info_for_loop (outer_loop));
5430                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5431
5432                   /* Create vs0 - initial def of the double reduction phi.  */
5433                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5434                                              loop_preheader_edge (outer_loop));
5435                   init_def = get_initial_def_for_reduction (stmt,
5436                                                           preheader_arg, NULL);
5437                   vect_phi_init = vect_init_vector (use_stmt, init_def,
5438                                                     vectype, NULL);
5439
5440                   /* Update phi node arguments with vs0 and vs2.  */
5441                   add_phi_arg (vect_phi, vect_phi_init,
5442                                loop_preheader_edge (outer_loop),
5443                                UNKNOWN_LOCATION);
5444                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5445                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5446                   if (dump_enabled_p ())
5447                     {
5448                       dump_printf_loc (MSG_NOTE, vect_location,
5449                                        "created double reduction phi node: ");
5450                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5451                     }
5452
5453                   vect_phi_res = PHI_RESULT (vect_phi);
5454
5455                   /* Replace the use, i.e., set the correct vs1 in the regular
5456                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5457                      loop is redundant.  */
5458                   use = reduction_phi;
5459                   for (j = 0; j < ncopies; j++)
5460                     {
5461                       edge pr_edge = loop_preheader_edge (loop);
5462                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5463                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5464                     }
5465                 }
5466             }
5467         }
5468
5469       phis.release ();
5470       if (nested_in_vect_loop)
5471         {
5472           if (double_reduc)
5473             loop = outer_loop;
5474           else
5475             continue;
5476         }
5477
5478       phis.create (3);
5479       /* Find the loop-closed-use at the loop exit of the original scalar
5480          result.  (The reduction result is expected to have two immediate uses,
5481          one at the latch block, and one at the loop exit).  For double
5482          reductions we are looking for exit phis of the outer loop.  */
5483       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5484         {
5485           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5486             {
5487               if (!is_gimple_debug (USE_STMT (use_p)))
5488                 phis.safe_push (USE_STMT (use_p));
5489             }
5490           else
5491             {
5492               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5493                 {
5494                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5495
5496                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5497                     {
5498                       if (!flow_bb_inside_loop_p (loop,
5499                                              gimple_bb (USE_STMT (phi_use_p)))
5500                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5501                         phis.safe_push (USE_STMT (phi_use_p));
5502                     }
5503                 }
5504             }
5505         }
5506
5507       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5508         {
5509           /* Replace the uses:  */
5510           orig_name = PHI_RESULT (exit_phi);
5511           scalar_result = scalar_results[k];
5512           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5513             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5514               SET_USE (use_p, scalar_result);
5515         }
5516
5517       phis.release ();
5518     }
5519 }
5520
5521
5522 /* Function is_nonwrapping_integer_induction.
5523
5524    Check if STMT (which is part of loop LOOP) both increments and
5525    does not cause overflow.  STMT must be a PHI.  Return true only if its
5526    evolution base and step are INTEGER_CSTs, the step is not negative, and
5527    either overflow is undefined for the PHI result type or BASE + STEP * NI
5528    (NI coming from max_stmt_executions) fits the precision of that type.  */
5529
5530 static bool
5531 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5532 {
5533   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5534   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5535   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5536   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5537   widest_int ni, max_loop_value;
5538   bool overflow = false;
5539
5540   /* Make sure the loop is integer based.  */
5541   if (TREE_CODE (base) != INTEGER_CST || TREE_CODE (step) != INTEGER_CST)
5542     return false;
5543
5544   /* Check that the induction increments.  */
5545   if (tree_int_cst_sgn (step) == -1)
5546     return false;
5547
5548   /* Check that the max size of the loop will not wrap.  Nothing to prove
5549      when overflow is undefined for the type.  */
5550   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5551     return true;
5552   if (! max_stmt_executions (loop, &ni))
5553     return false;
5554
5555   /* Compute BASE + STEP * NI in widest_int, giving up on overflow.  */
5556   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5557                             &overflow);
5558   if (overflow)
5559     return false;
5560   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5561                             TYPE_SIGN (lhs_type), &overflow);
5562   if (overflow)
5563     return false;
5564   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5565           <= TYPE_PRECISION (lhs_type));
5566 }
5567
5568 /* Function vectorizable_reduction.
5569
5570    Check if STMT performs a reduction operation that can be vectorized.
5571    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5572    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5573    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5574
5575    This function also handles reduction idioms (patterns) that have been
5576    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5577    of this form:
5578      X = pattern_expr (arg0, arg1, ..., X)
5579    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5580    sequence that had been detected and replaced by the pattern-stmt (STMT).
5581
5582    This function also handles reduction of condition expressions, for example:
5583      for (int i = 0; i < N; i++)
5584        if (a[i] < value)
5585          last = a[i];
5586    This is handled by vectorising the loop and creating an additional vector
5587    containing the loop indexes for which "a[i] < value" was true.  In the
5588    function epilogue this is reduced to a single max value and then used to
5589    index into the vector of results.
5590
5591    In some cases of reduction patterns, the type of the reduction variable X is
5592    different than the type of the other arguments of STMT.
5593    In such cases, the vectype that is used when transforming STMT into a vector
5594    stmt is different than the vectype that is used to determine the
5595    vectorization factor, because it consists of a different number of elements
5596    than the actual number of elements that are being operated upon in parallel.
5597
5598    For example, consider an accumulation of shorts into an int accumulator.
5599    On some targets it's possible to vectorize this pattern operating on 8
5600    shorts at a time (hence, the vectype for purposes of determining the
5601    vectorization factor should be V8HI); on the other hand, the vectype that
5602    is used to create the vector form is actually V4SI (the type of the result).
5603
5604    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5605    indicates what is the actual level of parallelism (V8HI in the example), so
5606    that the right vectorization factor would be derived.  This vectype
5607    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5608    be used to create the vectorized stmt.  The right vectype for the vectorized
5609    stmt is obtained from the type of the result X:
5610         get_vectype_for_scalar_type (TREE_TYPE (X))
5611
5612    This means that, contrary to "regular" reductions (or "regular" stmts in
5613    general), the following equation:
5614       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5615    does *NOT* necessarily hold for reduction patterns.  */
5616
5617 bool
5618 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5619                         gimple **vec_stmt, slp_tree slp_node,
5620                         slp_instance slp_node_instance)
5621 {
5622   tree vec_dest;
5623   tree scalar_dest;
5624   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5625   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5626   tree vectype_in = NULL_TREE;
5627   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5628   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5629   enum tree_code code, orig_code, epilog_reduc_code;
5630   machine_mode vec_mode;
5631   int op_type;
5632   optab optab, reduc_optab;
5633   tree new_temp = NULL_TREE;
5634   gimple *def_stmt;
5635   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5636   tree scalar_type;
5637   bool is_simple_use;
5638   gimple *orig_stmt;
5639   stmt_vec_info orig_stmt_info = NULL;
5640   int i;
5641   int ncopies;
5642   int epilog_copies;
5643   stmt_vec_info prev_stmt_info, prev_phi_info;
5644   bool single_defuse_cycle = false;
5645   gimple *new_stmt = NULL;
5646   int j;
5647   tree ops[3];
5648   enum vect_def_type dts[3];
5649   bool nested_cycle = false, found_nested_cycle_def = false;
5650   bool double_reduc = false;
5651   basic_block def_bb;
5652   struct loop * def_stmt_loop, *outer_loop = NULL;
5653   tree def_arg;
5654   gimple *def_arg_stmt;
5655   auto_vec<tree> vec_oprnds0;
5656   auto_vec<tree> vec_oprnds1;
5657   auto_vec<tree> vec_oprnds2;
5658   auto_vec<tree> vect_defs;
5659   auto_vec<gimple *> phis;
5660   int vec_num;
5661   tree def0, tem;
5662   bool first_p = true;
5663   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5664   tree cond_reduc_val = NULL_TREE;
5665
5666   /* Make sure it was already recognized as a reduction computation.  */
5667   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5668       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5669     return false;
5670
5671   if (nested_in_vect_loop_p (loop, stmt))
5672     {
5673       outer_loop = loop;
5674       loop = loop->inner;
5675       nested_cycle = true;
5676     }
5677
5678   /* In case of reduction chain we switch to the first stmt in the chain, but
5679      we don't update STMT_INFO, since only the last stmt is marked as reduction
5680      and has reduction properties.  */
5681   if (GROUP_FIRST_ELEMENT (stmt_info)
5682       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5683     {
5684       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5685       first_p = false;
5686     }
5687
5688   if (gimple_code (stmt) == GIMPLE_PHI)
5689     {
5690       /* Analysis is fully done on the reduction stmt invocation.  */
5691       if (! vec_stmt)
5692         {
5693           if (slp_node)
5694             slp_node_instance->reduc_phis = slp_node;
5695
5696           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5697           return true;
5698         }
5699
5700       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5701       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5702         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5703
5704       gcc_assert (is_gimple_assign (reduc_stmt));
5705       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5706         {
5707           tree op = gimple_op (reduc_stmt, k);
5708           if (op == gimple_phi_result (stmt))
5709             continue;
5710           if (k == 1
5711               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5712             continue;
5713           tem = get_vectype_for_scalar_type (TREE_TYPE (op));
5714           if (! vectype_in
5715               || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
5716             vectype_in = tem;
5717           break;
5718         }
5719       gcc_assert (vectype_in);
5720
5721       if (slp_node)
5722         ncopies = 1;
5723       else
5724         ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5725                    / TYPE_VECTOR_SUBPARTS (vectype_in));
5726
5727       use_operand_p use_p;
5728       gimple *use_stmt;
5729       if (ncopies > 1
5730           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5731               <= vect_used_only_live)
5732           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5733           && (use_stmt == reduc_stmt
5734               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5735                   == reduc_stmt)))
5736         single_defuse_cycle = true;
5737
5738       /* Create the destination vector  */
5739       scalar_dest = gimple_assign_lhs (reduc_stmt);
5740       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5741
5742       if (slp_node)
5743         /* The size vect_schedule_slp_instance computes is off for us.  */
5744         vec_num = ((LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5745                     * SLP_TREE_SCALAR_STMTS (slp_node).length ())
5746                    / TYPE_VECTOR_SUBPARTS (vectype_in));
5747       else
5748         vec_num = 1;
5749
5750       /* Generate the reduction PHIs upfront.  */
5751       prev_phi_info = NULL;
5752       for (j = 0; j < ncopies; j++)
5753         {
5754           if (j == 0 || !single_defuse_cycle)
5755             {
5756               for (i = 0; i < vec_num; i++)
5757                 {
5758                   /* Create the reduction-phi that defines the reduction
5759                      operand.  */
5760                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
5761                   set_vinfo_for_stmt (new_phi,
5762                                       new_stmt_vec_info (new_phi, loop_vinfo));
5763
5764                   if (slp_node)
5765                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5766                   else
5767                     {
5768                       if (j == 0)
5769                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5770                       else
5771                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5772                       prev_phi_info = vinfo_for_stmt (new_phi);
5773                     }
5774                 }
5775             }
5776         }
5777
5778       return true;
5779     }
5780
5781   /* 1. Is vectorizable reduction?  */
5782   /* Not supportable if the reduction variable is used in the loop, unless
5783      it's a reduction chain.  */
5784   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5785       && !GROUP_FIRST_ELEMENT (stmt_info))
5786     return false;
5787
5788   /* Reductions that are not used even in an enclosing outer-loop,
5789      are expected to be "live" (used out of the loop).  */
5790   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5791       && !STMT_VINFO_LIVE_P (stmt_info))
5792     return false;
5793
5794   /* 2. Has this been recognized as a reduction pattern?
5795
5796      Check if STMT represents a pattern that has been recognized
5797      in earlier analysis stages.  For stmts that represent a pattern,
5798      the STMT_VINFO_RELATED_STMT field records the last stmt in
5799      the original sequence that constitutes the pattern.  */
5800
5801   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5802   if (orig_stmt)
5803     {
5804       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5805       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5806       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5807     }
5808
5809   /* 3. Check the operands of the operation.  The first operands are defined
5810         inside the loop body. The last operand is the reduction variable,
5811         which is defined by the loop-header-phi.  */
5812
5813   gcc_assert (is_gimple_assign (stmt));
5814
5815   /* Flatten RHS.  */
5816   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5817     {
5818     case GIMPLE_BINARY_RHS:
5819       code = gimple_assign_rhs_code (stmt);
5820       op_type = TREE_CODE_LENGTH (code);
5821       gcc_assert (op_type == binary_op);
5822       ops[0] = gimple_assign_rhs1 (stmt);
5823       ops[1] = gimple_assign_rhs2 (stmt);
5824       break;
5825
5826     case GIMPLE_TERNARY_RHS:
5827       code = gimple_assign_rhs_code (stmt);
5828       op_type = TREE_CODE_LENGTH (code);
5829       gcc_assert (op_type == ternary_op);
5830       ops[0] = gimple_assign_rhs1 (stmt);
5831       ops[1] = gimple_assign_rhs2 (stmt);
5832       ops[2] = gimple_assign_rhs3 (stmt);
5833       break;
5834
5835     case GIMPLE_UNARY_RHS:
5836       return false;
5837
5838     default:
5839       gcc_unreachable ();
5840     }
5841
5842   if (code == COND_EXPR && slp_node)
5843     return false;
5844
5845   scalar_dest = gimple_assign_lhs (stmt);
5846   scalar_type = TREE_TYPE (scalar_dest);
5847   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5848       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5849     return false;
5850
5851   /* Do not try to vectorize bit-precision reductions.  */
5852   if (!type_has_mode_precision_p (scalar_type))
5853     return false;
5854
5855   /* All uses but the last are expected to be defined in the loop.
5856      The last use is the reduction variable.  In case of nested cycle this
5857      assumption is not true: we use reduc_index to record the index of the
5858      reduction variable.  */
5859   gimple *reduc_def_stmt = NULL;
5860   int reduc_index = -1;
5861   for (i = 0; i < op_type; i++)
5862     {
5863       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5864       if (i == 0 && code == COND_EXPR)
5865         continue;
5866
5867       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5868                                           &def_stmt, &dts[i], &tem);
5869       dt = dts[i];
5870       gcc_assert (is_simple_use);
5871       if (dt == vect_reduction_def)
5872         {
5873           reduc_def_stmt = def_stmt;
5874           reduc_index = i;
5875           continue;
5876         }
5877       else
5878         {
5879           if (!vectype_in)
5880             vectype_in = tem;
5881         }
5882
5883       if (dt != vect_internal_def
5884           && dt != vect_external_def
5885           && dt != vect_constant_def
5886           && dt != vect_induction_def
5887           && !(dt == vect_nested_cycle && nested_cycle))
5888         return false;
5889
5890       if (dt == vect_nested_cycle)
5891         {
5892           found_nested_cycle_def = true;
5893           reduc_def_stmt = def_stmt;
5894           reduc_index = i;
5895         }
5896
5897       if (i == 1 && code == COND_EXPR)
5898         {
5899           /* Record how value of COND_EXPR is defined.  */
5900           if (dt == vect_constant_def)
5901             {
5902               cond_reduc_dt = dt;
5903               cond_reduc_val = ops[i];
5904             }
5905           if (dt == vect_induction_def && def_stmt != NULL
5906               && is_nonwrapping_integer_induction (def_stmt, loop))
5907             cond_reduc_dt = dt;
5908         }
5909     }
5910
5911   if (!vectype_in)
5912     vectype_in = vectype_out;
5913
5914   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5915      directy used in stmt.  */
5916   if (reduc_index == -1)
5917     {
5918       if (orig_stmt)
5919         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5920       else
5921         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5922     }
5923
5924   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5925     return false;
5926
5927   if (!(reduc_index == -1
5928         || dts[reduc_index] == vect_reduction_def
5929         || dts[reduc_index] == vect_nested_cycle
5930         || ((dts[reduc_index] == vect_internal_def
5931              || dts[reduc_index] == vect_external_def
5932              || dts[reduc_index] == vect_constant_def
5933              || dts[reduc_index] == vect_induction_def)
5934             && nested_cycle && found_nested_cycle_def)))
5935     {
5936       /* For pattern recognized stmts, orig_stmt might be a reduction,
5937          but some helper statements for the pattern might not, or
5938          might be COND_EXPRs with reduction uses in the condition.  */
5939       gcc_assert (orig_stmt);
5940       return false;
5941     }
5942
5943   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5944   enum vect_reduction_type v_reduc_type
5945     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5946   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5947
5948   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5949   /* If we have a condition reduction, see if we can simplify it further.  */
5950   if (v_reduc_type == COND_REDUCTION)
5951     {
5952       if (cond_reduc_dt == vect_induction_def)
5953         {
5954           if (dump_enabled_p ())
5955             dump_printf_loc (MSG_NOTE, vect_location,
5956                              "condition expression based on "
5957                              "integer induction.\n");
5958           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5959             = INTEGER_INDUC_COND_REDUCTION;
5960         }
5961
5962       /* Loop peeling modifies initial value of reduction PHI, which
5963          makes the reduction stmt to be transformed different to the
5964          original stmt analyzed.  We need to record reduction code for
5965          CONST_COND_REDUCTION type reduction at analyzing stage, thus
5966          it can be used directly at transform stage.  */
5967       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
5968           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
5969         {
5970           /* Also set the reduction type to CONST_COND_REDUCTION.  */
5971           gcc_assert (cond_reduc_dt == vect_constant_def);
5972           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
5973         }
5974       else if (cond_reduc_dt == vect_constant_def)
5975         {
5976           enum vect_def_type cond_initial_dt;
5977           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5978           tree cond_initial_val
5979             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
5980
5981           gcc_assert (cond_reduc_val != NULL_TREE);
5982           vect_is_simple_use (cond_initial_val, loop_vinfo,
5983                               &def_stmt, &cond_initial_dt);
5984           if (cond_initial_dt == vect_constant_def
5985               && types_compatible_p (TREE_TYPE (cond_initial_val),
5986                                      TREE_TYPE (cond_reduc_val)))
5987             {
5988               tree e = fold_binary (LE_EXPR, boolean_type_node,
5989                                     cond_initial_val, cond_reduc_val);
5990               if (e && (integer_onep (e) || integer_zerop (e)))
5991                 {
5992                   if (dump_enabled_p ())
5993                     dump_printf_loc (MSG_NOTE, vect_location,
5994                                      "condition expression based on "
5995                                      "compile time constant.\n");
5996                   /* Record reduction code at analysis stage.  */
5997                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
5998                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
5999                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6000                     = CONST_COND_REDUCTION;
6001                 }
6002             }
6003         }
6004     }
6005
6006   if (orig_stmt)
6007     gcc_assert (tmp == orig_stmt
6008                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6009   else
6010     /* We changed STMT to be the first stmt in reduction chain, hence we
6011        check that in this case the first element in the chain is STMT.  */
6012     gcc_assert (stmt == tmp
6013                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6014
6015   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6016     return false;
6017
6018   if (slp_node)
6019     ncopies = 1;
6020   else
6021     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6022                / TYPE_VECTOR_SUBPARTS (vectype_in));
6023
6024   gcc_assert (ncopies >= 1);
6025
6026   vec_mode = TYPE_MODE (vectype_in);
6027
6028   if (code == COND_EXPR)
6029     {
6030       /* Only call during the analysis stage, otherwise we'll lose
6031          STMT_VINFO_TYPE.  */
6032       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6033                                                 ops[reduc_index], 0, NULL))
6034         {
6035           if (dump_enabled_p ())
6036             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6037                              "unsupported condition in reduction\n");
6038           return false;
6039         }
6040     }
6041   else
6042     {
6043       /* 4. Supportable by target?  */
6044
6045       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6046           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6047         {
6048           /* Shifts and rotates are only supported by vectorizable_shifts,
6049              not vectorizable_reduction.  */
6050           if (dump_enabled_p ())
6051             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6052                              "unsupported shift or rotation.\n");
6053           return false;
6054         }
6055
6056       /* 4.1. check support for the operation in the loop  */
6057       optab = optab_for_tree_code (code, vectype_in, optab_default);
6058       if (!optab)
6059         {
6060           if (dump_enabled_p ())
6061             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6062                              "no optab.\n");
6063
6064           return false;
6065         }
6066
6067       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6068         {
6069           if (dump_enabled_p ())
6070             dump_printf (MSG_NOTE, "op not supported by target.\n");
6071
6072           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6073               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6074                   < vect_min_worthwhile_factor (code))
6075             return false;
6076
6077           if (dump_enabled_p ())
6078             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6079         }
6080
6081       /* Worthwhile without SIMD support?  */
6082       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6083           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6084              < vect_min_worthwhile_factor (code))
6085         {
6086           if (dump_enabled_p ())
6087             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6088                              "not worthwhile without SIMD support.\n");
6089
6090           return false;
6091         }
6092     }
6093
6094   /* 4.2. Check support for the epilog operation.
6095
6096           If STMT represents a reduction pattern, then the type of the
6097           reduction variable may be different than the type of the rest
6098           of the arguments.  For example, consider the case of accumulation
6099           of shorts into an int accumulator; The original code:
6100                         S1: int_a = (int) short_a;
6101           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6102
6103           was replaced with:
6104                         STMT: int_acc = widen_sum <short_a, int_acc>
6105
6106           This means that:
6107           1. The tree-code that is used to create the vector operation in the
6108              epilog code (that reduces the partial results) is not the
6109              tree-code of STMT, but is rather the tree-code of the original
6110              stmt from the pattern that STMT is replacing.  I.e, in the example
6111              above we want to use 'widen_sum' in the loop, but 'plus' in the
6112              epilog.
6113           2. The type (mode) we use to check available target support
6114              for the vector operation to be created in the *epilog*, is
6115              determined by the type of the reduction variable (in the example
6116              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6117              However the type (mode) we use to check available target support
6118              for the vector operation to be created *inside the loop*, is
6119              determined by the type of the other arguments to STMT (in the
6120              example we'd check this: optab_handler (widen_sum_optab,
6121              vect_short_mode)).
6122
6123           This is contrary to "regular" reductions, in which the types of all
6124           the arguments are the same as the type of the reduction variable.
6125           For "regular" reductions we can therefore use the same vector type
6126           (and also the same tree-code) when generating the epilog code and
6127           when generating the code inside the loop.  */
6128
6129   if (orig_stmt)
6130     {
6131       /* This is a reduction pattern: get the vectype from the type of the
6132          reduction variable, and get the tree-code from orig_stmt.  */
6133       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6134                   == TREE_CODE_REDUCTION);
6135       orig_code = gimple_assign_rhs_code (orig_stmt);
6136       gcc_assert (vectype_out);
6137       vec_mode = TYPE_MODE (vectype_out);
6138     }
6139   else
6140     {
6141       /* Regular reduction: use the same vectype and tree-code as used for
6142          the vector code inside the loop can be used for the epilog code. */
6143       orig_code = code;
6144
6145       if (code == MINUS_EXPR)
6146         orig_code = PLUS_EXPR;
6147
6148       /* For simple condition reductions, replace with the actual expression
6149          we want to base our reduction around.  */
6150       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6151         {
6152           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6153           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6154         }
6155       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6156                  == INTEGER_INDUC_COND_REDUCTION)
6157         orig_code = MAX_EXPR;
6158     }
6159
6160   if (nested_cycle)
6161     {
6162       def_bb = gimple_bb (reduc_def_stmt);
6163       def_stmt_loop = def_bb->loop_father;
6164       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6165                                        loop_preheader_edge (def_stmt_loop));
6166       if (TREE_CODE (def_arg) == SSA_NAME
6167           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6168           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6169           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6170           && vinfo_for_stmt (def_arg_stmt)
6171           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6172               == vect_double_reduction_def)
6173         double_reduc = true;
6174     }
6175
6176   epilog_reduc_code = ERROR_MARK;
6177
6178   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6179     {
6180       if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
6181         {
6182           reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
6183                                          optab_default);
6184           if (!reduc_optab)
6185             {
6186               if (dump_enabled_p ())
6187                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6188                                  "no optab for reduction.\n");
6189
6190               epilog_reduc_code = ERROR_MARK;
6191             }
6192           else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
6193             {
6194               if (dump_enabled_p ())
6195                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6196                                  "reduc op not supported by target.\n");
6197
6198               epilog_reduc_code = ERROR_MARK;
6199             }
6200         }
6201       else
6202         {
6203           if (!nested_cycle || double_reduc)
6204             {
6205               if (dump_enabled_p ())
6206                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6207                                  "no reduc code for scalar code.\n");
6208
6209               return false;
6210             }
6211         }
6212     }
6213   else
6214     {
6215       int scalar_precision
6216         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6217       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6218       cr_index_vector_type = build_vector_type
6219         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
6220
6221       optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
6222                                    optab_default);
6223       if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
6224           != CODE_FOR_nothing)
6225         epilog_reduc_code = REDUC_MAX_EXPR;
6226     }
6227
6228   if ((double_reduc
6229        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6230       && ncopies > 1)
6231     {
6232       if (dump_enabled_p ())
6233         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6234                          "multiple types in double reduction or condition "
6235                          "reduction.\n");
6236       return false;
6237     }
6238
6239   /* In case of widening multiplication by a constant, we update the type
6240      of the constant to be the type of the other operand.  We check that the
6241      constant fits the type in the pattern recognition pass.  */
6242   if (code == DOT_PROD_EXPR
6243       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6244     {
6245       if (TREE_CODE (ops[0]) == INTEGER_CST)
6246         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6247       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6248         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6249       else
6250         {
6251           if (dump_enabled_p ())
6252             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6253                              "invalid types in dot-prod\n");
6254
6255           return false;
6256         }
6257     }
6258
6259   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6260     {
6261       widest_int ni;
6262
6263       if (! max_loop_iterations (loop, &ni))
6264         {
6265           if (dump_enabled_p ())
6266             dump_printf_loc (MSG_NOTE, vect_location,
6267                              "loop count not known, cannot create cond "
6268                              "reduction.\n");
6269           return false;
6270         }
6271       /* Convert backedges to iterations.  */
6272       ni += 1;
6273
6274       /* The additional index will be the same type as the condition.  Check
6275          that the loop can fit into this less one (because we'll use up the
6276          zero slot for when there are no matches).  */
6277       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6278       if (wi::geu_p (ni, wi::to_widest (max_index)))
6279         {
6280           if (dump_enabled_p ())
6281             dump_printf_loc (MSG_NOTE, vect_location,
6282                              "loop size is greater than data size.\n");
6283           return false;
6284         }
6285     }
6286
6287   /* In case the vectorization factor (VF) is bigger than the number
6288      of elements that we can fit in a vectype (nunits), we have to generate
6289      more than one vector stmt - i.e - we need to "unroll" the
6290      vector stmt by a factor VF/nunits.  For more details see documentation
6291      in vectorizable_operation.  */
6292
6293   /* If the reduction is used in an outer loop we need to generate
6294      VF intermediate results, like so (e.g. for ncopies=2):
6295         r0 = phi (init, r0)
6296         r1 = phi (init, r1)
6297         r0 = x0 + r0;
6298         r1 = x1 + r1;
6299     (i.e. we generate VF results in 2 registers).
6300     In this case we have a separate def-use cycle for each copy, and therefore
6301     for each copy we get the vector def for the reduction variable from the
6302     respective phi node created for this copy.
6303
6304     Otherwise (the reduction is unused in the loop nest), we can combine
6305     together intermediate results, like so (e.g. for ncopies=2):
6306         r = phi (init, r)
6307         r = x0 + r;
6308         r = x1 + r;
6309    (i.e. we generate VF/2 results in a single register).
6310    In this case for each copy we get the vector def for the reduction variable
6311    from the vectorized reduction operation generated in the previous iteration.
6312
6313    This only works when we see both the reduction PHI and its only consumer
6314    in vectorizable_reduction and there are no intermediate stmts
6315    participating.  */
6316   use_operand_p use_p;
6317   gimple *use_stmt;
6318   if (ncopies > 1
6319       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6320       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6321       && (use_stmt == stmt
6322           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6323     {
6324       single_defuse_cycle = true;
6325       epilog_copies = 1;
6326     }
6327   else
6328     epilog_copies = ncopies;
6329
6330   /* If the reduction stmt is one of the patterns that have lane
6331      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6332   if ((ncopies > 1
6333        && ! single_defuse_cycle)
6334       && (code == DOT_PROD_EXPR
6335           || code == WIDEN_SUM_EXPR
6336           || code == SAD_EXPR))
6337     {
6338       if (dump_enabled_p ())
6339         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6340                          "multi def-use cycle not possible for lane-reducing "
6341                          "reduction operation\n");
6342       return false;
6343     }
6344
6345   if (!vec_stmt) /* transformation not required.  */
6346     {
6347       if (first_p)
6348         vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
6349       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6350       return true;
6351     }
6352
6353   /* Transform.  */
6354
6355   if (dump_enabled_p ())
6356     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6357
6358   /* FORNOW: Multiple types are not supported for condition.  */
6359   if (code == COND_EXPR)
6360     gcc_assert (ncopies == 1);
6361
6362   /* Create the destination vector  */
6363   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6364
6365   prev_stmt_info = NULL;
6366   prev_phi_info = NULL;
6367   if (slp_node)
6368     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6369   else
6370     {
6371       vec_num = 1;
6372       vec_oprnds0.create (1);
6373       vec_oprnds1.create (1);
6374       if (op_type == ternary_op)
6375         vec_oprnds2.create (1);
6376     }
6377
6378   phis.create (vec_num);
6379   vect_defs.create (vec_num);
6380   if (!slp_node)
6381     vect_defs.quick_push (NULL_TREE);
6382
6383   if (slp_node)
6384     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6385   else
6386     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6387
6388   for (j = 0; j < ncopies; j++)
6389     {
6390       if (code == COND_EXPR)
6391         {
6392           gcc_assert (!slp_node);
6393           vectorizable_condition (stmt, gsi, vec_stmt,
6394                                   PHI_RESULT (phis[0]),
6395                                   reduc_index, NULL);
6396           /* Multiple types are not supported for condition.  */
6397           break;
6398         }
6399
6400       /* Handle uses.  */
6401       if (j == 0)
6402         {
6403           if (slp_node)
6404             {
6405               /* Get vec defs for all the operands except the reduction index,
6406                  ensuring the ordering of the ops in the vector is kept.  */
6407               auto_vec<tree, 3> slp_ops;
6408               auto_vec<vec<tree>, 3> vec_defs;
6409
6410               slp_ops.quick_push (ops[0]);
6411               slp_ops.quick_push (ops[1]);
6412               if (op_type == ternary_op)
6413                 slp_ops.quick_push (ops[2]);
6414
6415               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6416
6417               vec_oprnds0.safe_splice (vec_defs[0]);
6418               vec_defs[0].release ();
6419               vec_oprnds1.safe_splice (vec_defs[1]);
6420               vec_defs[1].release ();
6421               if (op_type == ternary_op)
6422                 {
6423                   vec_oprnds2.safe_splice (vec_defs[2]);
6424                   vec_defs[2].release ();
6425                 }
6426             }
6427           else
6428             {
6429               vec_oprnds0.quick_push
6430                 (vect_get_vec_def_for_operand (ops[0], stmt));
6431               vec_oprnds1.quick_push
6432                 (vect_get_vec_def_for_operand (ops[1], stmt));
6433               if (op_type == ternary_op)
6434                 vec_oprnds2.quick_push
6435                   (vect_get_vec_def_for_operand (ops[2], stmt));
6436             }
6437         }
6438       else
6439         {
6440           if (!slp_node)
6441             {
6442               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6443
6444               if (single_defuse_cycle && reduc_index == 0)
6445                 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6446               else
6447                 vec_oprnds0[0]
6448                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6449               if (single_defuse_cycle && reduc_index == 1)
6450                 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6451               else
6452                 vec_oprnds1[0]
6453                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6454               if (op_type == ternary_op)
6455                 {
6456                   if (single_defuse_cycle && reduc_index == 2)
6457                     vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6458                   else
6459                     vec_oprnds2[0]
6460                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6461                 }
6462             }
6463         }
6464
6465       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6466         {
6467           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6468           if (op_type == ternary_op)
6469             vop[2] = vec_oprnds2[i];
6470
6471           new_temp = make_ssa_name (vec_dest, new_stmt);
6472           new_stmt = gimple_build_assign (new_temp, code,
6473                                           vop[0], vop[1], vop[2]);
6474           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6475
6476           if (slp_node)
6477             {
6478               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6479               vect_defs.quick_push (new_temp);
6480             }
6481           else
6482             vect_defs[0] = new_temp;
6483         }
6484
6485       if (slp_node)
6486         continue;
6487
6488       if (j == 0)
6489         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6490       else
6491         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6492
6493       prev_stmt_info = vinfo_for_stmt (new_stmt);
6494     }
6495
6496   /* Finalize the reduction-phi (set its arguments) and create the
6497      epilog reduction code.  */
6498   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6499     vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6500
6501   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6502                                     epilog_copies,
6503                                     epilog_reduc_code, phis,
6504                                     double_reduc, slp_node, slp_node_instance);
6505
6506   return true;
6507 }
6508
6509 /* Function vect_min_worthwhile_factor.
6510
6511    For a loop where we could vectorize the operation indicated by CODE,
6512    return the minimum vectorization factor that makes it worthwhile
6513    to use generic vectors.  */
6514 int
6515 vect_min_worthwhile_factor (enum tree_code code)
6516 {
6517   switch (code)
6518     {
6519     case PLUS_EXPR:
6520     case MINUS_EXPR:
6521     case NEGATE_EXPR:
6522       return 4;
6523
6524     case BIT_AND_EXPR:
6525     case BIT_IOR_EXPR:
6526     case BIT_XOR_EXPR:
6527     case BIT_NOT_EXPR:
6528       return 2;
6529
6530     default:
6531       return INT_MAX;
6532     }
6533 }
6534
6535
6536 /* Function vectorizable_induction
6537
6538    Check if PHI performs an induction computation that can be vectorized.
6539    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6540    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6541    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6542
6543 bool
6544 vectorizable_induction (gimple *phi,
6545                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6546                         gimple **vec_stmt, slp_tree slp_node)
6547 {
6548   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6549   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6550   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6551   unsigned ncopies;
6552   bool nested_in_vect_loop = false;
6553   struct loop *iv_loop;
6554   tree vec_def;
6555   edge pe = loop_preheader_edge (loop);
6556   basic_block new_bb;
6557   tree new_vec, vec_init, vec_step, t;
6558   tree new_name;
6559   gimple *new_stmt;
6560   gphi *induction_phi;
6561   tree induc_def, vec_dest;
6562   tree init_expr, step_expr;
6563   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6564   unsigned i;
6565   tree expr;
6566   gimple_seq stmts;
6567   imm_use_iterator imm_iter;
6568   use_operand_p use_p;
6569   gimple *exit_phi;
6570   edge latch_e;
6571   tree loop_arg;
6572   gimple_stmt_iterator si;
6573   basic_block bb = gimple_bb (phi);
6574
6575   if (gimple_code (phi) != GIMPLE_PHI)
6576     return false;
6577
6578   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6579     return false;
6580
6581   /* Make sure it was recognized as induction computation.  */
6582   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6583     return false;
6584
6585   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6586   unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6587
6588   if (slp_node)
6589     ncopies = 1;
6590   else
6591     ncopies = vf / nunits;
6592   gcc_assert (ncopies >= 1);
6593
6594   /* FORNOW. These restrictions should be relaxed.  */
6595   if (nested_in_vect_loop_p (loop, phi))
6596     {
6597       imm_use_iterator imm_iter;
6598       use_operand_p use_p;
6599       gimple *exit_phi;
6600       edge latch_e;
6601       tree loop_arg;
6602
6603       if (ncopies > 1)
6604         {
6605           if (dump_enabled_p ())
6606             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6607                              "multiple types in nested loop.\n");
6608           return false;
6609         }
6610
6611       /* FORNOW: outer loop induction with SLP not supported.  */
6612       if (STMT_SLP_TYPE (stmt_info))
6613         return false;
6614
6615       exit_phi = NULL;
6616       latch_e = loop_latch_edge (loop->inner);
6617       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6618       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6619         {
6620           gimple *use_stmt = USE_STMT (use_p);
6621           if (is_gimple_debug (use_stmt))
6622             continue;
6623
6624           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6625             {
6626               exit_phi = use_stmt;
6627               break;
6628             }
6629         }
6630       if (exit_phi)
6631         {
6632           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6633           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6634                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6635             {
6636               if (dump_enabled_p ())
6637                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6638                                  "inner-loop induction only used outside "
6639                                  "of the outer vectorized loop.\n");
6640               return false;
6641             }
6642         }
6643
6644       nested_in_vect_loop = true;
6645       iv_loop = loop->inner;
6646     }
6647   else
6648     iv_loop = loop;
6649   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6650
6651   if (!vec_stmt) /* transformation not required.  */
6652     {
6653       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6654       if (dump_enabled_p ())
6655         dump_printf_loc (MSG_NOTE, vect_location,
6656                          "=== vectorizable_induction ===\n");
6657       vect_model_induction_cost (stmt_info, ncopies);
6658       return true;
6659     }
6660
6661   /* Transform.  */
6662
6663   /* Compute a vector variable, initialized with the first VF values of
6664      the induction variable.  E.g., for an iv with IV_PHI='X' and
6665      evolution S, for a vector of 4 units, we want to compute:
6666      [X, X + S, X + 2*S, X + 3*S].  */
6667
6668   if (dump_enabled_p ())
6669     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6670
6671   latch_e = loop_latch_edge (iv_loop);
6672   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6673
6674   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6675   gcc_assert (step_expr != NULL_TREE);
6676
6677   pe = loop_preheader_edge (iv_loop);
6678   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6679                                      loop_preheader_edge (iv_loop));
6680
6681   /* Convert the step to the desired type.  */
6682   stmts = NULL;
6683   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6684   if (stmts)
6685     {
6686       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6687       gcc_assert (!new_bb);
6688     }
6689
6690   /* Find the first insertion point in the BB.  */
6691   si = gsi_after_labels (bb);
6692
6693   /* For SLP induction we have to generate several IVs as for example
6694      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6695      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
6696      [VF*S, VF*S, VF*S, VF*S] for all.  */
6697   if (slp_node)
6698     {
6699       /* Convert the init to the desired type.  */
6700       stmts = NULL;
6701       init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6702       if (stmts)
6703         {
6704           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6705           gcc_assert (!new_bb);
6706         }
6707
6708       /* Generate [VF*S, VF*S, ... ].  */
6709       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6710         {
6711           expr = build_int_cst (integer_type_node, vf);
6712           expr = fold_convert (TREE_TYPE (step_expr), expr);
6713         }
6714       else
6715         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6716       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6717                               expr, step_expr);
6718       if (! CONSTANT_CLASS_P (new_name))
6719         new_name = vect_init_vector (phi, new_name,
6720                                      TREE_TYPE (step_expr), NULL);
6721       new_vec = build_vector_from_val (vectype, new_name);
6722       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6723
6724       /* Now generate the IVs.  */
6725       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6726       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6727       unsigned elts = nunits * nvects;
6728       unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6729       gcc_assert (elts % group_size == 0);
6730       tree elt = init_expr;
6731       unsigned ivn;
6732       for (ivn = 0; ivn < nivs; ++ivn)
6733         {
6734           tree *elts = XALLOCAVEC (tree, nunits);
6735           bool constant_p = true;
6736           for (unsigned eltn = 0; eltn < nunits; ++eltn)
6737             {
6738               if (ivn*nunits + eltn >= group_size
6739                   && (ivn*nunits + eltn) % group_size == 0)
6740                 {
6741                   stmts = NULL;
6742                   elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6743                                       elt, step_expr);
6744                   if (stmts)
6745                     {
6746                       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6747                       gcc_assert (!new_bb);
6748                     }
6749                 }
6750               if (! CONSTANT_CLASS_P (elt))
6751                 constant_p = false;
6752               elts[eltn] = elt;
6753             }
6754           if (constant_p)
6755             new_vec = build_vector (vectype, elts);
6756           else
6757             {
6758               vec<constructor_elt, va_gc> *v;
6759               vec_alloc (v, nunits);
6760               for (i = 0; i < nunits; ++i)
6761                 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
6762               new_vec = build_constructor (vectype, v);
6763             }
6764           vec_init = vect_init_vector (phi, new_vec, vectype, NULL);
6765
6766           /* Create the induction-phi that defines the induction-operand.  */
6767           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6768           induction_phi = create_phi_node (vec_dest, iv_loop->header);
6769           set_vinfo_for_stmt (induction_phi,
6770                               new_stmt_vec_info (induction_phi, loop_vinfo));
6771           induc_def = PHI_RESULT (induction_phi);
6772
6773           /* Create the iv update inside the loop  */
6774           vec_def = make_ssa_name (vec_dest);
6775           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6776           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6777           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6778
6779           /* Set the arguments of the phi node:  */
6780           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6781           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6782                        UNKNOWN_LOCATION);
6783
6784           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6785         }
6786
6787       /* Re-use IVs when we can.  */
6788       if (ivn < nvects)
6789         {
6790           unsigned vfp
6791             = least_common_multiple (group_size, nunits) / group_size;
6792           /* Generate [VF'*S, VF'*S, ... ].  */
6793           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6794             {
6795               expr = build_int_cst (integer_type_node, vfp);
6796               expr = fold_convert (TREE_TYPE (step_expr), expr);
6797             }
6798           else
6799             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6800           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6801                                   expr, step_expr);
6802           if (! CONSTANT_CLASS_P (new_name))
6803             new_name = vect_init_vector (phi, new_name,
6804                                          TREE_TYPE (step_expr), NULL);
6805           new_vec = build_vector_from_val (vectype, new_name);
6806           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6807           for (; ivn < nvects; ++ivn)
6808             {
6809               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6810               tree def;
6811               if (gimple_code (iv) == GIMPLE_PHI)
6812                 def = gimple_phi_result (iv);
6813               else
6814                 def = gimple_assign_lhs (iv);
6815               new_stmt = gimple_build_assign (make_ssa_name (vectype),
6816                                               PLUS_EXPR,
6817                                               def, vec_step);
6818               if (gimple_code (iv) == GIMPLE_PHI)
6819                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6820               else
6821                 {
6822                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6823                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6824                 }
6825               set_vinfo_for_stmt (new_stmt,
6826                                   new_stmt_vec_info (new_stmt, loop_vinfo));
6827               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6828             }
6829         }
6830
6831       return true;
6832     }
6833
6834   /* Create the vector that holds the initial_value of the induction.  */
6835   if (nested_in_vect_loop)
6836     {
6837       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
6838          been created during vectorization of previous stmts.  We obtain it
6839          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
6840       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6841       /* If the initial value is not of proper type, convert it.  */
6842       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6843         {
6844           new_stmt
6845             = gimple_build_assign (vect_get_new_ssa_name (vectype,
6846                                                           vect_simple_var,
6847                                                           "vec_iv_"),
6848                                    VIEW_CONVERT_EXPR,
6849                                    build1 (VIEW_CONVERT_EXPR, vectype,
6850                                            vec_init));
6851           vec_init = gimple_assign_lhs (new_stmt);
6852           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6853                                                  new_stmt);
6854           gcc_assert (!new_bb);
6855           set_vinfo_for_stmt (new_stmt,
6856                               new_stmt_vec_info (new_stmt, loop_vinfo));
6857         }
6858     }
6859   else
6860     {
6861       vec<constructor_elt, va_gc> *v;
6862
6863       /* iv_loop is the loop to be vectorized. Create:
6864          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
6865       stmts = NULL;
6866       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6867
6868       vec_alloc (v, nunits);
6869       bool constant_p = is_gimple_min_invariant (new_name);
6870       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
6871       for (i = 1; i < nunits; i++)
6872         {
6873           /* Create: new_name_i = new_name + step_expr  */
6874           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6875                                    new_name, step_expr);
6876           if (!is_gimple_min_invariant (new_name))
6877             constant_p = false;
6878           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
6879         }
6880       if (stmts)
6881         {
6882           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6883           gcc_assert (!new_bb);
6884         }
6885
6886       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
6887       if (constant_p)
6888         new_vec = build_vector_from_ctor (vectype, v);
6889       else
6890         new_vec = build_constructor (vectype, v);
6891       vec_init = vect_init_vector (phi, new_vec, vectype, NULL);
6892     }
6893
6894
6895   /* Create the vector that holds the step of the induction.  */
6896   if (nested_in_vect_loop)
6897     /* iv_loop is nested in the loop to be vectorized. Generate:
6898        vec_step = [S, S, S, S]  */
6899     new_name = step_expr;
6900   else
6901     {
6902       /* iv_loop is the loop to be vectorized. Generate:
6903           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
6904       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6905         {
6906           expr = build_int_cst (integer_type_node, vf);
6907           expr = fold_convert (TREE_TYPE (step_expr), expr);
6908         }
6909       else
6910         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6911       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6912                               expr, step_expr);
6913       if (TREE_CODE (step_expr) == SSA_NAME)
6914         new_name = vect_init_vector (phi, new_name,
6915                                      TREE_TYPE (step_expr), NULL);
6916     }
6917
6918   t = unshare_expr (new_name);
6919   gcc_assert (CONSTANT_CLASS_P (new_name)
6920               || TREE_CODE (new_name) == SSA_NAME);
6921   new_vec = build_vector_from_val (vectype, t);
6922   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6923
6924
6925   /* Create the following def-use cycle:
6926      loop prolog:
6927          vec_init = ...
6928          vec_step = ...
6929      loop:
6930          vec_iv = PHI <vec_init, vec_loop>
6931          ...
6932          STMT
6933          ...
6934          vec_loop = vec_iv + vec_step;  */
6935
6936   /* Create the induction-phi that defines the induction-operand.  */
6937   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6938   induction_phi = create_phi_node (vec_dest, iv_loop->header);
6939   set_vinfo_for_stmt (induction_phi,
6940                       new_stmt_vec_info (induction_phi, loop_vinfo));
6941   induc_def = PHI_RESULT (induction_phi);
6942
6943   /* Create the iv update inside the loop  */
6944   vec_def = make_ssa_name (vec_dest);
6945   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6946   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6947   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6948
6949   /* Set the arguments of the phi node:  */
6950   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6951   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6952                UNKNOWN_LOCATION);
6953
6954   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
6955
6956   /* In case that vectorization factor (VF) is bigger than the number
6957      of elements that we can fit in a vectype (nunits), we have to generate
6958      more than one vector stmt - i.e - we need to "unroll" the
6959      vector stmt by a factor VF/nunits.  For more details see documentation
6960      in vectorizable_operation.  */
6961
6962   if (ncopies > 1)
6963     {
6964       stmt_vec_info prev_stmt_vinfo;
6965       /* FORNOW. This restriction should be relaxed.  */
6966       gcc_assert (!nested_in_vect_loop);
6967
6968       /* Create the vector that holds the step of the induction.  */
6969       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6970         {
6971           expr = build_int_cst (integer_type_node, nunits);
6972           expr = fold_convert (TREE_TYPE (step_expr), expr);
6973         }
6974       else
6975         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
6976       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6977                               expr, step_expr);
6978       if (TREE_CODE (step_expr) == SSA_NAME)
6979         new_name = vect_init_vector (phi, new_name,
6980                                      TREE_TYPE (step_expr), NULL);
6981       t = unshare_expr (new_name);
6982       gcc_assert (CONSTANT_CLASS_P (new_name)
6983                   || TREE_CODE (new_name) == SSA_NAME);
6984       new_vec = build_vector_from_val (vectype, t);
6985       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6986
6987       vec_def = induc_def;
6988       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
6989       for (i = 1; i < ncopies; i++)
6990         {
6991           /* vec_i = vec_prev + vec_step  */
6992           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
6993                                           vec_def, vec_step);
6994           vec_def = make_ssa_name (vec_dest, new_stmt);
6995           gimple_assign_set_lhs (new_stmt, vec_def);
6996
6997           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6998           set_vinfo_for_stmt (new_stmt,
6999                               new_stmt_vec_info (new_stmt, loop_vinfo));
7000           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7001           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7002         }
7003     }
7004
7005   if (nested_in_vect_loop)
7006     {
7007       /* Find the loop-closed exit-phi of the induction, and record
7008          the final vector of induction results:  */
7009       exit_phi = NULL;
7010       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7011         {
7012           gimple *use_stmt = USE_STMT (use_p);
7013           if (is_gimple_debug (use_stmt))
7014             continue;
7015
7016           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7017             {
7018               exit_phi = use_stmt;
7019               break;
7020             }
7021         }
7022       if (exit_phi)
7023         {
7024           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7025           /* FORNOW. Currently not supporting the case that an inner-loop induction
7026              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7027           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7028                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7029
7030           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7031           if (dump_enabled_p ())
7032             {
7033               dump_printf_loc (MSG_NOTE, vect_location,
7034                                "vector of inductions after inner-loop:");
7035               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7036             }
7037         }
7038     }
7039
7040
7041   if (dump_enabled_p ())
7042     {
7043       dump_printf_loc (MSG_NOTE, vect_location,
7044                        "transform induction: created def-use cycle: ");
7045       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7046       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7047                         SSA_NAME_DEF_STMT (vec_def), 0);
7048     }
7049
7050   return true;
7051 }
7052
7053 /* Function vectorizable_live_operation.
7054
7055    STMT computes a value that is used outside the loop.  Check if
7056    it can be supported.

        STMT must be marked live (asserted below).  GSI is unused.  A non-NULL
        SLP_NODE selects the SLP path, in which case SLP_INDEX (asserted to be
        >= 0) is combined with the number of scalar and vector stmts of the
        node to locate the lane to extract.  VEC_STMT is only tested against
        NULL here: when it is NULL no transformation is performed.

        Return false for reduction definitions and when
        nested_in_vect_loop_p (loop, stmt) holds; return true otherwise.
        When transforming, the selected lane of the vectorized lhs is read
        with a BIT_FIELD_REF, any stmts needed for that are inserted on the
        loop's single exit edge, and the non-debug uses of the scalar lhs
        outside the loop are redirected to the extracted value.  */
7057
7058 bool
7059 vectorizable_live_operation (gimple *stmt,
7060                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7061                              slp_tree slp_node, int slp_index,
7062                              gimple **vec_stmt)
7063 {
7064   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7065   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7066   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7067   imm_use_iterator imm_iter;
7068   tree lhs, lhs_type, bitsize, vec_bitsize;
7069   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7070   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
       /* Number of vector copies per scalar stmt: the vectorization factor
          divided by the number of elements in VECTYPE.  */
7071   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
7072   gimple *use_stmt;
       /* NOTE(review): vec_oprnds is not referenced anywhere else in this
          function.  */
7073   auto_vec<tree> vec_oprnds;
7074
7075   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7076
       /* Reduction definitions are rejected by this function.  */
7077   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7078     return false;
7079
7080   /* FORNOW.  CHECKME.  */
7081   if (nested_in_vect_loop_p (loop, stmt))
7082     return false;
7083
7084   /* If STMT is not relevant and it is a simple assignment and its inputs are
7085      invariant then it can remain in place, unvectorized.  The original last
7086      scalar value that it computes will be used.  */
7087   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7088     {
7089       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7090       if (dump_enabled_p ())
7091         dump_printf_loc (MSG_NOTE, vect_location,
7092                          "statement is simple and uses invariant.  Leaving in "
7093                          "place.\n");
7094       return true;
7095     }
7096
7097   if (!vec_stmt)
7098     /* No transformation required.  */
7099     return true;
7100
7101   /* If stmt has a related stmt, then use that for getting the lhs.  */
7102   if (is_pattern_stmt_p (stmt_info))
7103     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7104
       /* A PHI has no ordinary lhs, so take its result instead.  */
7105   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7106         : gimple_get_lhs (stmt);
7107   lhs_type = TREE_TYPE (lhs);
7108
       /* TYPE_SIZE of one vector element and of the whole vector.  */
7109   bitsize = TYPE_SIZE (TREE_TYPE (vectype));
7110   vec_bitsize = TYPE_SIZE (vectype);
7111
7112   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7113   tree vec_lhs, bitstart;
7114   if (slp_node)
7115     {
7116       gcc_assert (slp_index >= 0);
7117
7118       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7119       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7120
7121       /* Get the last occurrence of the scalar index from the concatenation of
7122          all the slp vectors. Calculate which slp vector it is and the index
7123          within.  */
7124       int pos = (num_vec * nunits) - num_scalar + slp_index;
7125       int vec_entry = pos / nunits;
7126       int vec_index = pos % nunits;
7127
7128       /* Get the correct slp vectorized stmt.  */
7129       vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7130
7131       /* Get entry to use.  */
7132       bitstart = build_int_cst (unsigned_type_node, vec_index);
7133       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7134     }
7135   else
7136     {
7137       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7138       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7139
7140       /* For multiple copies, get the last copy.  */
7141       for (int i = 1; i < ncopies; ++i)
7142         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7143                                                   vec_lhs);
7144
7145       /* Get the last lane in the vector.  */
7146       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7147     }
7148
7149   /* Create a new vectorized stmt for the uses of STMT and insert outside the
7150      loop.  */
7151   gimple_seq stmts = NULL;
       /* The extracted field has the vector's element type, except for boolean
          vectors, where an integer type of BITSIZE bits is built instead.  */
7152   tree bftype = TREE_TYPE (vectype);
7153   if (VECTOR_BOOLEAN_TYPE_P (vectype))
7154     bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7155   tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7156   new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7157                                    true, NULL_TREE);
       /* Emit whatever stmts gimplification produced on the loop's single
          exit edge.  */
7158   if (stmts)
7159     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7160
7161   /* Replace use of lhs with newly computed result.  If the use stmt is a
7162      single arg PHI, just replace all uses of PHI result.  It's necessary
7163      because lcssa PHI defining lhs may be before newly inserted stmt.  */
7164   use_operand_p use_p;
7165   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7166     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7167         && !is_gimple_debug (use_stmt))
7168     {
7169       if (gimple_code (use_stmt) == GIMPLE_PHI
7170           && gimple_phi_num_args (use_stmt) == 1)
7171         {
7172           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7173         }
7174       else
7175         {
7176           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7177             SET_USE (use_p, new_tree);
7178         }
7179       update_stmt (use_stmt);
7180     }
7181
7182   return true;
7183 }
7184
7185 /* Kill any debug uses outside LOOP of SSA names defined in STMT.

        Every definition of STMT (a PHI or an ordinary stmt) has its
        immediate uses walked.  Non-debug uses and uses located inside LOOP
        are left alone.  A debug bind outside LOOP has its value reset and is
        updated; any other kind of debug stmt there hits gcc_unreachable.  */
7186
7187 static void
7188 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7189 {
7190   ssa_op_iter op_iter;
7191   imm_use_iterator imm_iter;
7192   def_operand_p def_p;
7193   gimple *ustmt;
7194
7195   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7196     {
7197       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7198         {
7199           basic_block bb;
7200
               /* Only debug stmts are of interest here.  */
7201           if (!is_gimple_debug (ustmt))
7202             continue;
7203
7204           bb = gimple_bb (ustmt);
7205
               /* Debug uses that stay inside LOOP are kept untouched.  */
7206           if (!flow_bb_inside_loop_p (loop, bb))
7207             {
7208               if (gimple_debug_bind_p (ustmt))
7209                 {
7210                   if (dump_enabled_p ())
7211                     dump_printf_loc (MSG_NOTE, vect_location,
7212                                      "killing debug use\n");
7213
7214                   gimple_debug_bind_reset_value (ustmt);
7215                   update_stmt (ustmt);
7216                 }
7217               else
                     /* Any debug stmt other than a bind is not expected.  */
7218                 gcc_unreachable ();
7219             }
7220         }
7221     }
7222 }
7223
7224 /* Given loop represented by LOOP_VINFO, return true if computation of
7225    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7226    otherwise.

        Two checks are tried in order.  When the iteration count is known,
        NITERSM1 < NITERS is tested on the widest-integer values.  Otherwise,
        or when that test fails, the bound returned by
        get_max_loop_iterations is compared against the maximum value of the
        type of NITERS.  False is returned when neither check succeeds.  */
7227
7228 static bool
7229 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7230 {
7231   /* Constant case.  */
7232   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7233     {
7234       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7235       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7236
7237       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7238       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
           /* With NITERS = NITERSM1 + 1, a wrapped NITERS would not compare
              greater than NITERSM1.  On failure fall through to the bound
              check below.  */
7239       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7240         return true;
7241     }
7242
7243   widest_int max;
7244   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7245   /* Check the upper bound of loop niters.  */
7246   if (get_max_loop_iterations (loop, &max))
7247     {
7248       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7249       signop sgn = TYPE_SIGN (type);
7250       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
           /* A bound strictly below the type's maximum leaves room for the
              + 1 (presumably MAX bounds NITERSM1 -- confirm against
              get_max_loop_iterations).  */
7251       if (max < type_max)
7252         return true;
7253     }
7254   return false;
7255 }
7256
7257 /* Scale profiling counters by estimation for LOOP which is vectorized
7258    by factor VF.

        The new iteration estimate is taken from
        niter_for_unrolled_loop (LOOP, VF).  The loop is first rescaled by
        the preheader count times (estimate + 1), taken as a probability in
        the header count.  The count and probability of the single exit edge
        and of the edge entering the latch are then recomputed from the
        estimate.  */
7259
7260 static void
7261 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7262 {
7263   edge preheader = loop_preheader_edge (loop);
7264   /* Reduce loop iterations by the vectorization factor.  */
7265   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
       /* FREQ_H is the header count, FREQ_E the preheader edge count.  */
7266   profile_count freq_h = loop->header->count, freq_e = preheader->count;
7267
7268   /* Use frequency only if counts are zero.  */
7269   if (!(freq_h > 0) && !(freq_e > 0))
7270     {
7271       freq_h = profile_count::from_gcov_type (loop->header->frequency);
7272       freq_e = profile_count::from_gcov_type (EDGE_FREQUENCY (preheader));
7273     }
7274   if (freq_h > 0)
7275     {
7276       profile_probability p;
7277
7278       /* Avoid dropping loop body profile counter to 0 because of zero count
7279          in loop's preheader.  */
7280       if (!(freq_e > profile_count::from_gcov_type (1)))
7281        freq_e = profile_count::from_gcov_type (1);
           /* P is FREQ_E scaled by (NEW_EST_NITER + 1), taken as a probability
              in FREQ_H; the whole loop is scaled by it.  */
7282       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7283       scale_loop_frequencies (loop, p);
7284     }
7285
       /* The exit edge receives the preheader edge's count and the
          probability 1 / (NEW_EST_NITER + 1).  EXIT_BB is the single
          predecessor block of the latch.  */
7286   basic_block exit_bb = single_pred (loop->latch);
7287   edge exit_e = single_exit (loop);
7288   exit_e->count = loop_preheader_edge (loop)->count;
7289   exit_e->probability = profile_probability::always ()
7290                                  .apply_scale (1, new_est_niter + 1);
7291
       /* The edge into the latch takes the inverted exit probability.  Its
          previous probability is saved first so that the latch block can be
          rescaled by the new/old ratio when both are initialized.  */
7292   edge exit_l = single_pred_edge (loop->latch);
7293   profile_probability prob = exit_l->probability;
7294   exit_l->probability = exit_e->probability.invert ();
7295   exit_l->count = exit_bb->count - exit_e->count;
7296   if (prob.initialized_p () && exit_l->probability.initialized_p ())
7297     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7298 }
7299
7300 /* Function vect_transform_loop.
7301
7302    The analysis phase has determined that the loop is vectorizable.
7303    Vectorize the loop - create vectorized stmts to replace the scalar
7304    stmts in the loop, and update the loop exit condition.
7305    Returns scalar epilogue loop if any.  */
7306
7307 struct loop *
7308 vect_transform_loop (loop_vec_info loop_vinfo)
7309 {
7310   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7311   struct loop *epilogue = NULL;
7312   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7313   int nbbs = loop->num_nodes;
7314   int i;
7315   tree niters_vector = NULL;
7316   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7317   bool grouped_store;
7318   bool slp_scheduled = false;
7319   gimple *stmt, *pattern_stmt;
7320   gimple_seq pattern_def_seq = NULL;
7321   gimple_stmt_iterator pattern_def_si = gsi_none ();
7322   bool transform_pattern_stmt = false;
7323   bool check_profitability = false;
7324   int th;
7325
7326   if (dump_enabled_p ())
7327     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
7328
7329   /* Use the more conservative vectorization threshold.  If the number
7330      of iterations is constant assume the cost check has been performed
7331      by our caller.  If the threshold makes all loops profitable that
7332      run at least the vectorization factor number of times checking
7333      is pointless, too.  */
7334   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7335   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7336       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7337     {
7338       if (dump_enabled_p ())
7339         dump_printf_loc (MSG_NOTE, vect_location,
7340                          "Profitability threshold is %d loop iterations.\n",
7341                          th);
7342       check_profitability = true;
7343     }
7344
7345   /* Make sure there exists a single-predecessor exit bb.  Do this before
7346      versioning.   */
7347   edge e = single_exit (loop);
7348   if (! single_pred_p (e->dest))
7349     {
7350       split_loop_exit_edge (e);
7351       if (dump_enabled_p ())
7352         dump_printf (MSG_NOTE, "split exit edge\n");
7353     }
7354
7355   /* Version the loop first, if required, so the profitability check
7356      comes first.  */
7357
7358   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7359     {
7360       vect_loop_versioning (loop_vinfo, th, check_profitability);
7361       check_profitability = false;
7362     }
7363
7364   /* Make sure there exists a single-predecessor exit bb also on the
7365      scalar loop copy.  Do this after versioning but before peeling
7366      so CFG structure is fine for both scalar and if-converted loop
7367      to make slpeel_duplicate_current_defs_from_edges face matched
7368      loop closed PHI nodes on the exit.  */
7369   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7370     {
7371       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7372       if (! single_pred_p (e->dest))
7373         {
7374           split_loop_exit_edge (e);
7375           if (dump_enabled_p ())
7376             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7377         }
7378     }
7379
7380   tree niters = vect_build_loop_niters (loop_vinfo);
7381   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7382   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7383   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7384   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
7385                               check_profitability, niters_no_overflow);
7386   if (niters_vector == NULL_TREE)
7387     {
7388       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7389         niters_vector
7390           = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7391                            LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
7392       else
7393         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7394                                      niters_no_overflow);
7395     }
7396
7397   /* 1) Make sure the loop header has exactly two entries
7398      2) Make sure we have a preheader basic block.  */
7399
7400   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7401
7402   split_edge (loop_preheader_edge (loop));
7403
7404   /* FORNOW: the vectorizer supports only loops which body consist
7405      of one basic block (header + empty latch). When the vectorizer will
7406      support more involved loop forms, the order by which the BBs are
7407      traversed need to be reconsidered.  */
7408
7409   for (i = 0; i < nbbs; i++)
7410     {
7411       basic_block bb = bbs[i];
7412       stmt_vec_info stmt_info;
7413
7414       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7415            gsi_next (&si))
7416         {
7417           gphi *phi = si.phi ();
7418           if (dump_enabled_p ())
7419             {
7420               dump_printf_loc (MSG_NOTE, vect_location,
7421                                "------>vectorizing phi: ");
7422               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7423             }
7424           stmt_info = vinfo_for_stmt (phi);
7425           if (!stmt_info)
7426             continue;
7427
7428           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7429             vect_loop_kill_debug_uses (loop, phi);
7430
7431           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7432               && !STMT_VINFO_LIVE_P (stmt_info))
7433             continue;
7434
7435           if (STMT_VINFO_VECTYPE (stmt_info)
7436               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7437                   != (unsigned HOST_WIDE_INT) vf)
7438               && dump_enabled_p ())
7439             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7440
7441           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7442                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7443                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7444               && ! PURE_SLP_STMT (stmt_info))
7445             {
7446               if (dump_enabled_p ())
7447                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7448               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7449             }
7450         }
7451
7452       pattern_stmt = NULL;
7453       for (gimple_stmt_iterator si = gsi_start_bb (bb);
7454            !gsi_end_p (si) || transform_pattern_stmt;)
7455         {
7456           bool is_store;
7457
7458           if (transform_pattern_stmt)
7459             stmt = pattern_stmt;
7460           else
7461             {
7462               stmt = gsi_stmt (si);
7463               /* During vectorization remove existing clobber stmts.  */
7464               if (gimple_clobber_p (stmt))
7465                 {
7466                   unlink_stmt_vdef (stmt);
7467                   gsi_remove (&si, true);
7468                   release_defs (stmt);
7469                   continue;
7470                 }
7471             }
7472
7473           if (dump_enabled_p ())
7474             {
7475               dump_printf_loc (MSG_NOTE, vect_location,
7476                                "------>vectorizing statement: ");
7477               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7478             }
7479
7480           stmt_info = vinfo_for_stmt (stmt);
7481
7482           /* vector stmts created in the outer-loop during vectorization of
7483              stmts in an inner-loop may not have a stmt_info, and do not
7484              need to be vectorized.  */
7485           if (!stmt_info)
7486             {
7487               gsi_next (&si);
7488               continue;
7489             }
7490
7491           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7492             vect_loop_kill_debug_uses (loop, stmt);
7493
7494           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7495               && !STMT_VINFO_LIVE_P (stmt_info))
7496             {
7497               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7498                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7499                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7500                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7501                 {
7502                   stmt = pattern_stmt;
7503                   stmt_info = vinfo_for_stmt (stmt);
7504                 }
7505               else
7506                 {
7507                   gsi_next (&si);
7508                   continue;
7509                 }
7510             }
7511           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7512                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7513                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7514                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7515             transform_pattern_stmt = true;
7516
7517           /* If pattern statement has def stmts, vectorize them too.  */
7518           if (is_pattern_stmt_p (stmt_info))
7519             {
7520               if (pattern_def_seq == NULL)
7521                 {
7522                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7523                   pattern_def_si = gsi_start (pattern_def_seq);
7524                 }
7525               else if (!gsi_end_p (pattern_def_si))
7526                 gsi_next (&pattern_def_si);
7527               if (pattern_def_seq != NULL)
7528                 {
7529                   gimple *pattern_def_stmt = NULL;
7530                   stmt_vec_info pattern_def_stmt_info = NULL;
7531
7532                   while (!gsi_end_p (pattern_def_si))
7533                     {
7534                       pattern_def_stmt = gsi_stmt (pattern_def_si);
7535                       pattern_def_stmt_info
7536                         = vinfo_for_stmt (pattern_def_stmt);
7537                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7538                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7539                         break;
7540                       gsi_next (&pattern_def_si);
7541                     }
7542
7543                   if (!gsi_end_p (pattern_def_si))
7544                     {
7545                       if (dump_enabled_p ())
7546                         {
7547                           dump_printf_loc (MSG_NOTE, vect_location,
7548                                            "==> vectorizing pattern def "
7549                                            "stmt: ");
7550                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7551                                             pattern_def_stmt, 0);
7552                         }
7553
7554                       stmt = pattern_def_stmt;
7555                       stmt_info = pattern_def_stmt_info;
7556                     }
7557                   else
7558                     {
7559                       pattern_def_si = gsi_none ();
7560                       transform_pattern_stmt = false;
7561                     }
7562                 }
7563               else
7564                 transform_pattern_stmt = false;
7565             }
7566
7567           if (STMT_VINFO_VECTYPE (stmt_info))
7568             {
7569               unsigned int nunits
7570                 = (unsigned int)
7571                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7572               if (!STMT_SLP_TYPE (stmt_info)
7573                   && nunits != (unsigned int) vf
7574                   && dump_enabled_p ())
7575                   /* For SLP VF is set according to unrolling factor, and not
7576                      to vector size, hence for SLP this print is not valid.  */
7577                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7578             }
7579
7580           /* SLP. Schedule all the SLP instances when the first SLP stmt is
7581              reached.  */
7582           if (STMT_SLP_TYPE (stmt_info))
7583             {
7584               if (!slp_scheduled)
7585                 {
7586                   slp_scheduled = true;
7587
7588                   if (dump_enabled_p ())
7589                     dump_printf_loc (MSG_NOTE, vect_location,
7590                                      "=== scheduling SLP instances ===\n");
7591
7592                   vect_schedule_slp (loop_vinfo);
7593                 }
7594
7595               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
7596               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7597                 {
7598                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7599                     {
7600                       pattern_def_seq = NULL;
7601                       gsi_next (&si);
7602                     }
7603                   continue;
7604                 }
7605             }
7606
7607           /* -------- vectorize statement ------------ */
7608           if (dump_enabled_p ())
7609             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7610
7611           grouped_store = false;
7612           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7613           if (is_store)
7614             {
7615               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7616                 {
7617                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7618                      interleaving chain was completed - free all the stores in
7619                      the chain.  */
7620                   gsi_next (&si);
7621                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7622                 }
7623               else
7624                 {
7625                   /* Free the attached stmt_vec_info and remove the stmt.  */
7626                   gimple *store = gsi_stmt (si);
7627                   free_stmt_vec_info (store);
7628                   unlink_stmt_vdef (store);
7629                   gsi_remove (&si, true);
7630                   release_defs (store);
7631                 }
7632
7633               /* Stores can only appear at the end of pattern statements.  */
7634               gcc_assert (!transform_pattern_stmt);
7635               pattern_def_seq = NULL;
7636             }
7637           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7638             {
7639               pattern_def_seq = NULL;
7640               gsi_next (&si);
7641             }
7642         }                       /* stmts in BB */
7643     }                           /* BBs in loop */
7644
7645   slpeel_make_loop_iterate_ntimes (loop, niters_vector);
7646
7647   scale_profile_for_vect_loop (loop, vf);
7648
7649   /* The minimum number of iterations performed by the epilogue.  This
7650      is 1 when peeling for gaps because we always need a final scalar
7651      iteration.  */
7652   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7653   /* +1 to convert latch counts to loop iteration counts,
7654      -min_epilogue_iters to remove iterations that cannot be performed
7655        by the vector code.  */
7656   int bias = 1 - min_epilogue_iters;
7657   /* In these calculations the "- 1" converts loop iteration counts
7658      back to latch counts.  */
7659   if (loop->any_upper_bound)
7660     loop->nb_iterations_upper_bound
7661       = wi::udiv_floor (loop->nb_iterations_upper_bound + bias, vf) - 1;
7662   if (loop->any_likely_upper_bound)
7663     loop->nb_iterations_likely_upper_bound
7664       = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, vf) - 1;
7665   if (loop->any_estimate)
7666     loop->nb_iterations_estimate
7667       = wi::udiv_floor (loop->nb_iterations_estimate + bias, vf) - 1;
7668
7669   if (dump_enabled_p ())
7670     {
7671       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7672         {
7673           dump_printf_loc (MSG_NOTE, vect_location,
7674                            "LOOP VECTORIZED\n");
7675           if (loop->inner)
7676             dump_printf_loc (MSG_NOTE, vect_location,
7677                              "OUTER LOOP VECTORIZED\n");
7678           dump_printf (MSG_NOTE, "\n");
7679         }
7680       else
7681         dump_printf_loc (MSG_NOTE, vect_location,
7682                          "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
7683                          current_vector_size);
7684     }
7685
7686   /* Free SLP instances here because otherwise stmt reference counting
7687      won't work.  */
7688   slp_instance instance;
7689   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7690     vect_free_slp_instance (instance);
7691   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7692   /* Clear-up safelen field since its value is invalid after vectorization
7693      since vectorized loop can have loop-carried dependencies.  */
7694   loop->safelen = 0;
7695
7696   /* Don't vectorize epilogue for epilogue.  */
7697   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7698     epilogue = NULL;
7699
7700   if (epilogue)
7701     {
7702         unsigned int vector_sizes
7703           = targetm.vectorize.autovectorize_vector_sizes ();
7704         vector_sizes &= current_vector_size - 1;
7705
7706         if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7707           epilogue = NULL;
7708         else if (!vector_sizes)
7709           epilogue = NULL;
7710         else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7711                  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
7712           {
7713             int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
7714             int ratio = current_vector_size / smallest_vec_size;
7715             int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
7716               - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
7717             eiters = eiters % vf;
7718
7719             epilogue->nb_iterations_upper_bound = eiters - 1;
7720
7721             if (eiters < vf / ratio)
7722               epilogue = NULL;
7723             }
7724     }
7725
7726   if (epilogue)
7727     {
7728       epilogue->force_vectorize = loop->force_vectorize;
7729       epilogue->safelen = loop->safelen;
7730       epilogue->dont_vectorize = false;
7731
7732       /* We may need to if-convert epilogue to vectorize it.  */
7733       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7734         tree_if_conversion (epilogue);
7735     }
7736
7737   return epilogue;
7738 }
7739
7740 /* The code below is trying to perform simple optimization - revert
7741    if-conversion for masked stores, i.e. if the mask of a store is zero
7742    do not perform it and all stored value producers also if possible.
7743    For example,
7744      for (i=0; i<n; i++)
7745        if (c[i])
7746         {
7747           p1[i] += 1;
7748           p2[i] = p3[i] +2;
7749         }
7750    this transformation will produce the following semi-hammock:
7751
7752    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7753      {
7754        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7755        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7756        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7757        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7758        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7759        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7760      }
7761 */
7762
7763 void
7764 optimize_mask_stores (struct loop *loop)
7765 {
7766   basic_block *bbs = get_loop_body (loop);
7767   unsigned nbbs = loop->num_nodes;
7768   unsigned i;
7769   basic_block bb;
7770   struct loop *bb_loop;
7771   gimple_stmt_iterator gsi;
7772   gimple *stmt;
7773   auto_vec<gimple *> worklist;
7774
7775   vect_location = find_loop_location (loop);
7776   /* Pick up all masked stores in loop if any.  */
7777   for (i = 0; i < nbbs; i++)
7778     {
7779       bb = bbs[i];
7780       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7781            gsi_next (&gsi))
7782         {
7783           stmt = gsi_stmt (gsi);
7784           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7785             worklist.safe_push (stmt);
7786         }
7787     }
7788
7789   free (bbs);
7790   if (worklist.is_empty ())
7791     return;
7792
7793   /* Loop has masked stores.  */
7794   while (!worklist.is_empty ())
7795     {
7796       gimple *last, *last_store;
7797       edge e, efalse;
7798       tree mask;
7799       basic_block store_bb, join_bb;
7800       gimple_stmt_iterator gsi_to;
7801       tree vdef, new_vdef;
7802       gphi *phi;
7803       tree vectype;
7804       tree zero;
7805
7806       last = worklist.pop ();
7807       mask = gimple_call_arg (last, 2);
7808       bb = gimple_bb (last);
7809       /* Create then_bb and if-then structure in CFG, then_bb belongs to
7810          the same loop as if_bb.  It could be different to LOOP when two
7811          level loop-nest is vectorized and mask_store belongs to the inner
7812          one.  */
7813       e = split_block (bb, last);
7814       bb_loop = bb->loop_father;
7815       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7816       join_bb = e->dest;
7817       store_bb = create_empty_bb (bb);
7818       add_bb_to_loop (store_bb, bb_loop);
7819       e->flags = EDGE_TRUE_VALUE;
7820       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7821       /* Put STORE_BB to likely part.  */
7822       efalse->probability = profile_probability::unlikely ();
7823       store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
7824       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7825       if (dom_info_available_p (CDI_DOMINATORS))
7826         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7827       if (dump_enabled_p ())
7828         dump_printf_loc (MSG_NOTE, vect_location,
7829                          "Create new block %d to sink mask stores.",
7830                          store_bb->index);
7831       /* Create vector comparison with boolean result.  */
7832       vectype = TREE_TYPE (mask);
7833       zero = build_zero_cst (vectype);
7834       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7835       gsi = gsi_last_bb (bb);
7836       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7837       /* Create new PHI node for vdef of the last masked store:
7838          .MEM_2 = VDEF <.MEM_1>
7839          will be converted to
7840          .MEM.3 = VDEF <.MEM_1>
7841          and new PHI node will be created in join bb
7842          .MEM_2 = PHI <.MEM_1, .MEM_3>
7843       */
7844       vdef = gimple_vdef (last);
7845       new_vdef = make_ssa_name (gimple_vop (cfun), last);
7846       gimple_set_vdef (last, new_vdef);
7847       phi = create_phi_node (vdef, join_bb);
7848       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7849
7850       /* Put all masked stores with the same mask to STORE_BB if possible.  */
7851       while (true)
7852         {
7853           gimple_stmt_iterator gsi_from;
7854           gimple *stmt1 = NULL;
7855
7856           /* Move masked store to STORE_BB.  */
7857           last_store = last;
7858           gsi = gsi_for_stmt (last);
7859           gsi_from = gsi;
7860           /* Shift GSI to the previous stmt for further traversal.  */
7861           gsi_prev (&gsi);
7862           gsi_to = gsi_start_bb (store_bb);
7863           gsi_move_before (&gsi_from, &gsi_to);
7864           /* Setup GSI_TO to the non-empty block start.  */
7865           gsi_to = gsi_start_bb (store_bb);
7866           if (dump_enabled_p ())
7867             {
7868               dump_printf_loc (MSG_NOTE, vect_location,
7869                                "Move stmt to created bb\n");
7870               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7871             }
7872           /* Move all stored value producers if possible.  */
7873           while (!gsi_end_p (gsi))
7874             {
7875               tree lhs;
7876               imm_use_iterator imm_iter;
7877               use_operand_p use_p;
7878               bool res;
7879
7880               /* Skip debug statements.  */
7881               if (is_gimple_debug (gsi_stmt (gsi)))
7882                 {
7883                   gsi_prev (&gsi);
7884                   continue;
7885                 }
7886               stmt1 = gsi_stmt (gsi);
7887               /* Do not consider statements writing to memory or having
7888                  volatile operand.  */
7889               if (gimple_vdef (stmt1)
7890                   || gimple_has_volatile_ops (stmt1))
7891                 break;
7892               gsi_from = gsi;
7893               gsi_prev (&gsi);
7894               lhs = gimple_get_lhs (stmt1);
7895               if (!lhs)
7896                 break;
7897
7898               /* LHS of vectorized stmt must be SSA_NAME.  */
7899               if (TREE_CODE (lhs) != SSA_NAME)
7900                 break;
7901
7902               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7903                 {
7904                   /* Remove dead scalar statement.  */
7905                   if (has_zero_uses (lhs))
7906                     {
7907                       gsi_remove (&gsi_from, true);
7908                       continue;
7909                     }
7910                 }
7911
7912               /* Check that LHS does not have uses outside of STORE_BB.  */
7913               res = true;
7914               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7915                 {
7916                   gimple *use_stmt;
7917                   use_stmt = USE_STMT (use_p);
7918                   if (is_gimple_debug (use_stmt))
7919                     continue;
7920                   if (gimple_bb (use_stmt) != store_bb)
7921                     {
7922                       res = false;
7923                       break;
7924                     }
7925                 }
7926               if (!res)
7927                 break;
7928
7929               if (gimple_vuse (stmt1)
7930                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
7931                 break;
7932
7933               /* Can move STMT1 to STORE_BB.  */
7934               if (dump_enabled_p ())
7935                 {
7936                   dump_printf_loc (MSG_NOTE, vect_location,
7937                                    "Move stmt to created bb\n");
7938                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7939                 }
7940               gsi_move_before (&gsi_from, &gsi_to);
7941               /* Shift GSI_TO for further insertion.  */
7942               gsi_prev (&gsi_to);
7943             }
7944           /* Put other masked stores with the same mask to STORE_BB.  */
7945           if (worklist.is_empty ()
7946               || gimple_call_arg (worklist.last (), 2) != mask
7947               || worklist.last () != stmt1)
7948             break;
7949           last = worklist.pop ();
7950         }
7951       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
7952     }
7953 }