gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53
  54 /* Loop Vectorization Pass.
  55
  56    This pass tries to vectorize loops.
  57
  58    For example, the vectorizer transforms the following simple loop:
  59
  60         short a[N]; short b[N]; short c[N]; int i;
  61
  62         for (i=0; i<N; i++){
  63           a[i] = b[i] + c[i];
  64         }
  65
  66    as if it was manually vectorized by rewriting the source code into:
  67
  68         typedef int __attribute__((mode(V8HI))) v8hi;
  69         short a[N];  short b[N]; short c[N];   int i;
  70         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  71         v8hi va, vb, vc;
  72
  73         for (i=0; i<N/8; i++){
  74           vb = pb[i];
  75           vc = pc[i];
  76           va = vb + vc;
  77           pa[i] = va;
  78         }
  79
  80         The main entry to this pass is vectorize_loops(), in which
  81    the vectorizer applies a set of analyses on a given set of loops,
  82    followed by the actual vectorization transformation for the loops that
  83    had successfully passed the analysis phase.
  84         Throughout this pass we make a distinction between two types of
  85    data: scalars (which are represented by SSA_NAMES), and memory references
  86    ("data-refs").  These two types of data require different handling both
  87    during analysis and transformation. The types of data-refs that the
  88    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  89    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  90    accesses are required to have a simple (consecutive) access pattern.
  91
  92    Analysis phase:
  93    ===============
  94         The driver for the analysis phase is vect_analyze_loop().
  95    It applies a set of analyses, some of which rely on the scalar evolution
  96    analyzer (scev) developed by Sebastian Pop.
  97
  98         During the analysis phase the vectorizer records some information
  99    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 100    loop, as well as general information about the loop as a whole, which is
 101    recorded in a "loop_vec_info" struct attached to each loop.
 102
 103    Transformation phase:
 104    =====================
 105         The loop transformation phase scans all the stmts in the loop, and
 106    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 107    the loop that needs to be vectorized.  It inserts the vector code sequence
 108    just before the scalar stmt S, and records a pointer to the vector code
 109    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 110    attached to S).  This pointer will be used for the vectorization of following
 111    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 112    otherwise, we rely on dead code elimination for removing it.
 113
 114         For example, say stmt S1 was vectorized into stmt VS1:
 115
 116    VS1: vb = px[i];
 117    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 118    S2:  a = b;
 119
 120    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 121    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 122    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 123    resulting sequence would be:
 124
 125    VS1: vb = px[i];
 126    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 127    VS2: va = vb;
 128    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 129
 130         Operands that are not SSA_NAMEs, are data-refs that appear in
 131    load/store operations (like 'x[i]' in S1), and are handled differently.
 132
 133    Target modeling:
 134    =================
 135         Currently the only target specific information that is used is the
 136    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 137    Targets that can support different sizes of vectors, for now will need
 138    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 139    flexibility will be added in the future.
 140
 141         Since we only vectorize operations which vector form can be
 142    expressed using existing tree codes, to verify that an operation is
 143    supported, the vectorizer checks the relevant optab at the relevant
 144    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 145    the value found is CODE_FOR_nothing, then there's no target support, and
 146    we can't vectorize the stmt.
 147
 148    For additional information on this project see:
 149    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 150 */
 151
 152 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 153
 154 /* Function vect_determine_vectorization_factor
 155
 156    Determine the vectorization factor (VF).  VF is the number of data elements
 157    that are operated upon in parallel in a single iteration of the vectorized
 158    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 159    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 160    elements can fit in a single vector register.
 161
 162    We currently support vectorization of loops in which all types operated upon
 163    are of the same size.  Therefore this function currently sets VF according to
 164    the size of the types operated upon, and fails if there are multiple sizes
 165    in the loop.
 166
 167    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 168    original loop:
 169         for (i=0; i<N; i++){
 170           a[i] = b[i] + c[i];
 171         }
 172
 173    vectorized loop:
 174         for (i=0; i<N; i+=VF){
 175           a[i:VF] = b[i:VF] + c[i:VF];
 176         }
 177 */
 178
 179 static bool
 180 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 181 {
 182   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 183   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 184   unsigned nbbs = loop->num_nodes;
 185   unsigned int vectorization_factor = 0;
 186   tree scalar_type = NULL_TREE;
 187   gphi *phi;
 188   tree vectype;
 189   unsigned int nunits;
 190   stmt_vec_info stmt_info;
 191   unsigned i;
 192   HOST_WIDE_INT dummy;
 193   gimple *stmt, *pattern_stmt = NULL;
 194   gimple_seq pattern_def_seq = NULL;
 195   gimple_stmt_iterator pattern_def_si = gsi_none ();
 196   bool analyze_pattern_stmt = false;
 197   bool bool_result;
 198   auto_vec<stmt_vec_info> mask_producers;
 199
 200   if (dump_enabled_p ())
 201     dump_printf_loc (MSG_NOTE, vect_location,
 202                      "=== vect_determine_vectorization_factor ===\n");
 203
 204   for (i = 0; i < nbbs; i++)
 205     {
 206       basic_block bb = bbs[i];
 207
 208       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 209            gsi_next (&si))
 210         {
 211           phi = si.phi ();
 212           stmt_info = vinfo_for_stmt (phi);
 213           if (dump_enabled_p ())
 214             {
 215               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 216               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 217             }
 218
 219           gcc_assert (stmt_info);
 220
 221           if (STMT_VINFO_RELEVANT_P (stmt_info)
 222               || STMT_VINFO_LIVE_P (stmt_info))
 223             {
 224               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 225               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 226
 227               if (dump_enabled_p ())
 228                 {
 229                   dump_printf_loc (MSG_NOTE, vect_location,
 230                                    "get vectype for scalar type:  ");
 231                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 232                   dump_printf (MSG_NOTE, "\n");
 233                 }
 234
 235               vectype = get_vectype_for_scalar_type (scalar_type);
 236               if (!vectype)
 237                 {
 238                   if (dump_enabled_p ())
 239                     {
 240                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 241                                        "not vectorized: unsupported "
 242                                        "data-type ");
 243                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 244                                          scalar_type);
 245                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 246                     }
 247                   return false;
 248                 }
 249               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 250
 251               if (dump_enabled_p ())
 252                 {
 253                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 254                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 255                   dump_printf (MSG_NOTE, "\n");
 256                 }
 257
 258               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 259               if (dump_enabled_p ())
 260                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 261                                  nunits);
 262
 263               if (!vectorization_factor
 264                   || (nunits > vectorization_factor))
 265                 vectorization_factor = nunits;
 266             }
 267         }
 268
 269       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 270            !gsi_end_p (si) || analyze_pattern_stmt;)
 271         {
 272           tree vf_vectype;
 273
 274           if (analyze_pattern_stmt)
 275             stmt = pattern_stmt;
 276           else
 277             stmt = gsi_stmt (si);
 278
 279           stmt_info = vinfo_for_stmt (stmt);
 280
 281           if (dump_enabled_p ())
 282             {
 283               dump_printf_loc (MSG_NOTE, vect_location,
 284                                "==> examining statement: ");
 285               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 286             }
 287
 288           gcc_assert (stmt_info);
 289
 290           /* Skip stmts which do not need to be vectorized.  */
 291           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 292                && !STMT_VINFO_LIVE_P (stmt_info))
 293               || gimple_clobber_p (stmt))
 294             {
 295               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 296                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 297                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 298                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 299                 {
 300                   stmt = pattern_stmt;
 301                   stmt_info = vinfo_for_stmt (pattern_stmt);
 302                   if (dump_enabled_p ())
 303                     {
 304                       dump_printf_loc (MSG_NOTE, vect_location,
 305                                        "==> examining pattern statement: ");
 306                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 307                     }
 308                 }
 309               else
 310                 {
 311                   if (dump_enabled_p ())
 312                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 313                   gsi_next (&si);
 314                   continue;
 315                 }
 316             }
 317           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 318                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 319                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 320                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 321             analyze_pattern_stmt = true;
 322
 323           /* If a pattern statement has def stmts, analyze them too.  */
 324           if (is_pattern_stmt_p (stmt_info))
 325             {
 326               if (pattern_def_seq == NULL)
 327                 {
 328                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 329                   pattern_def_si = gsi_start (pattern_def_seq);
 330                 }
 331               else if (!gsi_end_p (pattern_def_si))
 332                 gsi_next (&pattern_def_si);
 333               if (pattern_def_seq != NULL)
 334                 {
 335                   gimple *pattern_def_stmt = NULL;
 336                   stmt_vec_info pattern_def_stmt_info = NULL;
 337
 338                   while (!gsi_end_p (pattern_def_si))
 339                     {
 340                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 341                       pattern_def_stmt_info
 342                         = vinfo_for_stmt (pattern_def_stmt);
 343                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 344                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 345                         break;
 346                       gsi_next (&pattern_def_si);
 347                     }
 348
 349                   if (!gsi_end_p (pattern_def_si))
 350                     {
 351                       if (dump_enabled_p ())
 352                         {
 353                           dump_printf_loc (MSG_NOTE, vect_location,
 354                                            "==> examining pattern def stmt: ");
 355                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 356                                             pattern_def_stmt, 0);
 357                         }
 358
 359                       stmt = pattern_def_stmt;
 360                       stmt_info = pattern_def_stmt_info;
 361                     }
 362                   else
 363                     {
 364                       pattern_def_si = gsi_none ();
 365                       analyze_pattern_stmt = false;
 366                     }
 367                 }
 368               else
 369                 analyze_pattern_stmt = false;
 370             }
 371
 372           if (gimple_get_lhs (stmt) == NULL_TREE
 373               /* MASK_STORE has no lhs, but is ok.  */
 374               && (!is_gimple_call (stmt)
 375                   || !gimple_call_internal_p (stmt)
 376                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 377             {
 378               if (is_gimple_call (stmt))
 379                 {
 380                   /* Ignore calls with no lhs.  These must be calls to
 381                      #pragma omp simd functions, and what vectorization factor
 382                      it really needs can't be determined until
 383                      vectorizable_simd_clone_call.  */
 384                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 385                     {
 386                       pattern_def_seq = NULL;
 387                       gsi_next (&si);
 388                     }
 389                   continue;
 390                 }
 391               if (dump_enabled_p ())
 392                 {
 393                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 394                                    "not vectorized: irregular stmt.");
 395                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 396                                     0);
 397                 }
 398               return false;
 399             }
 400
 401           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 402             {
 403               if (dump_enabled_p ())
 404                 {
 405                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 406                                    "not vectorized: vector stmt in loop:");
 407                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 408                 }
 409               return false;
 410             }
 411
 412           bool_result = false;
 413
 414           if (STMT_VINFO_VECTYPE (stmt_info))
 415             {
 416               /* The only case when a vectype had been already set is for stmts
 417                  that contain a dataref, or for "pattern-stmts" (stmts
 418                  generated by the vectorizer to represent/replace a certain
 419                  idiom).  */
 420               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 421                           || is_pattern_stmt_p (stmt_info)
 422                           || !gsi_end_p (pattern_def_si));
 423               vectype = STMT_VINFO_VECTYPE (stmt_info);
 424             }
 425           else
 426             {
 427               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 428               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 429                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 430               else
 431                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 432
 433               /* Bool ops don't participate in vectorization factor
 434                  computation.  For comparison use compared types to
 435                  compute a factor.  */
 436               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
 437                   && is_gimple_assign (stmt)
 438                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 439                 {
 440                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 441                       || STMT_VINFO_LIVE_P (stmt_info))
 442                     mask_producers.safe_push (stmt_info);
 443                   bool_result = true;
 444
 445                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 446                       == tcc_comparison
 447                       && !VECT_SCALAR_BOOLEAN_TYPE_P
 448                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 449                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 450                   else
 451                     {
 452                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 453                         {
 454                           pattern_def_seq = NULL;
 455                           gsi_next (&si);
 456                         }
 457                       continue;
 458                     }
 459                 }
 460
 461               if (dump_enabled_p ())
 462                 {
 463                   dump_printf_loc (MSG_NOTE, vect_location,
 464                                    "get vectype for scalar type:  ");
 465                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 466                   dump_printf (MSG_NOTE, "\n");
 467                 }
 468               vectype = get_vectype_for_scalar_type (scalar_type);
 469               if (!vectype)
 470                 {
 471                   if (dump_enabled_p ())
 472                     {
 473                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 474                                        "not vectorized: unsupported "
 475                                        "data-type ");
 476                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 477                                          scalar_type);
 478                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 479                     }
 480                   return false;
 481                 }
 482
 483               if (!bool_result)
 484                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 485
 486               if (dump_enabled_p ())
 487                 {
 488                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 489                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 490                   dump_printf (MSG_NOTE, "\n");
 491                 }
 492             }
 493
 494           /* Don't try to compute VF out scalar types if we stmt
 495              produces boolean vector.  Use result vectype instead.  */
 496           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 497             vf_vectype = vectype;
 498           else
 499             {
 500               /* The vectorization factor is according to the smallest
 501                  scalar type (or the largest vector size, but we only
 502                  support one vector size per loop).  */
 503               if (!bool_result)
 504                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 505                                                              &dummy);
 506               if (dump_enabled_p ())
 507                 {
 508                   dump_printf_loc (MSG_NOTE, vect_location,
 509                                    "get vectype for scalar type:  ");
 510                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 511                   dump_printf (MSG_NOTE, "\n");
 512                 }
 513               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 514             }
 515           if (!vf_vectype)
 516             {
 517               if (dump_enabled_p ())
 518                 {
 519                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 520                                    "not vectorized: unsupported data-type ");
 521                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 522                                      scalar_type);
 523                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 524                 }
 525               return false;
 526             }
 527
 528           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 529                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 530             {
 531               if (dump_enabled_p ())
 532                 {
 533                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 534                                    "not vectorized: different sized vector "
 535                                    "types in statement, ");
 536                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 537                                      vectype);
 538                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 539                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 540                                      vf_vectype);
 541                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 542                 }
 543               return false;
 544             }
 545
 546           if (dump_enabled_p ())
 547             {
 548               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 549               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 550               dump_printf (MSG_NOTE, "\n");
 551             }
 552
 553           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 554           if (dump_enabled_p ())
 555             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 556           if (!vectorization_factor
 557               || (nunits > vectorization_factor))
 558             vectorization_factor = nunits;
 559
 560           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 561             {
 562               pattern_def_seq = NULL;
 563               gsi_next (&si);
 564             }
 565         }
 566     }
 567
 568   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 569   if (dump_enabled_p ())
 570     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 571                      vectorization_factor);
 572   if (vectorization_factor <= 1)
 573     {
 574       if (dump_enabled_p ())
 575         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 576                          "not vectorized: unsupported data-type\n");
 577       return false;
 578     }
 579   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 580
 581   for (i = 0; i < mask_producers.length (); i++)
 582     {
 583       tree mask_type = NULL;
 584
 585       stmt = STMT_VINFO_STMT (mask_producers[i]);
 586
 587       if (is_gimple_assign (stmt)
 588           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 589           && !VECT_SCALAR_BOOLEAN_TYPE_P
 590                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 591         {
 592           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 593           mask_type = get_mask_type_for_scalar_type (scalar_type);
 594
 595           if (!mask_type)
 596             {
 597               if (dump_enabled_p ())
 598                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 599                                  "not vectorized: unsupported mask\n");
 600               return false;
 601             }
 602         }
 603       else
 604         {
 605           tree rhs;
 606           ssa_op_iter iter;
 607           gimple *def_stmt;
 608           enum vect_def_type dt;
 609
 610           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 611             {
 612               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 613                                        &def_stmt, &dt, &vectype))
 614                 {
 615                   if (dump_enabled_p ())
 616                     {
 617                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 618                                        "not vectorized: can't compute mask type "
 619                                        "for statement, ");
 620                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 621                                         0);
 622                     }
 623                   return false;
 624                 }
 625
 626               /* No vectype probably means external definition.
 627                  Allow it in case there is another operand which
 628                  allows to determine mask type.  */
 629               if (!vectype)
 630                 continue;
 631
 632               if (!mask_type)
 633                 mask_type = vectype;
 634               else if (TYPE_VECTOR_SUBPARTS (mask_type)
 635                        != TYPE_VECTOR_SUBPARTS (vectype))
 636                 {
 637                   if (dump_enabled_p ())
 638                     {
 639                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 640                                        "not vectorized: different sized masks "
 641                                        "types in statement, ");
 642                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 643                                          mask_type);
 644                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 645                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 646                                          vectype);
 647                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 648                     }
 649                   return false;
 650                 }
 651               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 652                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 653                 {
 654                   if (dump_enabled_p ())
 655                     {
 656                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 657                                        "not vectorized: mixed mask and "
 658                                        "nonmask vector types in statement, ");
 659                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 660                                          mask_type);
 661                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 662                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 663                                          vectype);
 664                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 665                     }
 666                   return false;
 667                 }
 668             }
 669
 670           /* We may compare boolean value loaded as vector of integers.
 671              Fix mask_type in such case.  */
 672           if (mask_type
 673               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 674               && gimple_code (stmt) == GIMPLE_ASSIGN
 675               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 676             mask_type = build_same_sized_truth_vector_type (mask_type);
 677         }
 678
 679       /* No mask_type should mean loop invariant predicate.
 680          This is probably a subject for optimization in
 681          if-conversion.  */
 682       if (!mask_type)
 683         {
 684           if (dump_enabled_p ())
 685             {
 686               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 687                                "not vectorized: can't compute mask type "
 688                                "for statement, ");
 689               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 690                                 0);
 691             }
 692           return false;
 693         }
 694
 695       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 696     }
 697
 698   return true;
 699 }
 700
 701
 702 /* Function vect_is_simple_iv_evolution.
 703
 704    FORNOW: A simple evolution of an induction variables in the loop is
 705    considered a polynomial evolution.  */
 706
 707 static bool
 708 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 709                              tree * step)
 710 {
 711   tree init_expr;
 712   tree step_expr;
 713   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 714   basic_block bb;
 715
 716   /* When there is no evolution in this loop, the evolution function
 717      is not "simple".  */
 718   if (evolution_part == NULL_TREE)
 719     return false;
 720
 721   /* When the evolution is a polynomial of degree >= 2
 722      the evolution function is not "simple".  */
 723   if (tree_is_chrec (evolution_part))
 724     return false;
 725
 726   step_expr = evolution_part;
 727   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 728
 729   if (dump_enabled_p ())
 730     {
 731       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 732       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 733       dump_printf (MSG_NOTE, ",  init: ");
 734       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 735       dump_printf (MSG_NOTE, "\n");
 736     }
 737
 738   *init = init_expr;
 739   *step = step_expr;
 740
 741   if (TREE_CODE (step_expr) != INTEGER_CST
 742       && (TREE_CODE (step_expr) != SSA_NAME
 743           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 744               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 745           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 746               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 747                   || !flag_associative_math)))
 748       && (TREE_CODE (step_expr) != REAL_CST
 749           || !flag_associative_math))
 750     {
 751       if (dump_enabled_p ())
 752         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 753                          "step unknown.\n");
 754       return false;
 755     }
 756
 757   return true;
 758 }
 759
 760 /* Function vect_analyze_scalar_cycles_1.
 761
 762    Examine the cross iteration def-use cycles of scalar variables
 763    in LOOP.  LOOP_VINFO represents the loop that is now being
 764    considered for vectorization (can be LOOP, or an outer-loop
 765    enclosing LOOP).  */
 766
 767 static void
 768 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 769 {
 770   basic_block bb = loop->header;
 771   tree init, step;
 772   auto_vec<gimple *, 64> worklist;
 773   gphi_iterator gsi;
 774   bool double_reduc;
 775
 776   if (dump_enabled_p ())
 777     dump_printf_loc (MSG_NOTE, vect_location,
 778                      "=== vect_analyze_scalar_cycles ===\n");
 779
 780   /* First - identify all inductions.  Reduction detection assumes that all the
 781      inductions have been identified, therefore, this order must not be
 782      changed.  */
 783   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 784     {
 785       gphi *phi = gsi.phi ();
 786       tree access_fn = NULL;
 787       tree def = PHI_RESULT (phi);
 788       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 789
 790       if (dump_enabled_p ())
 791         {
 792           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 793           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 794         }
 795
 796       /* Skip virtual phi's.  The data dependences that are associated with
 797          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 798       if (virtual_operand_p (def))
 799         continue;
 800
 801       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 802
 803       /* Analyze the evolution function.  */
 804       access_fn = analyze_scalar_evolution (loop, def);
 805       if (access_fn)
 806         {
 807           STRIP_NOPS (access_fn);
 808           if (dump_enabled_p ())
 809             {
 810               dump_printf_loc (MSG_NOTE, vect_location,
 811                                "Access function of PHI: ");
 812               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 813               dump_printf (MSG_NOTE, "\n");
 814             }
 815           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 816             = initial_condition_in_loop_num (access_fn, loop->num);
 817           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 818             = evolution_part_in_loop_num (access_fn, loop->num);
 819         }
 820
 821       if (!access_fn
 822           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 823           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 824               && TREE_CODE (step) != INTEGER_CST))
 825         {
 826           worklist.safe_push (phi);
 827           continue;
 828         }
 829
 830       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 831                   != NULL_TREE);
 832       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 833
 834       if (dump_enabled_p ())
 835         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 836       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 837     }
 838
 839
 840   /* Second - identify all reductions and nested cycles.  */
 841   while (worklist.length () > 0)
 842     {
 843       gimple *phi = worklist.pop ();
 844       tree def = PHI_RESULT (phi);
 845       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 846       gimple *reduc_stmt;
 847
 848       if (dump_enabled_p ())
 849         {
 850           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 851           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 852         }
 853
 854       gcc_assert (!virtual_operand_p (def)
 855                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 856
 857       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 858                                                 &double_reduc, false);
 859       if (reduc_stmt)
 860         {
 861           if (double_reduc)
 862             {
 863               if (dump_enabled_p ())
 864                 dump_printf_loc (MSG_NOTE, vect_location,
 865                                  "Detected double reduction.\n");
 866
 867               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 868               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 869                                                     vect_double_reduction_def;
 870             }
 871           else
 872             {
 873               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 874                 {
 875                   if (dump_enabled_p ())
 876                     dump_printf_loc (MSG_NOTE, vect_location,
 877                                      "Detected vectorizable nested cycle.\n");
 878
 879                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 880                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 881                                                              vect_nested_cycle;
 882                 }
 883               else
 884                 {
 885                   if (dump_enabled_p ())
 886                     dump_printf_loc (MSG_NOTE, vect_location,
 887                                      "Detected reduction.\n");
 888
 889                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 890                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 891                                                            vect_reduction_def;
 892                   /* Store the reduction cycles for possible vectorization in
 893                      loop-aware SLP if it was not detected as reduction
 894                      chain.  */
 895                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 896                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 897                 }
 898             }
 899         }
 900       else
 901         if (dump_enabled_p ())
 902           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 903                            "Unknown def-use cycle pattern.\n");
 904     }
 905 }
 906
 907
 908 /* Function vect_analyze_scalar_cycles.
 909
 910    Examine the cross iteration def-use cycles of scalar variables, by
 911    analyzing the loop-header PHIs of scalar variables.  Classify each
 912    cycle as one of the following: invariant, induction, reduction, unknown.
 913    We do that for the loop represented by LOOP_VINFO, and also to its
 914    inner-loop, if exists.
 915    Examples for scalar cycles:
 916
 917    Example1: reduction:
 918
 919               loop1:
 920               for (i=0; i<N; i++)
 921                  sum += a[i];
 922
 923    Example2: induction:
 924
 925               loop2:
 926               for (i=0; i<N; i++)
 927                  a[i] = i;  */
 928
 929 static void
 930 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 931 {
 932   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 933
 934   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 935
 936   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 937      Reductions in such inner-loop therefore have different properties than
 938      the reductions in the nest that gets vectorized:
 939      1. When vectorized, they are executed in the same order as in the original
 940         scalar loop, so we can't change the order of computation when
 941         vectorizing them.
 942      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 943         current checks are too strict.  */
 944
 945   if (loop->inner)
 946     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 947 }
 948
 949 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 950
 951 static void
 952 vect_fixup_reduc_chain (gimple *stmt)
 953 {
 954   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 955   gimple *stmtp;
 956   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 957               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 958   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 959   do
 960     {
 961       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 962       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 963       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 964       if (stmt)
 965         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 966           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 967     }
 968   while (stmt);
 969   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 970 }
 971
 972 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 973
 974 static void
 975 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 976 {
 977   gimple *first;
 978   unsigned i;
 979
 980   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 981     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 982       {
 983         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 984         while (next)
 985           {
 986             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 987               break;
 988             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 989           }
 990         /* If not all stmt in the chain are patterns try to handle
 991            the chain without patterns.  */
 992         if (! next)
 993           {
 994             vect_fixup_reduc_chain (first);
 995             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 996               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 997           }
 998       }
 999 }
1000
1001 /* Function vect_get_loop_niters.
1002
1003    Determine how many iterations the loop is executed and place it
1004    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1005    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1006    niter information holds in ASSUMPTIONS.
1007
1008    Return the loop exit condition.  */
1009
1010
1011 static gcond *
1012 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1013                       tree *number_of_iterations, tree *number_of_iterationsm1)
1014 {
1015   edge exit = single_exit (loop);
1016   struct tree_niter_desc niter_desc;
1017   tree niter_assumptions, niter, may_be_zero;
1018   gcond *cond = get_loop_exit_condition (loop);
1019
1020   *assumptions = boolean_true_node;
1021   *number_of_iterationsm1 = chrec_dont_know;
1022   *number_of_iterations = chrec_dont_know;
1023   if (dump_enabled_p ())
1024     dump_printf_loc (MSG_NOTE, vect_location,
1025                      "=== get_loop_niters ===\n");
1026
1027   if (!exit)
1028     return cond;
1029
1030   niter = chrec_dont_know;
1031   may_be_zero = NULL_TREE;
1032   niter_assumptions = boolean_true_node;
1033   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1034       || chrec_contains_undetermined (niter_desc.niter))
1035     return cond;
1036
1037   niter_assumptions = niter_desc.assumptions;
1038   may_be_zero = niter_desc.may_be_zero;
1039   niter = niter_desc.niter;
1040
1041   if (may_be_zero && integer_zerop (may_be_zero))
1042     may_be_zero = NULL_TREE;
1043
1044   if (may_be_zero)
1045     {
1046       if (COMPARISON_CLASS_P (may_be_zero))
1047         {
1048           /* Try to combine may_be_zero with assumptions, this can simplify
1049              computation of niter expression.  */
1050           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1051             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1052                                              niter_assumptions,
1053                                              fold_build1 (TRUTH_NOT_EXPR,
1054                                                           boolean_type_node,
1055                                                           may_be_zero));
1056           else
1057             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1058                                  build_int_cst (TREE_TYPE (niter), 0), niter);
1059
1060           may_be_zero = NULL_TREE;
1061         }
1062       else if (integer_nonzerop (may_be_zero))
1063         {
1064           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1065           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1066           return cond;
1067         }
1068       else
1069         return cond;
1070     }
1071
1072   *assumptions = niter_assumptions;
1073   *number_of_iterationsm1 = niter;
1074
1075   /* We want the number of loop header executions which is the number
1076      of latch executions plus one.
1077      ???  For UINT_MAX latch executions this number overflows to zero
1078      for loops like do { n++; } while (n != 0);  */
1079   if (niter && !chrec_contains_undetermined (niter))
1080     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1081                           build_int_cst (TREE_TYPE (niter), 1));
1082   *number_of_iterations = niter;
1083
1084   return cond;
1085 }
1086
1087 /* Function bb_in_loop_p
1088
1089    Used as predicate for dfs order traversal of the loop bbs.  */
1090
1091 static bool
1092 bb_in_loop_p (const_basic_block bb, const void *data)
1093 {
1094   const struct loop *const loop = (const struct loop *)data;
1095   if (flow_bb_inside_loop_p (loop, bb))
1096     return true;
1097   return false;
1098 }
1099
1100
1101 /* Function new_loop_vec_info.
1102
1103    Create and initialize a new loop_vec_info struct for LOOP, as well as
1104    stmt_vec_info structs for all the stmts in LOOP.  */
1105
1106 static loop_vec_info
1107 new_loop_vec_info (struct loop *loop)
1108 {
1109   loop_vec_info res;
1110   basic_block *bbs;
1111   gimple_stmt_iterator si;
1112   unsigned int i, nbbs;
1113
1114   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1115   res->kind = vec_info::loop;
1116   LOOP_VINFO_LOOP (res) = loop;
1117
1118   bbs = get_loop_body (loop);
1119
1120   /* Create/Update stmt_info for all stmts in the loop.  */
1121   for (i = 0; i < loop->num_nodes; i++)
1122     {
1123       basic_block bb = bbs[i];
1124
1125       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1126         {
1127           gimple *phi = gsi_stmt (si);
1128           gimple_set_uid (phi, 0);
1129           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1130         }
1131
1132       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1133         {
1134           gimple *stmt = gsi_stmt (si);
1135           gimple_set_uid (stmt, 0);
1136           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1137         }
1138     }
1139
1140   /* CHECKME: We want to visit all BBs before their successors (except for
1141      latch blocks, for which this assertion wouldn't hold).  In the simple
1142      case of the loop forms we allow, a dfs order of the BBs would the same
1143      as reversed postorder traversal, so we are safe.  */
1144
1145    free (bbs);
1146    bbs = XCNEWVEC (basic_block, loop->num_nodes);
1147    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1148                               bbs, loop->num_nodes, loop);
1149    gcc_assert (nbbs == loop->num_nodes);
1150
1151   LOOP_VINFO_BBS (res) = bbs;
1152   LOOP_VINFO_NITERSM1 (res) = NULL;
1153   LOOP_VINFO_NITERS (res) = NULL;
1154   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1155   LOOP_VINFO_NITERS_ASSUMPTIONS (res) = NULL;
1156   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1157   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1158   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1159   LOOP_VINFO_VECT_FACTOR (res) = 0;
1160   LOOP_VINFO_LOOP_NEST (res) = vNULL;
1161   LOOP_VINFO_DATAREFS (res) = vNULL;
1162   LOOP_VINFO_DDRS (res) = vNULL;
1163   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1164   LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
1165   LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
1166   LOOP_VINFO_GROUPED_STORES (res) = vNULL;
1167   LOOP_VINFO_REDUCTIONS (res) = vNULL;
1168   LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
1169   LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
1170   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1171   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1172   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1173   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1174   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1175   LOOP_VINFO_ORIG_LOOP_INFO (res) = NULL;
1176
1177   return res;
1178 }
1179
1180
1181 /* Function destroy_loop_vec_info.
1182
1183    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1184    stmts in the loop.  */
1185
1186 void
1187 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1188 {
1189   struct loop *loop;
1190   basic_block *bbs;
1191   int nbbs;
1192   gimple_stmt_iterator si;
1193   int j;
1194   vec<slp_instance> slp_instances;
1195   slp_instance instance;
1196   bool swapped;
1197
1198   if (!loop_vinfo)
1199     return;
1200
1201   loop = LOOP_VINFO_LOOP (loop_vinfo);
1202
1203   bbs = LOOP_VINFO_BBS (loop_vinfo);
1204   nbbs = clean_stmts ? loop->num_nodes : 0;
1205   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1206
1207   for (j = 0; j < nbbs; j++)
1208     {
1209       basic_block bb = bbs[j];
1210       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1211         free_stmt_vec_info (gsi_stmt (si));
1212
1213       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1214         {
1215           gimple *stmt = gsi_stmt (si);
1216
1217           /* We may have broken canonical form by moving a constant
1218              into RHS1 of a commutative op.  Fix such occurrences.  */
1219           if (swapped && is_gimple_assign (stmt))
1220             {
1221               enum tree_code code = gimple_assign_rhs_code (stmt);
1222
1223               if ((code == PLUS_EXPR
1224                    || code == POINTER_PLUS_EXPR
1225                    || code == MULT_EXPR)
1226                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1227                 swap_ssa_operands (stmt,
1228                                    gimple_assign_rhs1_ptr (stmt),
1229                                    gimple_assign_rhs2_ptr (stmt));
1230               else if (code == COND_EXPR
1231                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1232                 {
1233                   tree cond_expr = gimple_assign_rhs1 (stmt);
1234                   enum tree_code cond_code = TREE_CODE (cond_expr);
1235
1236                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1237                     {
1238                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1239                                                                   0));
1240                       cond_code = invert_tree_comparison (cond_code,
1241                                                           honor_nans);
1242                       if (cond_code != ERROR_MARK)
1243                         {
1244                           TREE_SET_CODE (cond_expr, cond_code);
1245                           swap_ssa_operands (stmt,
1246                                              gimple_assign_rhs2_ptr (stmt),
1247                                              gimple_assign_rhs3_ptr (stmt));
1248                         }
1249                     }
1250                 }
1251             }
1252
1253           /* Free stmt_vec_info.  */
1254           free_stmt_vec_info (stmt);
1255           gsi_next (&si);
1256         }
1257     }
1258
1259   free (LOOP_VINFO_BBS (loop_vinfo));
1260   vect_destroy_datarefs (loop_vinfo);
1261   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1262   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1263   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1264   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
1265   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1266   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1267   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1268     vect_free_slp_instance (instance);
1269
1270   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1271   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1272   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1273   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1274
1275   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1276   loop_vinfo->scalar_cost_vec.release ();
1277
1278   free (loop_vinfo);
1279   loop->aux = NULL;
1280 }
1281
1282
1283 /* Calculate the cost of one scalar iteration of the loop.  */
1284 static void
1285 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1286 {
1287   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1288   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1289   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1290   int innerloop_iters, i;
1291
1292   /* Count statements in scalar loop.  Using this as scalar cost for a single
1293      iteration for now.
1294
1295      TODO: Add outer loop support.
1296
1297      TODO: Consider assigning different costs to different scalar
1298      statements.  */
1299
1300   /* FORNOW.  */
1301   innerloop_iters = 1;
1302   if (loop->inner)
1303     innerloop_iters = 50; /* FIXME */
1304
1305   for (i = 0; i < nbbs; i++)
1306     {
1307       gimple_stmt_iterator si;
1308       basic_block bb = bbs[i];
1309
1310       if (bb->loop_father == loop->inner)
1311         factor = innerloop_iters;
1312       else
1313         factor = 1;
1314
1315       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1316         {
1317           gimple *stmt = gsi_stmt (si);
1318           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1319
1320           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1321             continue;
1322
1323           /* Skip stmts that are not vectorized inside the loop.  */
1324           if (stmt_info
1325               && !STMT_VINFO_RELEVANT_P (stmt_info)
1326               && (!STMT_VINFO_LIVE_P (stmt_info)
1327                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1328               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1329             continue;
1330
1331           vect_cost_for_stmt kind;
1332           if (STMT_VINFO_DATA_REF (stmt_info))
1333             {
1334               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1335                kind = scalar_load;
1336              else
1337                kind = scalar_store;
1338             }
1339           else
1340             kind = scalar_stmt;
1341
1342           scalar_single_iter_cost
1343             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1344                                  factor, kind, stmt_info, 0, vect_prologue);
1345         }
1346     }
1347   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1348     = scalar_single_iter_cost;
1349 }
1350
1351
1352 /* Function vect_analyze_loop_form_1.
1353
1354    Verify that certain CFG restrictions hold, including:
1355    - the loop has a pre-header
1356    - the loop has a single entry and exit
1357    - the loop exit condition is simple enough
1358    - the number of iterations can be analyzed, i.e, a countable loop.  The
1359      niter could be analyzed under some assumptions.  */
1360
1361 bool
1362 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1363                           tree *assumptions, tree *number_of_iterationsm1,
1364                           tree *number_of_iterations, gcond **inner_loop_cond)
1365 {
1366   if (dump_enabled_p ())
1367     dump_printf_loc (MSG_NOTE, vect_location,
1368                      "=== vect_analyze_loop_form ===\n");
1369
1370   /* Different restrictions apply when we are considering an inner-most loop,
1371      vs. an outer (nested) loop.
1372      (FORNOW. May want to relax some of these restrictions in the future).  */
1373
1374   if (!loop->inner)
1375     {
1376       /* Inner-most loop.  We currently require that the number of BBs is
1377          exactly 2 (the header and latch).  Vectorizable inner-most loops
1378          look like this:
1379
1380                         (pre-header)
1381                            |
1382                           header <--------+
1383                            | |            |
1384                            | +--> latch --+
1385                            |
1386                         (exit-bb)  */
1387
1388       if (loop->num_nodes != 2)
1389         {
1390           if (dump_enabled_p ())
1391             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1392                              "not vectorized: control flow in loop.\n");
1393           return false;
1394         }
1395
1396       if (empty_block_p (loop->header))
1397         {
1398           if (dump_enabled_p ())
1399             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1400                              "not vectorized: empty loop.\n");
1401           return false;
1402         }
1403     }
1404   else
1405     {
1406       struct loop *innerloop = loop->inner;
1407       edge entryedge;
1408
1409       /* Nested loop. We currently require that the loop is doubly-nested,
1410          contains a single inner loop, and the number of BBs is exactly 5.
1411          Vectorizable outer-loops look like this:
1412
1413                         (pre-header)
1414                            |
1415                           header <---+
1416                            |         |
1417                           inner-loop |
1418                            |         |
1419                           tail ------+
1420                            |
1421                         (exit-bb)
1422
1423          The inner-loop has the properties expected of inner-most loops
1424          as described above.  */
1425
1426       if ((loop->inner)->inner || (loop->inner)->next)
1427         {
1428           if (dump_enabled_p ())
1429             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1430                              "not vectorized: multiple nested loops.\n");
1431           return false;
1432         }
1433
1434       if (loop->num_nodes != 5)
1435         {
1436           if (dump_enabled_p ())
1437             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1438                              "not vectorized: control flow in loop.\n");
1439           return false;
1440         }
1441
1442       entryedge = loop_preheader_edge (innerloop);
1443       if (entryedge->src != loop->header
1444           || !single_exit (innerloop)
1445           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1446         {
1447           if (dump_enabled_p ())
1448             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1449                              "not vectorized: unsupported outerloop form.\n");
1450           return false;
1451         }
1452
1453       /* Analyze the inner-loop.  */
1454       tree inner_niterm1, inner_niter, inner_assumptions;
1455       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1456                                       &inner_assumptions, &inner_niterm1,
1457                                       &inner_niter, NULL)
1458           /* Don't support analyzing niter under assumptions for inner
1459              loop.  */
1460           || !integer_onep (inner_assumptions))
1461         {
1462           if (dump_enabled_p ())
1463             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464                              "not vectorized: Bad inner loop.\n");
1465           return false;
1466         }
1467
1468       if (!expr_invariant_in_loop_p (loop, inner_niter))
1469         {
1470           if (dump_enabled_p ())
1471             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1472                              "not vectorized: inner-loop count not"
1473                              " invariant.\n");
1474           return false;
1475         }
1476
1477       if (dump_enabled_p ())
1478         dump_printf_loc (MSG_NOTE, vect_location,
1479                          "Considering outer-loop vectorization.\n");
1480     }
1481
1482   if (!single_exit (loop)
1483       || EDGE_COUNT (loop->header->preds) != 2)
1484     {
1485       if (dump_enabled_p ())
1486         {
1487           if (!single_exit (loop))
1488             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1489                              "not vectorized: multiple exits.\n");
1490           else if (EDGE_COUNT (loop->header->preds) != 2)
1491             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1492                              "not vectorized: too many incoming edges.\n");
1493         }
1494       return false;
1495     }
1496
1497   /* We assume that the loop exit condition is at the end of the loop. i.e,
1498      that the loop is represented as a do-while (with a proper if-guard
1499      before the loop if needed), where the loop header contains all the
1500      executable statements, and the latch is empty.  */
1501   if (!empty_block_p (loop->latch)
1502       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1503     {
1504       if (dump_enabled_p ())
1505         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1506                          "not vectorized: latch block not empty.\n");
1507       return false;
1508     }
1509
1510   /* Make sure the exit is not abnormal.  */
1511   edge e = single_exit (loop);
1512   if (e->flags & EDGE_ABNORMAL)
1513     {
1514       if (dump_enabled_p ())
1515         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1516                          "not vectorized: abnormal loop exit edge.\n");
1517       return false;
1518     }
1519
1520   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1521                                      number_of_iterationsm1);
1522   if (!*loop_cond)
1523     {
1524       if (dump_enabled_p ())
1525         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1526                          "not vectorized: complicated exit condition.\n");
1527       return false;
1528     }
1529
1530   if (integer_zerop (*assumptions)
1531       || !*number_of_iterations
1532       || chrec_contains_undetermined (*number_of_iterations))
1533     {
1534       if (dump_enabled_p ())
1535         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1536                          "not vectorized: number of iterations cannot be "
1537                          "computed.\n");
1538       return false;
1539     }
1540
1541   if (integer_zerop (*number_of_iterations))
1542     {
1543       if (dump_enabled_p ())
1544         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545                          "not vectorized: number of iterations = 0.\n");
1546       return false;
1547     }
1548
1549   return true;
1550 }
1551
1552 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1553
1554 loop_vec_info
1555 vect_analyze_loop_form (struct loop *loop)
1556 {
1557   tree assumptions, number_of_iterations, number_of_iterationsm1;
1558   gcond *loop_cond, *inner_loop_cond = NULL;
1559
1560   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1561                                   &assumptions, &number_of_iterationsm1,
1562                                   &number_of_iterations, &inner_loop_cond))
1563     return NULL;
1564
1565   loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1566   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1567   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1568   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1569   if (!integer_onep (assumptions))
1570     {
1571       /* We consider to vectorize this loop by versioning it under
1572          some assumptions.  In order to do this, we need to clear
1573          existing information computed by scev and niter analyzer.  */
1574       scev_reset_htab ();
1575       free_numbers_of_iterations_estimates (loop);
1576       /* Also set flag for this loop so that following scev and niter
1577          analysis are done under the assumptions.  */
1578       loop_constraint_set (loop, LOOP_C_FINITE);
1579       /* Also record the assumptions for versioning.  */
1580       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1581     }
1582
1583   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1584     {
1585       if (dump_enabled_p ())
1586         {
1587           dump_printf_loc (MSG_NOTE, vect_location,
1588                            "Symbolic number of iterations is ");
1589           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1590           dump_printf (MSG_NOTE, "\n");
1591         }
1592     }
1593
1594   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1595   if (inner_loop_cond)
1596     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1597       = loop_exit_ctrl_vec_info_type;
1598
1599   gcc_assert (!loop->aux);
1600   loop->aux = loop_vinfo;
1601   return loop_vinfo;
1602 }
1603
1604
1605
1606 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1607    statements update the vectorization factor.  */
1608
1609 static void
1610 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1611 {
1612   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1613   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1614   int nbbs = loop->num_nodes;
1615   unsigned int vectorization_factor;
1616   int i;
1617
1618   if (dump_enabled_p ())
1619     dump_printf_loc (MSG_NOTE, vect_location,
1620                      "=== vect_update_vf_for_slp ===\n");
1621
1622   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1623   gcc_assert (vectorization_factor != 0);
1624
1625   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1626      vectorization factor of the loop is the unrolling factor required by
1627      the SLP instances.  If that unrolling factor is 1, we say, that we
1628      perform pure SLP on loop - cross iteration parallelism is not
1629      exploited.  */
1630   bool only_slp_in_loop = true;
1631   for (i = 0; i < nbbs; i++)
1632     {
1633       basic_block bb = bbs[i];
1634       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1635            gsi_next (&si))
1636         {
1637           gimple *stmt = gsi_stmt (si);
1638           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1639           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1640               && STMT_VINFO_RELATED_STMT (stmt_info))
1641             {
1642               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1643               stmt_info = vinfo_for_stmt (stmt);
1644             }
1645           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1646                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1647               && !PURE_SLP_STMT (stmt_info))
1648             /* STMT needs both SLP and loop-based vectorization.  */
1649             only_slp_in_loop = false;
1650         }
1651     }
1652
1653   if (only_slp_in_loop)
1654     {
1655       dump_printf_loc (MSG_NOTE, vect_location,
1656                        "Loop contains only SLP stmts\n");
1657       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1658     }
1659   else
1660     {
1661       dump_printf_loc (MSG_NOTE, vect_location,
1662                        "Loop contains SLP and non-SLP stmts\n");
1663       vectorization_factor
1664         = least_common_multiple (vectorization_factor,
1665                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1666     }
1667
1668   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1669   if (dump_enabled_p ())
1670     dump_printf_loc (MSG_NOTE, vect_location,
1671                      "Updating vectorization factor to %d\n",
1672                      vectorization_factor);
1673 }
1674
1675 /* Function vect_analyze_loop_operations.
1676
1677    Scan the loop stmts and make sure they are all vectorizable.  */
1678
1679 static bool
1680 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1681 {
1682   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1683   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1684   int nbbs = loop->num_nodes;
1685   int i;
1686   stmt_vec_info stmt_info;
1687   bool need_to_vectorize = false;
1688   bool ok;
1689
1690   if (dump_enabled_p ())
1691     dump_printf_loc (MSG_NOTE, vect_location,
1692                      "=== vect_analyze_loop_operations ===\n");
1693
1694   for (i = 0; i < nbbs; i++)
1695     {
1696       basic_block bb = bbs[i];
1697
1698       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1699            gsi_next (&si))
1700         {
1701           gphi *phi = si.phi ();
1702           ok = true;
1703
1704           stmt_info = vinfo_for_stmt (phi);
1705           if (dump_enabled_p ())
1706             {
1707               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1708               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1709             }
1710           if (virtual_operand_p (gimple_phi_result (phi)))
1711             continue;
1712
1713           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1714              (i.e., a phi in the tail of the outer-loop).  */
1715           if (! is_loop_header_bb_p (bb))
1716             {
1717               /* FORNOW: we currently don't support the case that these phis
1718                  are not used in the outerloop (unless it is double reduction,
1719                  i.e., this phi is vect_reduction_def), cause this case
1720                  requires to actually do something here.  */
1721               if (STMT_VINFO_LIVE_P (stmt_info)
1722                   && STMT_VINFO_DEF_TYPE (stmt_info)
1723                      != vect_double_reduction_def)
1724                 {
1725                   if (dump_enabled_p ())
1726                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1727                                      "Unsupported loop-closed phi in "
1728                                      "outer-loop.\n");
1729                   return false;
1730                 }
1731
1732               /* If PHI is used in the outer loop, we check that its operand
1733                  is defined in the inner loop.  */
1734               if (STMT_VINFO_RELEVANT_P (stmt_info))
1735                 {
1736                   tree phi_op;
1737                   gimple *op_def_stmt;
1738
1739                   if (gimple_phi_num_args (phi) != 1)
1740                     return false;
1741
1742                   phi_op = PHI_ARG_DEF (phi, 0);
1743                   if (TREE_CODE (phi_op) != SSA_NAME)
1744                     return false;
1745
1746                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1747                   if (gimple_nop_p (op_def_stmt)
1748                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1749                       || !vinfo_for_stmt (op_def_stmt))
1750                     return false;
1751
1752                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1753                         != vect_used_in_outer
1754                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1755                            != vect_used_in_outer_by_reduction)
1756                     return false;
1757                 }
1758
1759               continue;
1760             }
1761
1762           gcc_assert (stmt_info);
1763
1764           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1765                || STMT_VINFO_LIVE_P (stmt_info))
1766               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1767             {
1768               /* A scalar-dependence cycle that we don't support.  */
1769               if (dump_enabled_p ())
1770                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1771                                  "not vectorized: scalar dependence cycle.\n");
1772               return false;
1773             }
1774
1775           if (STMT_VINFO_RELEVANT_P (stmt_info))
1776             {
1777               need_to_vectorize = true;
1778               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1779                   && ! PURE_SLP_STMT (stmt_info))
1780                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1781               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1782                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1783                        && ! PURE_SLP_STMT (stmt_info))
1784                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1785             }
1786
1787           if (ok && STMT_VINFO_LIVE_P (stmt_info))
1788             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1789
1790           if (!ok)
1791             {
1792               if (dump_enabled_p ())
1793                 {
1794                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1795                                    "not vectorized: relevant phi not "
1796                                    "supported: ");
1797                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1798                 }
1799               return false;
1800             }
1801         }
1802
1803       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1804            gsi_next (&si))
1805         {
1806           gimple *stmt = gsi_stmt (si);
1807           if (!gimple_clobber_p (stmt)
1808               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1809             return false;
1810         }
1811     } /* bbs */
1812
1813   /* All operations in the loop are either irrelevant (deal with loop
1814      control, or dead), or only used outside the loop and can be moved
1815      out of the loop (e.g. invariants, inductions).  The loop can be
1816      optimized away by scalar optimizations.  We're better off not
1817      touching this loop.  */
1818   if (!need_to_vectorize)
1819     {
1820       if (dump_enabled_p ())
1821         dump_printf_loc (MSG_NOTE, vect_location,
1822                          "All the computation can be taken out of the loop.\n");
1823       if (dump_enabled_p ())
1824         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1825                          "not vectorized: redundant loop. no profit to "
1826                          "vectorize.\n");
1827       return false;
1828     }
1829
1830   return true;
1831 }
1832
1833
1834 /* Function vect_analyze_loop_2.
1835
1836    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1837    for it.  The different analyses will record information in the
1838    loop_vec_info struct.  */
1839 static bool
1840 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1841 {
1842   bool ok;
1843   int max_vf = MAX_VECTORIZATION_FACTOR;
1844   int min_vf = 2;
1845   unsigned int n_stmts = 0;
1846
1847   /* The first group of checks is independent of the vector size.  */
1848   fatal = true;
1849
1850   /* Find all data references in the loop (which correspond to vdefs/vuses)
1851      and analyze their evolution in the loop.  */
1852
1853   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1854
1855   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1856   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1857     {
1858       if (dump_enabled_p ())
1859         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1860                          "not vectorized: loop nest containing two "
1861                          "or more consecutive inner loops cannot be "
1862                          "vectorized\n");
1863       return false;
1864     }
1865
1866   for (unsigned i = 0; i < loop->num_nodes; i++)
1867     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1868          !gsi_end_p (gsi); gsi_next (&gsi))
1869       {
1870         gimple *stmt = gsi_stmt (gsi);
1871         if (is_gimple_debug (stmt))
1872           continue;
1873         ++n_stmts;
1874         if (!find_data_references_in_stmt (loop, stmt,
1875                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1876           {
1877             if (is_gimple_call (stmt) && loop->safelen)
1878               {
1879                 tree fndecl = gimple_call_fndecl (stmt), op;
1880                 if (fndecl != NULL_TREE)
1881                   {
1882                     cgraph_node *node = cgraph_node::get (fndecl);
1883                     if (node != NULL && node->simd_clones != NULL)
1884                       {
1885                         unsigned int j, n = gimple_call_num_args (stmt);
1886                         for (j = 0; j < n; j++)
1887                           {
1888                             op = gimple_call_arg (stmt, j);
1889                             if (DECL_P (op)
1890                                 || (REFERENCE_CLASS_P (op)
1891                                     && get_base_address (op)))
1892                               break;
1893                           }
1894                         op = gimple_call_lhs (stmt);
1895                         /* Ignore #pragma omp declare simd functions
1896                            if they don't have data references in the
1897                            call stmt itself.  */
1898                         if (j == n
1899                             && !(op
1900                                  && (DECL_P (op)
1901                                      || (REFERENCE_CLASS_P (op)
1902                                          && get_base_address (op)))))
1903                           continue;
1904                       }
1905                   }
1906               }
1907             if (dump_enabled_p ())
1908               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1909                                "not vectorized: loop contains function "
1910                                "calls or data references that cannot "
1911                                "be analyzed\n");
1912             return false;
1913           }
1914       }
1915
1916   /* Analyze the data references and also adjust the minimal
1917      vectorization factor according to the loads and stores.  */
1918
1919   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1920   if (!ok)
1921     {
1922       if (dump_enabled_p ())
1923         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1924                          "bad data references.\n");
1925       return false;
1926     }
1927
1928   /* Classify all cross-iteration scalar data-flow cycles.
1929      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1930   vect_analyze_scalar_cycles (loop_vinfo);
1931
1932   vect_pattern_recog (loop_vinfo);
1933
1934   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1935
1936   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1937      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1938
1939   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1940   if (!ok)
1941     {
1942       if (dump_enabled_p ())
1943         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1944                          "bad data access.\n");
1945       return false;
1946     }
1947
1948   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1949
1950   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1951   if (!ok)
1952     {
1953       if (dump_enabled_p ())
1954         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1955                          "unexpected pattern.\n");
1956       return false;
1957     }
1958
1959   /* While the rest of the analysis below depends on it in some way.  */
1960   fatal = false;
1961
1962   /* Analyze data dependences between the data-refs in the loop
1963      and adjust the maximum vectorization factor according to
1964      the dependences.
1965      FORNOW: fail at the first data dependence that we encounter.  */
1966
1967   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1968   if (!ok
1969       || max_vf < min_vf)
1970     {
1971       if (dump_enabled_p ())
1972             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1973                              "bad data dependence.\n");
1974       return false;
1975     }
1976
1977   ok = vect_determine_vectorization_factor (loop_vinfo);
1978   if (!ok)
1979     {
1980       if (dump_enabled_p ())
1981         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1982                          "can't determine vectorization factor.\n");
1983       return false;
1984     }
1985   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1986     {
1987       if (dump_enabled_p ())
1988         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1989                          "bad data dependence.\n");
1990       return false;
1991     }
1992
1993   /* Compute the scalar iteration cost.  */
1994   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1995
1996   int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1997   HOST_WIDE_INT estimated_niter;
1998   unsigned th;
1999   int min_scalar_loop_bound;
2000
2001   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2002   ok = vect_analyze_slp (loop_vinfo, n_stmts);
2003   if (!ok)
2004     return false;
2005
2006   /* If there are any SLP instances mark them as pure_slp.  */
2007   bool slp = vect_make_slp_decision (loop_vinfo);
2008   if (slp)
2009     {
2010       /* Find stmts that need to be both vectorized and SLPed.  */
2011       vect_detect_hybrid_slp (loop_vinfo);
2012
2013       /* Update the vectorization factor based on the SLP decision.  */
2014       vect_update_vf_for_slp (loop_vinfo);
2015     }
2016
2017   /* This is the point where we can re-start analysis with SLP forced off.  */
2018 start_over:
2019
2020   /* Now the vectorization factor is final.  */
2021   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2022   gcc_assert (vectorization_factor != 0);
2023
2024   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2025     dump_printf_loc (MSG_NOTE, vect_location,
2026                      "vectorization_factor = %d, niters = "
2027                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
2028                      LOOP_VINFO_INT_NITERS (loop_vinfo));
2029
2030   HOST_WIDE_INT max_niter
2031     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2032   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2033        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
2034       || (max_niter != -1
2035           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
2036     {
2037       if (dump_enabled_p ())
2038         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2039                          "not vectorized: iteration count smaller than "
2040                          "vectorization factor.\n");
2041       return false;
2042     }
2043
2044   /* Analyze the alignment of the data-refs in the loop.
2045      Fail if a data reference is found that cannot be vectorized.  */
2046
2047   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2048   if (!ok)
2049     {
2050       if (dump_enabled_p ())
2051         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2052                          "bad data alignment.\n");
2053       return false;
2054     }
2055
2056   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2057      It is important to call pruning after vect_analyze_data_ref_accesses,
2058      since we use grouping information gathered by interleaving analysis.  */
2059   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2060   if (!ok)
2061     return false;
2062
2063   /* Do not invoke vect_enhance_data_refs_alignment for eplilogue
2064      vectorization.  */
2065   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2066     {
2067     /* This pass will decide on using loop versioning and/or loop peeling in
2068        order to enhance the alignment of data references in the loop.  */
2069     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2070     if (!ok)
2071       {
2072         if (dump_enabled_p ())
2073           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2074                            "bad data alignment.\n");
2075         return false;
2076       }
2077     }
2078
2079   if (slp)
2080     {
2081       /* Analyze operations in the SLP instances.  Note this may
2082          remove unsupported SLP instances which makes the above
2083          SLP kind detection invalid.  */
2084       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2085       vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
2086                                    LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2087       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2088         goto again;
2089     }
2090
2091   /* Scan all the remaining operations in the loop that are not subject
2092      to SLP and make sure they are vectorizable.  */
2093   ok = vect_analyze_loop_operations (loop_vinfo);
2094   if (!ok)
2095     {
2096       if (dump_enabled_p ())
2097         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098                          "bad operation or unsupported loop bound.\n");
2099       return false;
2100     }
2101
2102   /* If epilog loop is required because of data accesses with gaps,
2103      one additional iteration needs to be peeled.  Check if there is
2104      enough iterations for vectorization.  */
2105   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2106       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2107     {
2108       int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2109       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2110
2111       if (wi::to_widest (scalar_niters) < vf)
2112         {
2113           if (dump_enabled_p ())
2114             dump_printf_loc (MSG_NOTE, vect_location,
2115                              "loop has no enough iterations to support"
2116                              " peeling for gaps.\n");
2117           return false;
2118         }
2119     }
2120
2121   /* Analyze cost.  Decide if worth while to vectorize.  */
2122   int min_profitable_estimate, min_profitable_iters;
2123   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2124                                       &min_profitable_estimate);
2125
2126   if (min_profitable_iters < 0)
2127     {
2128       if (dump_enabled_p ())
2129         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2130                          "not vectorized: vectorization not profitable.\n");
2131       if (dump_enabled_p ())
2132         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2133                          "not vectorized: vector version will never be "
2134                          "profitable.\n");
2135       goto again;
2136     }
2137
2138   min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2139                            * vectorization_factor);
2140
2141   /* Use the cost model only if it is more conservative than user specified
2142      threshold.  */
2143   th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2144
2145   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2146
2147   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2148       && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2149     {
2150       if (dump_enabled_p ())
2151         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2152                          "not vectorized: vectorization not profitable.\n");
2153       if (dump_enabled_p ())
2154         dump_printf_loc (MSG_NOTE, vect_location,
2155                          "not vectorized: iteration count smaller than user "
2156                          "specified loop bound parameter or minimum profitable "
2157                          "iterations (whichever is more conservative).\n");
2158       goto again;
2159     }
2160
2161   estimated_niter
2162     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2163   if (estimated_niter == -1)
2164     estimated_niter = max_niter;
2165   if (estimated_niter != -1
2166       && ((unsigned HOST_WIDE_INT) estimated_niter
2167           < MAX (th, (unsigned) min_profitable_estimate)))
2168     {
2169       if (dump_enabled_p ())
2170         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2171                          "not vectorized: estimated iteration count too "
2172                          "small.\n");
2173       if (dump_enabled_p ())
2174         dump_printf_loc (MSG_NOTE, vect_location,
2175                          "not vectorized: estimated iteration count smaller "
2176                          "than specified loop bound parameter or minimum "
2177                          "profitable iterations (whichever is more "
2178                          "conservative).\n");
2179       goto again;
2180     }
2181
2182   /* Decide whether we need to create an epilogue loop to handle
2183      remaining scalar iterations.  */
2184   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo)
2185          / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2186         * LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2187
2188   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2189       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2190     {
2191       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2192                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2193           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2194         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2195     }
2196   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2197            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2198                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2199                /* In case of versioning, check if the maximum number of
2200                   iterations is greater than th.  If they are identical,
2201                   the epilogue is unnecessary.  */
2202                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2203                    || (unsigned HOST_WIDE_INT) max_niter > th)))
2204     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2205
2206   /* If an epilogue loop is required make sure we can create one.  */
2207   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2208       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2209     {
2210       if (dump_enabled_p ())
2211         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2212       if (!vect_can_advance_ivs_p (loop_vinfo)
2213           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2214                                            single_exit (LOOP_VINFO_LOOP
2215                                                          (loop_vinfo))))
2216         {
2217           if (dump_enabled_p ())
2218             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2219                              "not vectorized: can't create required "
2220                              "epilog loop\n");
2221           goto again;
2222         }
2223     }
2224
2225   /* During peeling, we need to check if number of loop iterations is
2226      enough for both peeled prolog loop and vector loop.  This check
2227      can be merged along with threshold check of loop versioning, so
2228      increase threshold for this case if necessary.  */
2229   if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
2230       && (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2231           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2232     {
2233       unsigned niters_th;
2234
2235       /* Niters for peeled prolog loop.  */
2236       if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2237         {
2238           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2239           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2240
2241           niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2242         }
2243       else
2244         niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2245
2246       /* Niters for at least one iteration of vectorized loop.  */
2247       niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2248       /* One additional iteration because of peeling for gap.  */
2249       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2250         niters_th++;
2251       if (LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) < niters_th)
2252         LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = niters_th;
2253     }
2254
2255   gcc_assert (vectorization_factor
2256               == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2257
2258   /* Ok to vectorize!  */
2259   return true;
2260
2261 again:
2262   /* Try again with SLP forced off but if we didn't do any SLP there is
2263      no point in re-trying.  */
2264   if (!slp)
2265     return false;
2266
2267   /* If there are reduction chains re-trying will fail anyway.  */
2268   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2269     return false;
2270
2271   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2272      via interleaving or lane instructions.  */
2273   slp_instance instance;
2274   slp_tree node;
2275   unsigned i, j;
2276   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2277     {
2278       stmt_vec_info vinfo;
2279       vinfo = vinfo_for_stmt
2280           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2281       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2282         continue;
2283       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2284       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2285       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2286       if (! vect_store_lanes_supported (vectype, size)
2287           && ! vect_grouped_store_supported (vectype, size))
2288         return false;
2289       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2290         {
2291           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2292           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2293           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2294           size = STMT_VINFO_GROUP_SIZE (vinfo);
2295           vectype = STMT_VINFO_VECTYPE (vinfo);
2296           if (! vect_load_lanes_supported (vectype, size)
2297               && ! vect_grouped_load_supported (vectype, single_element_p,
2298                                                 size))
2299             return false;
2300         }
2301     }
2302
2303   if (dump_enabled_p ())
2304     dump_printf_loc (MSG_NOTE, vect_location,
2305                      "re-trying with SLP disabled\n");
2306
2307   /* Roll back state appropriately.  No SLP this time.  */
2308   slp = false;
2309   /* Restore vectorization factor as it were without SLP.  */
2310   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2311   /* Free the SLP instances.  */
2312   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2313     vect_free_slp_instance (instance);
2314   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2315   /* Reset SLP type to loop_vect on all stmts.  */
2316   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2317     {
2318       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2319       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2320            !gsi_end_p (si); gsi_next (&si))
2321         {
2322           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2323           STMT_SLP_TYPE (stmt_info) = loop_vect;
2324         }
2325       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2326            !gsi_end_p (si); gsi_next (&si))
2327         {
2328           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2329           STMT_SLP_TYPE (stmt_info) = loop_vect;
2330           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2331             {
2332               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2333               STMT_SLP_TYPE (stmt_info) = loop_vect;
2334               for (gimple_stmt_iterator pi
2335                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2336                    !gsi_end_p (pi); gsi_next (&pi))
2337                 {
2338                   gimple *pstmt = gsi_stmt (pi);
2339                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2340                 }
2341             }
2342         }
2343     }
2344   /* Free optimized alias test DDRS.  */
2345   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2346   /* Reset target cost data.  */
2347   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2348   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2349     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2350   /* Reset assorted flags.  */
2351   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2352   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2353   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2354
2355   goto start_over;
2356 }
2357
2358 /* Function vect_analyze_loop.
2359
2360    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2361    for it.  The different analyses will record information in the
2362    loop_vec_info struct.  If ORIG_LOOP_VINFO is not NULL epilogue must
2363    be vectorized.  */
2364 loop_vec_info
2365 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2366 {
2367   loop_vec_info loop_vinfo;
2368   unsigned int vector_sizes;
2369
2370   /* Autodetect first vector size we try.  */
2371   current_vector_size = 0;
2372   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2373
2374   if (dump_enabled_p ())
2375     dump_printf_loc (MSG_NOTE, vect_location,
2376                      "===== analyze_loop_nest =====\n");
2377
2378   if (loop_outer (loop)
2379       && loop_vec_info_for_loop (loop_outer (loop))
2380       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2381     {
2382       if (dump_enabled_p ())
2383         dump_printf_loc (MSG_NOTE, vect_location,
2384                          "outer-loop already vectorized.\n");
2385       return NULL;
2386     }
2387
2388   while (1)
2389     {
2390       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2391       loop_vinfo = vect_analyze_loop_form (loop);
2392       if (!loop_vinfo)
2393         {
2394           if (dump_enabled_p ())
2395             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2396                              "bad loop form.\n");
2397           return NULL;
2398         }
2399
2400       bool fatal = false;
2401
2402       if (orig_loop_vinfo)
2403         LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2404
2405       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2406         {
2407           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2408
2409           return loop_vinfo;
2410         }
2411
2412       destroy_loop_vec_info (loop_vinfo, true);
2413
2414       vector_sizes &= ~current_vector_size;
2415       if (fatal
2416           || vector_sizes == 0
2417           || current_vector_size == 0)
2418         return NULL;
2419
2420       /* Try the next biggest vector size.  */
2421       current_vector_size = 1 << floor_log2 (vector_sizes);
2422       if (dump_enabled_p ())
2423         dump_printf_loc (MSG_NOTE, vect_location,
2424                          "***** Re-trying analysis with "
2425                          "vector size %d\n", current_vector_size);
2426     }
2427 }
2428
2429
2430 /* Function reduction_code_for_scalar_code
2431
2432    Input:
2433    CODE - tree_code of a reduction operations.
2434
2435    Output:
2436    REDUC_CODE - the corresponding tree-code to be used to reduce the
2437       vector of partial results into a single scalar result, or ERROR_MARK
2438       if the operation is a supported reduction operation, but does not have
2439       such a tree-code.
2440
2441    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2442
2443 static bool
2444 reduction_code_for_scalar_code (enum tree_code code,
2445                                 enum tree_code *reduc_code)
2446 {
2447   switch (code)
2448     {
2449       case MAX_EXPR:
2450         *reduc_code = REDUC_MAX_EXPR;
2451         return true;
2452
2453       case MIN_EXPR:
2454         *reduc_code = REDUC_MIN_EXPR;
2455         return true;
2456
2457       case PLUS_EXPR:
2458         *reduc_code = REDUC_PLUS_EXPR;
2459         return true;
2460
2461       case MULT_EXPR:
2462       case MINUS_EXPR:
2463       case BIT_IOR_EXPR:
2464       case BIT_XOR_EXPR:
2465       case BIT_AND_EXPR:
2466         *reduc_code = ERROR_MARK;
2467         return true;
2468
2469       default:
2470        return false;
2471     }
2472 }
2473
2474
2475 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2476    STMT is printed with a message MSG. */
2477
2478 static void
2479 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2480 {
2481   dump_printf_loc (msg_type, vect_location, "%s", msg);
2482   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2483 }
2484
2485
2486 /* Detect SLP reduction of the form:
2487
2488    #a1 = phi <a5, a0>
2489    a2 = operation (a1)
2490    a3 = operation (a2)
2491    a4 = operation (a3)
2492    a5 = operation (a4)
2493
2494    #a = phi <a5>
2495
2496    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2497    FIRST_STMT is the first reduction stmt in the chain
2498    (a2 = operation (a1)).
2499
2500    Return TRUE if a reduction chain was detected.  */
2501
2502 static bool
2503 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2504                        gimple *first_stmt)
2505 {
2506   struct loop *loop = (gimple_bb (phi))->loop_father;
2507   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2508   enum tree_code code;
2509   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2510   stmt_vec_info use_stmt_info, current_stmt_info;
2511   tree lhs;
2512   imm_use_iterator imm_iter;
2513   use_operand_p use_p;
2514   int nloop_uses, size = 0, n_out_of_loop_uses;
2515   bool found = false;
2516
2517   if (loop != vect_loop)
2518     return false;
2519
2520   lhs = PHI_RESULT (phi);
2521   code = gimple_assign_rhs_code (first_stmt);
2522   while (1)
2523     {
2524       nloop_uses = 0;
2525       n_out_of_loop_uses = 0;
2526       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2527         {
2528           gimple *use_stmt = USE_STMT (use_p);
2529           if (is_gimple_debug (use_stmt))
2530             continue;
2531
2532           /* Check if we got back to the reduction phi.  */
2533           if (use_stmt == phi)
2534             {
2535               loop_use_stmt = use_stmt;
2536               found = true;
2537               break;
2538             }
2539
2540           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2541             {
2542               loop_use_stmt = use_stmt;
2543               nloop_uses++;
2544             }
2545            else
2546              n_out_of_loop_uses++;
2547
2548            /* There are can be either a single use in the loop or two uses in
2549               phi nodes.  */
2550            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2551              return false;
2552         }
2553
2554       if (found)
2555         break;
2556
2557       /* We reached a statement with no loop uses.  */
2558       if (nloop_uses == 0)
2559         return false;
2560
2561       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2562       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2563         return false;
2564
2565       if (!is_gimple_assign (loop_use_stmt)
2566           || code != gimple_assign_rhs_code (loop_use_stmt)
2567           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2568         return false;
2569
2570       /* Insert USE_STMT into reduction chain.  */
2571       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2572       if (current_stmt)
2573         {
2574           current_stmt_info = vinfo_for_stmt (current_stmt);
2575           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2576           GROUP_FIRST_ELEMENT (use_stmt_info)
2577             = GROUP_FIRST_ELEMENT (current_stmt_info);
2578         }
2579       else
2580         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2581
2582       lhs = gimple_assign_lhs (loop_use_stmt);
2583       current_stmt = loop_use_stmt;
2584       size++;
2585    }
2586
2587   if (!found || loop_use_stmt != phi || size < 2)
2588     return false;
2589
2590   /* Swap the operands, if needed, to make the reduction operand be the second
2591      operand.  */
2592   lhs = PHI_RESULT (phi);
2593   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2594   while (next_stmt)
2595     {
2596       if (gimple_assign_rhs2 (next_stmt) == lhs)
2597         {
2598           tree op = gimple_assign_rhs1 (next_stmt);
2599           gimple *def_stmt = NULL;
2600
2601           if (TREE_CODE (op) == SSA_NAME)
2602             def_stmt = SSA_NAME_DEF_STMT (op);
2603
2604           /* Check that the other def is either defined in the loop
2605              ("vect_internal_def"), or it's an induction (defined by a
2606              loop-header phi-node).  */
2607           if (def_stmt
2608               && gimple_bb (def_stmt)
2609               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2610               && (is_gimple_assign (def_stmt)
2611                   || is_gimple_call (def_stmt)
2612                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2613                            == vect_induction_def
2614                   || (gimple_code (def_stmt) == GIMPLE_PHI
2615                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2616                                   == vect_internal_def
2617                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2618             {
2619               lhs = gimple_assign_lhs (next_stmt);
2620               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2621               continue;
2622             }
2623
2624           return false;
2625         }
2626       else
2627         {
2628           tree op = gimple_assign_rhs2 (next_stmt);
2629           gimple *def_stmt = NULL;
2630
2631           if (TREE_CODE (op) == SSA_NAME)
2632             def_stmt = SSA_NAME_DEF_STMT (op);
2633
2634           /* Check that the other def is either defined in the loop
2635             ("vect_internal_def"), or it's an induction (defined by a
2636             loop-header phi-node).  */
2637           if (def_stmt
2638               && gimple_bb (def_stmt)
2639               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2640               && (is_gimple_assign (def_stmt)
2641                   || is_gimple_call (def_stmt)
2642                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2643                               == vect_induction_def
2644                   || (gimple_code (def_stmt) == GIMPLE_PHI
2645                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2646                                   == vect_internal_def
2647                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2648             {
2649               if (dump_enabled_p ())
2650                 {
2651                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2652                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2653                 }
2654
2655               swap_ssa_operands (next_stmt,
2656                                  gimple_assign_rhs1_ptr (next_stmt),
2657                                  gimple_assign_rhs2_ptr (next_stmt));
2658               update_stmt (next_stmt);
2659
2660               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2661                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2662             }
2663           else
2664             return false;
2665         }
2666
2667       lhs = gimple_assign_lhs (next_stmt);
2668       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2669     }
2670
2671   /* Save the chain for further analysis in SLP detection.  */
2672   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2673   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2674   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2675
2676   return true;
2677 }
2678
2679
2680 /* Function vect_is_simple_reduction
2681
2682    (1) Detect a cross-iteration def-use cycle that represents a simple
2683    reduction computation.  We look for the following pattern:
2684
2685    loop_header:
2686      a1 = phi < a0, a2 >
2687      a3 = ...
2688      a2 = operation (a3, a1)
2689
2690    or
2691
2692    a3 = ...
2693    loop_header:
2694      a1 = phi < a0, a2 >
2695      a2 = operation (a3, a1)
2696
2697    such that:
2698    1. operation is commutative and associative and it is safe to
2699       change the order of the computation
2700    2. no uses for a2 in the loop (a2 is used out of the loop)
2701    3. no uses of a1 in the loop besides the reduction operation
2702    4. no uses of a1 outside the loop.
2703
2704    Conditions 1,4 are tested here.
2705    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2706
2707    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2708    nested cycles.
2709
2710    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2711    reductions:
2712
2713      a1 = phi < a0, a2 >
2714      inner loop (def of a3)
2715      a2 = phi < a3 >
2716
2717    (4) Detect condition expressions, ie:
2718      for (int i = 0; i < N; i++)
2719        if (a[i] < val)
2720         ret_val = a[i];
2721
2722 */
2723
2724 static gimple *
2725 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2726                           bool *double_reduc,
2727                           bool need_wrapping_integral_overflow,
2728                           enum vect_reduction_type *v_reduc_type)
2729 {
2730   struct loop *loop = (gimple_bb (phi))->loop_father;
2731   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2732   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2733   enum tree_code orig_code, code;
2734   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2735   tree type;
2736   int nloop_uses;
2737   tree name;
2738   imm_use_iterator imm_iter;
2739   use_operand_p use_p;
2740   bool phi_def;
2741
2742   *double_reduc = false;
2743   *v_reduc_type = TREE_CODE_REDUCTION;
2744
2745   name = PHI_RESULT (phi);
2746   /* ???  If there are no uses of the PHI result the inner loop reduction
2747      won't be detected as possibly double-reduction by vectorizable_reduction
2748      because that tries to walk the PHI arg from the preheader edge which
2749      can be constant.  See PR60382.  */
2750   if (has_zero_uses (name))
2751     return NULL;
2752   nloop_uses = 0;
2753   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2754     {
2755       gimple *use_stmt = USE_STMT (use_p);
2756       if (is_gimple_debug (use_stmt))
2757         continue;
2758
2759       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2760         {
2761           if (dump_enabled_p ())
2762             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2763                              "intermediate value used outside loop.\n");
2764
2765           return NULL;
2766         }
2767
2768       nloop_uses++;
2769       if (nloop_uses > 1)
2770         {
2771           if (dump_enabled_p ())
2772             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2773                              "reduction value used in loop.\n");
2774           return NULL;
2775         }
2776
2777       phi_use_stmt = use_stmt;
2778     }
2779
2780   edge latch_e = loop_latch_edge (loop);
2781   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2782   if (TREE_CODE (loop_arg) != SSA_NAME)
2783     {
2784       if (dump_enabled_p ())
2785         {
2786           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2787                            "reduction: not ssa_name: ");
2788           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2789           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2790         }
2791       return NULL;
2792     }
2793
2794   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2795   if (is_gimple_assign (def_stmt))
2796     {
2797       name = gimple_assign_lhs (def_stmt);
2798       phi_def = false;
2799     }
2800   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2801     {
2802       name = PHI_RESULT (def_stmt);
2803       phi_def = true;
2804     }
2805   else
2806     {
2807       if (dump_enabled_p ())
2808         {
2809           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2810                            "reduction: unhandled reduction operation: ");
2811           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2812         }
2813       return NULL;
2814     }
2815
2816   nloop_uses = 0;
2817   auto_vec<gphi *, 3> lcphis;
2818   if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2819     FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2820       {
2821         gimple *use_stmt = USE_STMT (use_p);
2822         if (is_gimple_debug (use_stmt))
2823           continue;
2824         if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2825           nloop_uses++;
2826         else
2827           /* We can have more than one loop-closed PHI.  */
2828           lcphis.safe_push (as_a <gphi *> (use_stmt));
2829         if (nloop_uses > 1)
2830           {
2831             if (dump_enabled_p ())
2832               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2833                                "reduction used in loop.\n");
2834             return NULL;
2835           }
2836       }
2837
2838   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2839      defined in the inner loop.  */
2840   if (phi_def)
2841     {
2842       op1 = PHI_ARG_DEF (def_stmt, 0);
2843
2844       if (gimple_phi_num_args (def_stmt) != 1
2845           || TREE_CODE (op1) != SSA_NAME)
2846         {
2847           if (dump_enabled_p ())
2848             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2849                              "unsupported phi node definition.\n");
2850
2851           return NULL;
2852         }
2853
2854       def1 = SSA_NAME_DEF_STMT (op1);
2855       if (gimple_bb (def1)
2856           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2857           && loop->inner
2858           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2859           && is_gimple_assign (def1)
2860           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2861         {
2862           if (dump_enabled_p ())
2863             report_vect_op (MSG_NOTE, def_stmt,
2864                             "detected double reduction: ");
2865
2866           *double_reduc = true;
2867           return def_stmt;
2868         }
2869
2870       return NULL;
2871     }
2872
2873   /* If we are vectorizing an inner reduction we are executing that
2874      in the original order only in case we are not dealing with a
2875      double reduction.  */
2876   bool check_reduction = true;
2877   if (flow_loop_nested_p (vect_loop, loop))
2878     {
2879       gphi *lcphi;
2880       unsigned i;
2881       check_reduction = false;
2882       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2883         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2884           {
2885             gimple *use_stmt = USE_STMT (use_p);
2886             if (is_gimple_debug (use_stmt))
2887               continue;
2888             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2889               check_reduction = true;
2890           }
2891     }
2892
2893   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2894   code = orig_code = gimple_assign_rhs_code (def_stmt);
2895
2896   /* We can handle "res -= x[i]", which is non-associative by
2897      simply rewriting this into "res += -x[i]".  Avoid changing
2898      gimple instruction for the first simple tests and only do this
2899      if we're allowed to change code at all.  */
2900   if (code == MINUS_EXPR
2901       && ! ((op1 = gimple_assign_rhs2 (def_stmt))
2902             && TREE_CODE (op1) == SSA_NAME
2903             && SSA_NAME_DEF_STMT (op1) == phi))
2904     code = PLUS_EXPR;
2905
2906   if (code == COND_EXPR)
2907     {
2908       if (! nested_in_vect_loop)
2909         *v_reduc_type = COND_REDUCTION;
2910
2911       op3 = gimple_assign_rhs1 (def_stmt);
2912       if (COMPARISON_CLASS_P (op3))
2913         {
2914           op4 = TREE_OPERAND (op3, 1);
2915           op3 = TREE_OPERAND (op3, 0);
2916         }
2917
2918       op1 = gimple_assign_rhs2 (def_stmt);
2919       op2 = gimple_assign_rhs3 (def_stmt);
2920     }
2921   else if (!commutative_tree_code (code) || !associative_tree_code (code))
2922     {
2923       if (dump_enabled_p ())
2924         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2925                         "reduction: not commutative/associative: ");
2926       return NULL;
2927     }
2928   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2929     {
2930       op1 = gimple_assign_rhs1 (def_stmt);
2931       op2 = gimple_assign_rhs2 (def_stmt);
2932     }
2933   else
2934     {
2935       if (dump_enabled_p ())
2936         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2937                         "reduction: not handled operation: ");
2938       return NULL;
2939     }
2940
2941   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2942     {
2943       if (dump_enabled_p ())
2944         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2945                         "reduction: both uses not ssa_names: ");
2946
2947       return NULL;
2948     }
2949
2950   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2951   if ((TREE_CODE (op1) == SSA_NAME
2952        && !types_compatible_p (type,TREE_TYPE (op1)))
2953       || (TREE_CODE (op2) == SSA_NAME
2954           && !types_compatible_p (type, TREE_TYPE (op2)))
2955       || (op3 && TREE_CODE (op3) == SSA_NAME
2956           && !types_compatible_p (type, TREE_TYPE (op3)))
2957       || (op4 && TREE_CODE (op4) == SSA_NAME
2958           && !types_compatible_p (type, TREE_TYPE (op4))))
2959     {
2960       if (dump_enabled_p ())
2961         {
2962           dump_printf_loc (MSG_NOTE, vect_location,
2963                            "reduction: multiple types: operation type: ");
2964           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2965           dump_printf (MSG_NOTE, ", operands types: ");
2966           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2967                              TREE_TYPE (op1));
2968           dump_printf (MSG_NOTE, ",");
2969           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2970                              TREE_TYPE (op2));
2971           if (op3)
2972             {
2973               dump_printf (MSG_NOTE, ",");
2974               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2975                                  TREE_TYPE (op3));
2976             }
2977
2978           if (op4)
2979             {
2980               dump_printf (MSG_NOTE, ",");
2981               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2982                                  TREE_TYPE (op4));
2983             }
2984           dump_printf (MSG_NOTE, "\n");
2985         }
2986
2987       return NULL;
2988     }
2989
2990   /* Check that it's ok to change the order of the computation.
2991      Generally, when vectorizing a reduction we change the order of the
2992      computation.  This may change the behavior of the program in some
2993      cases, so we need to check that this is ok.  One exception is when
2994      vectorizing an outer-loop: the inner-loop is executed sequentially,
2995      and therefore vectorizing reductions in the inner-loop during
2996      outer-loop vectorization is safe.  */
2997
2998   if (*v_reduc_type != COND_REDUCTION
2999       && check_reduction)
3000     {
3001       /* CHECKME: check for !flag_finite_math_only too?  */
3002       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
3003         {
3004           /* Changing the order of operations changes the semantics.  */
3005           if (dump_enabled_p ())
3006             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3007                         "reduction: unsafe fp math optimization: ");
3008           return NULL;
3009         }
3010       else if (INTEGRAL_TYPE_P (type))
3011         {
3012           if (!operation_no_trapping_overflow (type, code))
3013             {
3014               /* Changing the order of operations changes the semantics.  */
3015               if (dump_enabled_p ())
3016                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3017                                 "reduction: unsafe int math optimization"
3018                                 " (overflow traps): ");
3019               return NULL;
3020             }
3021           if (need_wrapping_integral_overflow
3022               && !TYPE_OVERFLOW_WRAPS (type)
3023               && operation_can_overflow (code))
3024             {
3025               /* Changing the order of operations changes the semantics.  */
3026               if (dump_enabled_p ())
3027                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3028                                 "reduction: unsafe int math optimization"
3029                                 " (overflow doesn't wrap): ");
3030               return NULL;
3031             }
3032         }
3033       else if (SAT_FIXED_POINT_TYPE_P (type))
3034         {
3035           /* Changing the order of operations changes the semantics.  */
3036           if (dump_enabled_p ())
3037           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3038                           "reduction: unsafe fixed-point math optimization: ");
3039           return NULL;
3040         }
3041     }
3042
3043   /* Reduction is safe. We're dealing with one of the following:
3044      1) integer arithmetic and no trapv
3045      2) floating point arithmetic, and special flags permit this optimization
3046      3) nested cycle (i.e., outer loop vectorization).  */
3047   if (TREE_CODE (op1) == SSA_NAME)
3048     def1 = SSA_NAME_DEF_STMT (op1);
3049
3050   if (TREE_CODE (op2) == SSA_NAME)
3051     def2 = SSA_NAME_DEF_STMT (op2);
3052
3053   if (code != COND_EXPR
3054       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3055     {
3056       if (dump_enabled_p ())
3057         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3058       return NULL;
3059     }
3060
3061   /* Check that one def is the reduction def, defined by PHI,
3062      the other def is either defined in the loop ("vect_internal_def"),
3063      or it's an induction (defined by a loop-header phi-node).  */
3064
3065   if (def2 && def2 == phi
3066       && (code == COND_EXPR
3067           || !def1 || gimple_nop_p (def1)
3068           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3069           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3070               && (is_gimple_assign (def1)
3071                   || is_gimple_call (def1)
3072                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3073                       == vect_induction_def
3074                   || (gimple_code (def1) == GIMPLE_PHI
3075                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3076                           == vect_internal_def
3077                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3078     {
3079       if (dump_enabled_p ())
3080         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3081       return def_stmt;
3082     }
3083
3084   if (def1 && def1 == phi
3085       && (code == COND_EXPR
3086           || !def2 || gimple_nop_p (def2)
3087           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3088           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3089               && (is_gimple_assign (def2)
3090                   || is_gimple_call (def2)
3091                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3092                        == vect_induction_def
3093                   || (gimple_code (def2) == GIMPLE_PHI
3094                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3095                            == vect_internal_def
3096                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3097     {
3098       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3099         {
3100           /* Check if we can swap operands (just for simplicity - so that
3101              the rest of the code can assume that the reduction variable
3102              is always the last (second) argument).  */
3103           if (code == COND_EXPR)
3104             {
3105               /* Swap cond_expr by inverting the condition.  */
3106               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3107               enum tree_code invert_code = ERROR_MARK;
3108               enum tree_code cond_code = TREE_CODE (cond_expr);
3109
3110               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3111                 {
3112                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3113                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3114                 }
3115               if (invert_code != ERROR_MARK)
3116                 {
3117                   TREE_SET_CODE (cond_expr, invert_code);
3118                   swap_ssa_operands (def_stmt,
3119                                      gimple_assign_rhs2_ptr (def_stmt),
3120                                      gimple_assign_rhs3_ptr (def_stmt));
3121                 }
3122               else
3123                 {
3124                   if (dump_enabled_p ())
3125                     report_vect_op (MSG_NOTE, def_stmt,
3126                                     "detected reduction: cannot swap operands "
3127                                     "for cond_expr");
3128                   return NULL;
3129                 }
3130             }
3131           else
3132             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3133                                gimple_assign_rhs2_ptr (def_stmt));
3134
3135           if (dump_enabled_p ())
3136             report_vect_op (MSG_NOTE, def_stmt,
3137                             "detected reduction: need to swap operands: ");
3138
3139           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3140             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3141         }
3142       else
3143         {
3144           if (dump_enabled_p ())
3145             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3146         }
3147
3148       return def_stmt;
3149     }
3150
3151   /* Try to find SLP reduction chain.  */
3152   if (! nested_in_vect_loop
3153       && code != COND_EXPR
3154       && orig_code != MINUS_EXPR
3155       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3156     {
3157       if (dump_enabled_p ())
3158         report_vect_op (MSG_NOTE, def_stmt,
3159                         "reduction: detected reduction chain: ");
3160
3161       return def_stmt;
3162     }
3163
3164   /* Dissolve group eventually half-built by vect_is_slp_reduction.  */
3165   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3166   while (first)
3167     {
3168       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3169       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3170       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3171       first = next;
3172     }
3173
3174   /* Look for the expression computing loop_arg from loop PHI result.  */
3175   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3176   auto_bitmap visited;
3177   tree lookfor = PHI_RESULT (phi);
3178   ssa_op_iter curri;
3179   use_operand_p curr = op_iter_init_phiuse (&curri, as_a <gphi *>(phi),
3180                                             SSA_OP_USE);
3181   while (USE_FROM_PTR (curr) != loop_arg)
3182     curr = op_iter_next_use (&curri);
3183   curri.i = curri.numops;
3184   do
3185     {
3186       path.safe_push (std::make_pair (curri, curr));
3187       tree use = USE_FROM_PTR (curr);
3188       if (use == lookfor)
3189         break;
3190       gimple *def = SSA_NAME_DEF_STMT (use);
3191       if (gimple_nop_p (def)
3192           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3193         {
3194 pop:
3195           do
3196             {
3197               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3198               curri = x.first;
3199               curr = x.second;
3200               do
3201                 curr = op_iter_next_use (&curri);
3202               /* Skip already visited or non-SSA operands (from iterating
3203                  over PHI args).  */
3204               while (curr != NULL_USE_OPERAND_P
3205                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3206                          || ! bitmap_set_bit (visited,
3207                                               SSA_NAME_VERSION
3208                                                 (USE_FROM_PTR (curr)))));
3209             }
3210           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3211           if (curr == NULL_USE_OPERAND_P)
3212             break;
3213         }
3214       else
3215         {
3216           if (gimple_code (def) == GIMPLE_PHI)
3217             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3218           else
3219             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3220           while (curr != NULL_USE_OPERAND_P
3221                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3222                      || ! bitmap_set_bit (visited,
3223                                           SSA_NAME_VERSION
3224                                             (USE_FROM_PTR (curr)))))
3225             curr = op_iter_next_use (&curri);
3226           if (curr == NULL_USE_OPERAND_P)
3227             goto pop;
3228         }
3229     }
3230   while (1);
3231   if (dump_file && (dump_flags & TDF_DETAILS))
3232     {
3233       dump_printf_loc (MSG_NOTE, vect_location,
3234                        "reduction path: ");
3235       unsigned i;
3236       std::pair<ssa_op_iter, use_operand_p> *x;
3237       FOR_EACH_VEC_ELT (path, i, x)
3238         {
3239           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3240           dump_printf (MSG_NOTE, " ");
3241         }
3242       dump_printf (MSG_NOTE, "\n");
3243     }
3244
3245   /* Check whether the reduction path detected is valid.  */
3246   bool fail = path.length () == 0;
3247   bool neg = false;
3248   for (unsigned i = 1; i < path.length (); ++i)
3249     {
3250       gimple *use_stmt = USE_STMT (path[i].second);
3251       tree op = USE_FROM_PTR (path[i].second);
3252       if (! has_single_use (op)
3253           || ! is_gimple_assign (use_stmt))
3254         {
3255           fail = true;
3256           break;
3257         }
3258       if (gimple_assign_rhs_code (use_stmt) != code)
3259         {
3260           if (code == PLUS_EXPR
3261               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3262             {
3263               /* Track whether we negate the reduction value each iteration.  */
3264               if (gimple_assign_rhs2 (use_stmt) == op)
3265                 neg = ! neg;
3266             }
3267           else
3268             {
3269               fail = true;
3270               break;
3271             }
3272         }
3273     }
3274   if (! fail && ! neg)
3275     return def_stmt;
3276
3277   if (dump_enabled_p ())
3278     {
3279       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3280                       "reduction: unknown pattern: ");
3281     }
3282
3283   return NULL;
3284 }
3285
3286 /* Wrapper around vect_is_simple_reduction, which will modify code
3287    in-place if it enables detection of more reductions.  Arguments
3288    as there.  */
3289
3290 gimple *
3291 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3292                              bool *double_reduc,
3293                              bool need_wrapping_integral_overflow)
3294 {
3295   enum vect_reduction_type v_reduc_type;
3296   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3297                                           need_wrapping_integral_overflow,
3298                                           &v_reduc_type);
3299   if (def)
3300     {
3301       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3302       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3303       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3304       reduc_def_info = vinfo_for_stmt (def);
3305       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3306     }
3307   return def;
3308 }
3309
3310 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3311 int
3312 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3313                              int *peel_iters_epilogue,
3314                              stmt_vector_for_cost *scalar_cost_vec,
3315                              stmt_vector_for_cost *prologue_cost_vec,
3316                              stmt_vector_for_cost *epilogue_cost_vec)
3317 {
3318   int retval = 0;
3319   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3320
3321   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3322     {
3323       *peel_iters_epilogue = vf/2;
3324       if (dump_enabled_p ())
3325         dump_printf_loc (MSG_NOTE, vect_location,
3326                          "cost model: epilogue peel iters set to vf/2 "
3327                          "because loop iterations are unknown .\n");
3328
3329       /* If peeled iterations are known but number of scalar loop
3330          iterations are unknown, count a taken branch per peeled loop.  */
3331       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3332                                  NULL, 0, vect_prologue);
3333       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3334                                  NULL, 0, vect_epilogue);
3335     }
3336   else
3337     {
3338       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3339       peel_iters_prologue = niters < peel_iters_prologue ?
3340                             niters : peel_iters_prologue;
3341       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3342       /* If we need to peel for gaps, but no peeling is required, we have to
3343          peel VF iterations.  */
3344       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3345         *peel_iters_epilogue = vf;
3346     }
3347
3348   stmt_info_for_cost *si;
3349   int j;
3350   if (peel_iters_prologue)
3351     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3352         {
3353           stmt_vec_info stmt_info
3354             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3355           retval += record_stmt_cost (prologue_cost_vec,
3356                                       si->count * peel_iters_prologue,
3357                                       si->kind, stmt_info, si->misalign,
3358                                       vect_prologue);
3359         }
3360   if (*peel_iters_epilogue)
3361     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3362         {
3363           stmt_vec_info stmt_info
3364             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3365           retval += record_stmt_cost (epilogue_cost_vec,
3366                                       si->count * *peel_iters_epilogue,
3367                                       si->kind, stmt_info, si->misalign,
3368                                       vect_epilogue);
3369         }
3370
3371   return retval;
3372 }
3373
3374 /* Function vect_estimate_min_profitable_iters
3375
3376    Return the number of iterations required for the vector version of the
3377    loop to be profitable relative to the cost of the scalar version of the
3378    loop.
3379
3380    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3381    of iterations for vectorization.  -1 value means loop vectorization
3382    is not profitable.  This returned value may be used for dynamic
3383    profitability check.
3384
3385    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3386    for static check against estimated number of iterations.  */
3387
3388 static void
3389 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3390                                     int *ret_min_profitable_niters,
3391                                     int *ret_min_profitable_estimate)
3392 {
3393   int min_profitable_iters;
3394   int min_profitable_estimate;
3395   int peel_iters_prologue;
3396   int peel_iters_epilogue;
3397   unsigned vec_inside_cost = 0;
3398   int vec_outside_cost = 0;
3399   unsigned vec_prologue_cost = 0;
3400   unsigned vec_epilogue_cost = 0;
3401   int scalar_single_iter_cost = 0;
3402   int scalar_outside_cost = 0;
3403   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3404   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3405   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3406
3407   /* Cost model disabled.  */
3408   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3409     {
3410       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3411       *ret_min_profitable_niters = 0;
3412       *ret_min_profitable_estimate = 0;
3413       return;
3414     }
3415
3416   /* Requires loop versioning tests to handle misalignment.  */
3417   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3418     {
3419       /*  FIXME: Make cost depend on complexity of individual check.  */
3420       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3421       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3422                             vect_prologue);
3423       dump_printf (MSG_NOTE,
3424                    "cost model: Adding cost of checks for loop "
3425                    "versioning to treat misalignment.\n");
3426     }
3427
3428   /* Requires loop versioning with alias checks.  */
3429   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3430     {
3431       /*  FIXME: Make cost depend on complexity of individual check.  */
3432       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3433       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3434                             vect_prologue);
3435       dump_printf (MSG_NOTE,
3436                    "cost model: Adding cost of checks for loop "
3437                    "versioning aliasing.\n");
3438     }
3439
3440   /* Requires loop versioning with niter checks.  */
3441   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3442     {
3443       /*  FIXME: Make cost depend on complexity of individual check.  */
3444       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3445                             vect_prologue);
3446       dump_printf (MSG_NOTE,
3447                    "cost model: Adding cost of checks for loop "
3448                    "versioning niters.\n");
3449     }
3450
3451   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3452     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3453                           vect_prologue);
3454
3455   /* Count statements in scalar loop.  Using this as scalar cost for a single
3456      iteration for now.
3457
3458      TODO: Add outer loop support.
3459
3460      TODO: Consider assigning different costs to different scalar
3461      statements.  */
3462
3463   scalar_single_iter_cost
3464     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3465
3466   /* Add additional cost for the peeled instructions in prologue and epilogue
3467      loop.
3468
3469      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3470      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3471
3472      TODO: Build an expression that represents peel_iters for prologue and
3473      epilogue to be used in a run-time test.  */
3474
3475   if (npeel  < 0)
3476     {
3477       peel_iters_prologue = vf/2;
3478       dump_printf (MSG_NOTE, "cost model: "
3479                    "prologue peel iters set to vf/2.\n");
3480
3481       /* If peeling for alignment is unknown, loop bound of main loop becomes
3482          unknown.  */
3483       peel_iters_epilogue = vf/2;
3484       dump_printf (MSG_NOTE, "cost model: "
3485                    "epilogue peel iters set to vf/2 because "
3486                    "peeling for alignment is unknown.\n");
3487
3488       /* If peeled iterations are unknown, count a taken branch and a not taken
3489          branch per peeled loop. Even if scalar loop iterations are known,
3490          vector iterations are not known since peeled prologue iterations are
3491          not known. Hence guards remain the same.  */
3492       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3493                             NULL, 0, vect_prologue);
3494       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3495                             NULL, 0, vect_prologue);
3496       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3497                             NULL, 0, vect_epilogue);
3498       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3499                             NULL, 0, vect_epilogue);
3500       stmt_info_for_cost *si;
3501       int j;
3502       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3503         {
3504           struct _stmt_vec_info *stmt_info
3505             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3506           (void) add_stmt_cost (target_cost_data,
3507                                 si->count * peel_iters_prologue,
3508                                 si->kind, stmt_info, si->misalign,
3509                                 vect_prologue);
3510           (void) add_stmt_cost (target_cost_data,
3511                                 si->count * peel_iters_epilogue,
3512                                 si->kind, stmt_info, si->misalign,
3513                                 vect_epilogue);
3514         }
3515     }
3516   else
3517     {
3518       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3519       stmt_info_for_cost *si;
3520       int j;
3521       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3522
3523       prologue_cost_vec.create (2);
3524       epilogue_cost_vec.create (2);
3525       peel_iters_prologue = npeel;
3526
3527       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3528                                           &peel_iters_epilogue,
3529                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3530                                             (loop_vinfo),
3531                                           &prologue_cost_vec,
3532                                           &epilogue_cost_vec);
3533
3534       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3535         {
3536           struct _stmt_vec_info *stmt_info
3537             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3538           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3539                                 si->misalign, vect_prologue);
3540         }
3541
3542       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3543         {
3544           struct _stmt_vec_info *stmt_info
3545             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3546           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3547                                 si->misalign, vect_epilogue);
3548         }
3549
3550       prologue_cost_vec.release ();
3551       epilogue_cost_vec.release ();
3552     }
3553
3554   /* FORNOW: The scalar outside cost is incremented in one of the
3555      following ways:
3556
3557      1. The vectorizer checks for alignment and aliasing and generates
3558      a condition that allows dynamic vectorization.  A cost model
3559      check is ANDED with the versioning condition.  Hence scalar code
3560      path now has the added cost of the versioning check.
3561
3562        if (cost > th & versioning_check)
3563          jmp to vector code
3564
3565      Hence run-time scalar is incremented by not-taken branch cost.
3566
3567      2. The vectorizer then checks if a prologue is required.  If the
3568      cost model check was not done before during versioning, it has to
3569      be done before the prologue check.
3570
3571        if (cost <= th)
3572          prologue = scalar_iters
3573        if (prologue == 0)
3574          jmp to vector code
3575        else
3576          execute prologue
3577        if (prologue == num_iters)
3578          go to exit
3579
3580      Hence the run-time scalar cost is incremented by a taken branch,
3581      plus a not-taken branch, plus a taken branch cost.
3582
3583      3. The vectorizer then checks if an epilogue is required.  If the
3584      cost model check was not done before during prologue check, it
3585      has to be done with the epilogue check.
3586
3587        if (prologue == 0)
3588          jmp to vector code
3589        else
3590          execute prologue
3591        if (prologue == num_iters)
3592          go to exit
3593        vector code:
3594          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3595            jmp to epilogue
3596
3597      Hence the run-time scalar cost should be incremented by 2 taken
3598      branches.
3599
3600      TODO: The back end may reorder the BBS's differently and reverse
3601      conditions/branch directions.  Change the estimates below to
3602      something more reasonable.  */
3603
3604   /* If the number of iterations is known and we do not do versioning, we can
3605      decide whether to vectorize at compile time.  Hence the scalar version
3606      do not carry cost model guard costs.  */
3607   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3608       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3609     {
3610       /* Cost model check occurs at versioning.  */
3611       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3612         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3613       else
3614         {
3615           /* Cost model check occurs at prologue generation.  */
3616           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3617             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3618               + vect_get_stmt_cost (cond_branch_not_taken);
3619           /* Cost model check occurs at epilogue generation.  */
3620           else
3621             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3622         }
3623     }
3624
3625   /* Complete the target-specific cost calculations.  */
3626   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3627                &vec_inside_cost, &vec_epilogue_cost);
3628
3629   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3630
3631   if (dump_enabled_p ())
3632     {
3633       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3634       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3635                    vec_inside_cost);
3636       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3637                    vec_prologue_cost);
3638       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3639                    vec_epilogue_cost);
3640       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3641                    scalar_single_iter_cost);
3642       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3643                    scalar_outside_cost);
3644       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3645                    vec_outside_cost);
3646       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3647                    peel_iters_prologue);
3648       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3649                    peel_iters_epilogue);
3650     }
3651
3652   /* Calculate number of iterations required to make the vector version
3653      profitable, relative to the loop bodies only.  The following condition
3654      must hold true:
3655      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3656      where
3657      SIC = scalar iteration cost, VIC = vector iteration cost,
3658      VOC = vector outside cost, VF = vectorization factor,
3659      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3660      SOC = scalar outside cost for run time cost model check.  */
3661
3662   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3663     {
3664       if (vec_outside_cost <= 0)
3665         min_profitable_iters = 0;
3666       else
3667         {
3668           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3669                                   - vec_inside_cost * peel_iters_prologue
3670                                   - vec_inside_cost * peel_iters_epilogue)
3671                                  / ((scalar_single_iter_cost * vf)
3672                                     - vec_inside_cost);
3673
3674           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3675               <= (((int) vec_inside_cost * min_profitable_iters)
3676                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3677             min_profitable_iters++;
3678         }
3679     }
3680   /* vector version will never be profitable.  */
3681   else
3682     {
3683       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3684         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3685                     "did not happen for a simd loop");
3686
3687       if (dump_enabled_p ())
3688         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3689                          "cost model: the vector iteration cost = %d "
3690                          "divided by the scalar iteration cost = %d "
3691                          "is greater or equal to the vectorization factor = %d"
3692                          ".\n",
3693                          vec_inside_cost, scalar_single_iter_cost, vf);
3694       *ret_min_profitable_niters = -1;
3695       *ret_min_profitable_estimate = -1;
3696       return;
3697     }
3698
3699   dump_printf (MSG_NOTE,
3700                "  Calculated minimum iters for profitability: %d\n",
3701                min_profitable_iters);
3702
3703   /* We want the vectorized loop to execute at least once.  */
3704   if (min_profitable_iters < (vf + peel_iters_prologue + peel_iters_epilogue))
3705     min_profitable_iters = vf + peel_iters_prologue + peel_iters_epilogue;
3706
3707   if (dump_enabled_p ())
3708     dump_printf_loc (MSG_NOTE, vect_location,
3709                      "  Runtime profitability threshold = %d\n",
3710                      min_profitable_iters);
3711
3712   *ret_min_profitable_niters = min_profitable_iters;
3713
3714   /* Calculate number of iterations required to make the vector version
3715      profitable, relative to the loop bodies only.
3716
3717      Non-vectorized variant is SIC * niters and it must win over vector
3718      variant on the expected loop trip count.  The following condition must hold true:
3719      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3720
3721   if (vec_outside_cost <= 0)
3722     min_profitable_estimate = 0;
3723   else
3724     {
3725       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3726                                  - vec_inside_cost * peel_iters_prologue
3727                                  - vec_inside_cost * peel_iters_epilogue)
3728                                  / ((scalar_single_iter_cost * vf)
3729                                    - vec_inside_cost);
3730     }
3731   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3732   if (dump_enabled_p ())
3733     dump_printf_loc (MSG_NOTE, vect_location,
3734                      "  Static estimate profitability threshold = %d\n",
3735                      min_profitable_estimate);
3736
3737   *ret_min_profitable_estimate = min_profitable_estimate;
3738 }
3739
3740 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3741    vector elements (not bits) for a vector of mode MODE.  */
3742 static void
3743 calc_vec_perm_mask_for_shift (machine_mode mode, unsigned int offset,
3744                               unsigned char *sel)
3745 {
3746   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3747
3748   for (i = 0; i < nelt; i++)
3749     sel[i] = (i + offset) & (2*nelt - 1);
3750 }
3751
3752 /* Checks whether the target supports whole-vector shifts for vectors of mode
3753    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3754    it supports vec_perm_const with masks for all necessary shift amounts.  */
3755 static bool
3756 have_whole_vector_shift (machine_mode mode)
3757 {
3758   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3759     return true;
3760
3761   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3762     return false;
3763
3764   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3765   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3766
3767   for (i = nelt/2; i >= 1; i/=2)
3768     {
3769       calc_vec_perm_mask_for_shift (mode, i, sel);
3770       if (!can_vec_perm_p (mode, false, sel))
3771         return false;
3772     }
3773   return true;
3774 }
3775
3776 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3777    functions. Design better to avoid maintenance issues.  */
3778
3779 /* Function vect_model_reduction_cost.
3780
3781    Models cost for a reduction operation, including the vector ops
3782    generated within the strip-mine loop, the initial definition before
3783    the loop, and the epilogue code that must be generated.  */
3784
3785 static void
3786 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3787                            int ncopies)
3788 {
3789   int prologue_cost = 0, epilogue_cost = 0;
3790   enum tree_code code;
3791   optab optab;
3792   tree vectype;
3793   gimple *orig_stmt;
3794   machine_mode mode;
3795   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3796   struct loop *loop = NULL;
3797   void *target_cost_data;
3798
3799   if (loop_vinfo)
3800     {
3801       loop = LOOP_VINFO_LOOP (loop_vinfo);
3802       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3803     }
3804   else
3805     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3806
3807   /* Condition reductions generate two reductions in the loop.  */
3808   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3809     ncopies *= 2;
3810
3811   /* Cost of reduction op inside loop.  */
3812   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3813                                         stmt_info, 0, vect_body);
3814
3815   vectype = STMT_VINFO_VECTYPE (stmt_info);
3816   mode = TYPE_MODE (vectype);
3817   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3818
3819   if (!orig_stmt)
3820     orig_stmt = STMT_VINFO_STMT (stmt_info);
3821
3822   code = gimple_assign_rhs_code (orig_stmt);
3823
3824   /* Add in cost for initial definition.
3825      For cond reduction we have four vectors: initial index, step, initial
3826      result of the data reduction, initial value of the index reduction.  */
3827   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3828                        == COND_REDUCTION ? 4 : 1;
3829   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3830                                   scalar_to_vec, stmt_info, 0,
3831                                   vect_prologue);
3832
3833   /* Determine cost of epilogue code.
3834
3835      We have a reduction operator that will reduce the vector in one statement.
3836      Also requires scalar extract.  */
3837
3838   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3839     {
3840       if (reduc_code != ERROR_MARK)
3841         {
3842           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3843             {
3844               /* An EQ stmt and an COND_EXPR stmt.  */
3845               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3846                                               vector_stmt, stmt_info, 0,
3847                                               vect_epilogue);
3848               /* Reduction of the max index and a reduction of the found
3849                  values.  */
3850               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3851                                               vec_to_scalar, stmt_info, 0,
3852                                               vect_epilogue);
3853               /* A broadcast of the max value.  */
3854               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3855                                               scalar_to_vec, stmt_info, 0,
3856                                               vect_epilogue);
3857             }
3858           else
3859             {
3860               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3861                                               stmt_info, 0, vect_epilogue);
3862               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3863                                               vec_to_scalar, stmt_info, 0,
3864                                               vect_epilogue);
3865             }
3866         }
3867       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3868         {
3869           unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
3870           /* Extraction of scalar elements.  */
3871           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits,
3872                                           vec_to_scalar, stmt_info, 0,
3873                                           vect_epilogue);
3874           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3875           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits - 3,
3876                                           scalar_stmt, stmt_info, 0,
3877                                           vect_epilogue);
3878         }
3879       else
3880         {
3881           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3882           tree bitsize =
3883             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3884           int element_bitsize = tree_to_uhwi (bitsize);
3885           int nelements = vec_size_in_bits / element_bitsize;
3886
3887           if (code == COND_EXPR)
3888             code = MAX_EXPR;
3889
3890           optab = optab_for_tree_code (code, vectype, optab_default);
3891
3892           /* We have a whole vector shift available.  */
3893           if (optab != unknown_optab
3894               && VECTOR_MODE_P (mode)
3895               && optab_handler (optab, mode) != CODE_FOR_nothing
3896               && have_whole_vector_shift (mode))
3897             {
3898               /* Final reduction via vector shifts and the reduction operator.
3899                  Also requires scalar extract.  */
3900               epilogue_cost += add_stmt_cost (target_cost_data,
3901                                               exact_log2 (nelements) * 2,
3902                                               vector_stmt, stmt_info, 0,
3903                                               vect_epilogue);
3904               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3905                                               vec_to_scalar, stmt_info, 0,
3906                                               vect_epilogue);
3907             }
3908           else
3909             /* Use extracts and reduction op for final reduction.  For N
3910                elements, we have N extracts and N-1 reduction ops.  */
3911             epilogue_cost += add_stmt_cost (target_cost_data,
3912                                             nelements + nelements - 1,
3913                                             vector_stmt, stmt_info, 0,
3914                                             vect_epilogue);
3915         }
3916     }
3917
3918   if (dump_enabled_p ())
3919     dump_printf (MSG_NOTE,
3920                  "vect_model_reduction_cost: inside_cost = %d, "
3921                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3922                  prologue_cost, epilogue_cost);
3923 }
3924
3925
3926 /* Function vect_model_induction_cost.
3927
3928    Models cost for induction operations.  */
3929
3930 static void
3931 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3932 {
3933   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3934   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3935   unsigned inside_cost, prologue_cost;
3936
3937   if (PURE_SLP_STMT (stmt_info))
3938     return;
3939
3940   /* loop cost for vec_loop.  */
3941   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3942                                stmt_info, 0, vect_body);
3943
3944   /* prologue cost for vec_init and vec_step.  */
3945   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3946                                  stmt_info, 0, vect_prologue);
3947
3948   if (dump_enabled_p ())
3949     dump_printf_loc (MSG_NOTE, vect_location,
3950                      "vect_model_induction_cost: inside_cost = %d, "
3951                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3952 }
3953
3954
3955
3956 /* Function get_initial_def_for_reduction
3957
3958    Input:
3959    STMT - a stmt that performs a reduction operation in the loop.
3960    INIT_VAL - the initial value of the reduction variable
3961
3962    Output:
3963    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3964         of the reduction (used for adjusting the epilog - see below).
3965    Return a vector variable, initialized according to the operation that STMT
3966         performs. This vector will be used as the initial value of the
3967         vector of partial results.
3968
3969    Option1 (adjust in epilog): Initialize the vector as follows:
3970      add/bit or/xor:    [0,0,...,0,0]
3971      mult/bit and:      [1,1,...,1,1]
3972      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3973    and when necessary (e.g. add/mult case) let the caller know
3974    that it needs to adjust the result by init_val.
3975
3976    Option2: Initialize the vector as follows:
3977      add/bit or/xor:    [init_val,0,0,...,0]
3978      mult/bit and:      [init_val,1,1,...,1]
3979      min/max/cond_expr: [init_val,init_val,...,init_val]
3980    and no adjustments are needed.
3981
3982    For example, for the following code:
3983
3984    s = init_val;
3985    for (i=0;i<n;i++)
3986      s = s + a[i];
3987
3988    STMT is 's = s + a[i]', and the reduction variable is 's'.
3989    For a vector of 4 units, we want to return either [0,0,0,init_val],
3990    or [0,0,0,0] and let the caller know that it needs to adjust
3991    the result at the end by 'init_val'.
3992
3993    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3994    initialization vector is simpler (same element in all entries), if
3995    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3996
3997    A cost model should help decide between these two schemes.  */
3998
3999 tree
4000 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4001                                tree *adjustment_def)
4002 {
4003   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4004   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4005   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4006   tree scalar_type = TREE_TYPE (init_val);
4007   tree vectype = get_vectype_for_scalar_type (scalar_type);
4008   int nunits;
4009   enum tree_code code = gimple_assign_rhs_code (stmt);
4010   tree def_for_init;
4011   tree init_def;
4012   tree *elts;
4013   int i;
4014   bool nested_in_vect_loop = false;
4015   REAL_VALUE_TYPE real_init_val = dconst0;
4016   int int_init_val = 0;
4017   gimple *def_stmt = NULL;
4018   gimple_seq stmts = NULL;
4019
4020   gcc_assert (vectype);
4021   nunits = TYPE_VECTOR_SUBPARTS (vectype);
4022
4023   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4024               || SCALAR_FLOAT_TYPE_P (scalar_type));
4025
4026   if (nested_in_vect_loop_p (loop, stmt))
4027     nested_in_vect_loop = true;
4028   else
4029     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4030
4031   /* In case of double reduction we only create a vector variable to be put
4032      in the reduction phi node.  The actual statement creation is done in
4033      vect_create_epilog_for_reduction.  */
4034   if (adjustment_def && nested_in_vect_loop
4035       && TREE_CODE (init_val) == SSA_NAME
4036       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4037       && gimple_code (def_stmt) == GIMPLE_PHI
4038       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4039       && vinfo_for_stmt (def_stmt)
4040       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4041           == vect_double_reduction_def)
4042     {
4043       *adjustment_def = NULL;
4044       return vect_create_destination_var (init_val, vectype);
4045     }
4046
4047   /* In case of a nested reduction do not use an adjustment def as
4048      that case is not supported by the epilogue generation correctly
4049      if ncopies is not one.  */
4050   if (adjustment_def && nested_in_vect_loop)
4051     {
4052       *adjustment_def = NULL;
4053       return vect_get_vec_def_for_operand (init_val, stmt);
4054     }
4055
4056   switch (code)
4057     {
4058       case WIDEN_SUM_EXPR:
4059       case DOT_PROD_EXPR:
4060       case SAD_EXPR:
4061       case PLUS_EXPR:
4062       case MINUS_EXPR:
4063       case BIT_IOR_EXPR:
4064       case BIT_XOR_EXPR:
4065       case MULT_EXPR:
4066       case BIT_AND_EXPR:
4067         /* ADJUSMENT_DEF is NULL when called from
4068            vect_create_epilog_for_reduction to vectorize double reduction.  */
4069         if (adjustment_def)
4070           *adjustment_def = init_val;
4071
4072         if (code == MULT_EXPR)
4073           {
4074             real_init_val = dconst1;
4075             int_init_val = 1;
4076           }
4077
4078         if (code == BIT_AND_EXPR)
4079           int_init_val = -1;
4080
4081         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4082           def_for_init = build_real (scalar_type, real_init_val);
4083         else
4084           def_for_init = build_int_cst (scalar_type, int_init_val);
4085
4086         /* Create a vector of '0' or '1' except the first element.  */
4087         elts = XALLOCAVEC (tree, nunits);
4088         for (i = nunits - 2; i >= 0; --i)
4089           elts[i + 1] = def_for_init;
4090
4091         /* Option1: the first element is '0' or '1' as well.  */
4092         if (adjustment_def)
4093           {
4094             elts[0] = def_for_init;
4095             init_def = build_vector (vectype, elts);
4096             break;
4097           }
4098
4099         /* Option2: the first element is INIT_VAL.  */
4100         elts[0] = init_val;
4101         if (TREE_CONSTANT (init_val))
4102           init_def = build_vector (vectype, elts);
4103         else
4104           {
4105             vec<constructor_elt, va_gc> *v;
4106             vec_alloc (v, nunits);
4107             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4108             for (i = 1; i < nunits; ++i)
4109               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4110             init_def = build_constructor (vectype, v);
4111           }
4112
4113         break;
4114
4115       case MIN_EXPR:
4116       case MAX_EXPR:
4117       case COND_EXPR:
4118         if (adjustment_def)
4119           {
4120             *adjustment_def = NULL_TREE;
4121             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4122               {
4123                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4124                 break;
4125               }
4126           }
4127         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4128         if (! gimple_seq_empty_p (stmts))
4129           gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4130         init_def = build_vector_from_val (vectype, init_val);
4131         break;
4132
4133       default:
4134         gcc_unreachable ();
4135     }
4136
4137   return init_def;
4138 }
4139
4140 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4141    NUMBER_OF_VECTORS is the number of vector defs to create.  */
4142
4143 static void
4144 get_initial_defs_for_reduction (slp_tree slp_node,
4145                                 vec<tree> *vec_oprnds,
4146                                 unsigned int number_of_vectors,
4147                                 enum tree_code code, bool reduc_chain)
4148 {
4149   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4150   gimple *stmt = stmts[0];
4151   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4152   unsigned nunits;
4153   tree vec_cst;
4154   tree *elts;
4155   unsigned j, number_of_places_left_in_vector;
4156   tree vector_type, scalar_type;
4157   tree vop;
4158   int group_size = stmts.length ();
4159   unsigned int vec_num, i;
4160   unsigned number_of_copies = 1;
4161   vec<tree> voprnds;
4162   voprnds.create (number_of_vectors);
4163   bool constant_p;
4164   tree neutral_op = NULL;
4165   struct loop *loop;
4166   gimple_seq ctor_seq = NULL;
4167
4168   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4169   scalar_type = TREE_TYPE (vector_type);
4170   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4171
4172   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4173
4174   loop = (gimple_bb (stmt))->loop_father;
4175   gcc_assert (loop);
4176
4177   /* op is the reduction operand of the first stmt already.  */
4178   /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4179      we need either neutral operands or the original operands.  See
4180      get_initial_def_for_reduction() for details.  */
4181   switch (code)
4182     {
4183     case WIDEN_SUM_EXPR:
4184     case DOT_PROD_EXPR:
4185     case SAD_EXPR:
4186     case PLUS_EXPR:
4187     case MINUS_EXPR:
4188     case BIT_IOR_EXPR:
4189     case BIT_XOR_EXPR:
4190       neutral_op = build_zero_cst (scalar_type);
4191       break;
4192
4193     case MULT_EXPR:
4194       neutral_op = build_one_cst (scalar_type);
4195       break;
4196
4197     case BIT_AND_EXPR:
4198       neutral_op = build_all_ones_cst (scalar_type);
4199       break;
4200
4201     /* For MIN/MAX we don't have an easy neutral operand but
4202        the initial values can be used fine here.  Only for
4203        a reduction chain we have to force a neutral element.  */
4204     case MAX_EXPR:
4205     case MIN_EXPR:
4206       if (! reduc_chain)
4207         neutral_op = NULL;
4208       else
4209         neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt,
4210                                             loop_preheader_edge (loop));
4211       break;
4212
4213     default:
4214       gcc_assert (! reduc_chain);
4215       neutral_op = NULL;
4216     }
4217
4218   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4219      created vectors. It is greater than 1 if unrolling is performed.
4220
4221      For example, we have two scalar operands, s1 and s2 (e.g., group of
4222      strided accesses of size two), while NUNITS is four (i.e., four scalars
4223      of this type can be packed in a vector).  The output vector will contain
4224      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4225      will be 2).
4226
4227      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4228      containing the operands.
4229
4230      For example, NUNITS is four as before, and the group size is 8
4231      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4232      {s5, s6, s7, s8}.  */
4233
4234   number_of_copies = nunits * number_of_vectors / group_size;
4235
4236   number_of_places_left_in_vector = nunits;
4237   constant_p = true;
4238   elts = XALLOCAVEC (tree, nunits);
4239   for (j = 0; j < number_of_copies; j++)
4240     {
4241       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4242         {
4243           tree op;
4244           /* Get the def before the loop.  In reduction chain we have only
4245              one initial value.  */
4246           if ((j != (number_of_copies - 1)
4247                || (reduc_chain && i != 0))
4248               && neutral_op)
4249             op = neutral_op;
4250           else
4251             op = PHI_ARG_DEF_FROM_EDGE (stmt,
4252                                         loop_preheader_edge (loop));
4253
4254           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4255           number_of_places_left_in_vector--;
4256           elts[number_of_places_left_in_vector] = op;
4257           if (!CONSTANT_CLASS_P (op))
4258             constant_p = false;
4259
4260           if (number_of_places_left_in_vector == 0)
4261             {
4262               if (constant_p)
4263                 vec_cst = build_vector (vector_type, elts);
4264               else
4265                 {
4266                   vec<constructor_elt, va_gc> *v;
4267                   unsigned k;
4268                   vec_alloc (v, nunits);
4269                   for (k = 0; k < nunits; ++k)
4270                     CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
4271                   vec_cst = build_constructor (vector_type, v);
4272                 }
4273               tree init;
4274               gimple_stmt_iterator gsi;
4275               init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
4276               if (ctor_seq != NULL)
4277                 {
4278                   gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
4279                   gsi_insert_seq_before_without_update (&gsi, ctor_seq,
4280                                                         GSI_SAME_STMT);
4281                   ctor_seq = NULL;
4282                 }
4283               voprnds.quick_push (init);
4284
4285               number_of_places_left_in_vector = nunits;
4286               constant_p = true;
4287             }
4288         }
4289     }
4290
4291   /* Since the vectors are created in the reverse order, we should invert
4292      them.  */
4293   vec_num = voprnds.length ();
4294   for (j = vec_num; j != 0; j--)
4295     {
4296       vop = voprnds[j - 1];
4297       vec_oprnds->quick_push (vop);
4298     }
4299
4300   voprnds.release ();
4301
4302   /* In case that VF is greater than the unrolling factor needed for the SLP
4303      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4304      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4305      to replicate the vectors.  */
4306   while (number_of_vectors > vec_oprnds->length ())
4307     {
4308       tree neutral_vec = NULL;
4309
4310       if (neutral_op)
4311         {
4312           if (!neutral_vec)
4313             neutral_vec = build_vector_from_val (vector_type, neutral_op);
4314
4315           vec_oprnds->quick_push (neutral_vec);
4316         }
4317       else
4318         {
4319           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4320             vec_oprnds->quick_push (vop);
4321         }
4322     }
4323 }
4324
4325
4326 /* Function vect_create_epilog_for_reduction
4327
4328    Create code at the loop-epilog to finalize the result of a reduction
4329    computation.
4330
4331    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4332      reduction statements.
4333    STMT is the scalar reduction stmt that is being vectorized.
4334    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4335      number of elements that we can fit in a vectype (nunits).  In this case
4336      we have to generate more than one vector stmt - i.e - we need to "unroll"
4337      the vector stmt by a factor VF/nunits.  For more details see documentation
4338      in vectorizable_operation.
4339    REDUC_CODE is the tree-code for the epilog reduction.
4340    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4341      computation.
4342    REDUC_INDEX is the index of the operand in the right hand side of the
4343      statement that is defined by REDUCTION_PHI.
4344    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4345    SLP_NODE is an SLP node containing a group of reduction statements. The
4346      first one in this group is STMT.
4347
4348    This function:
4349    1. Creates the reduction def-use cycles: sets the arguments for
4350       REDUCTION_PHIS:
4351       The loop-entry argument is the vectorized initial-value of the reduction.
4352       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4353       sums.
4354    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4355       by applying the operation specified by REDUC_CODE if available, or by
4356       other means (whole-vector shifts or a scalar loop).
4357       The function also creates a new phi node at the loop exit to preserve
4358       loop-closed form, as illustrated below.
4359
4360      The flow at the entry to this function:
4361
4362         loop:
4363           vec_def = phi <null, null>            # REDUCTION_PHI
4364           VECT_DEF = vector_stmt                # vectorized form of STMT
4365           s_loop = scalar_stmt                  # (scalar) STMT
4366         loop_exit:
4367           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4368           use <s_out0>
4369           use <s_out0>
4370
4371      The above is transformed by this function into:
4372
4373         loop:
4374           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4375           VECT_DEF = vector_stmt                # vectorized form of STMT
4376           s_loop = scalar_stmt                  # (scalar) STMT
4377         loop_exit:
4378           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4379           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4380           v_out2 = reduce <v_out1>
4381           s_out3 = extract_field <v_out2, 0>
4382           s_out4 = adjust_result <s_out3>
4383           use <s_out4>
4384           use <s_out4>
4385 */
4386
4387 static void
4388 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4389                                   gimple *reduc_def_stmt,
4390                                   int ncopies, enum tree_code reduc_code,
4391                                   vec<gimple *> reduction_phis,
4392                                   bool double_reduc,
4393                                   slp_tree slp_node,
4394                                   slp_instance slp_node_instance)
4395 {
4396   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4397   stmt_vec_info prev_phi_info;
4398   tree vectype;
4399   machine_mode mode;
4400   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4401   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4402   basic_block exit_bb;
4403   tree scalar_dest;
4404   tree scalar_type;
4405   gimple *new_phi = NULL, *phi;
4406   gimple_stmt_iterator exit_gsi;
4407   tree vec_dest;
4408   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4409   gimple *epilog_stmt = NULL;
4410   enum tree_code code = gimple_assign_rhs_code (stmt);
4411   gimple *exit_phi;
4412   tree bitsize;
4413   tree adjustment_def = NULL;
4414   tree vec_initial_def = NULL;
4415   tree expr, def, initial_def = NULL;
4416   tree orig_name, scalar_result;
4417   imm_use_iterator imm_iter, phi_imm_iter;
4418   use_operand_p use_p, phi_use_p;
4419   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4420   bool nested_in_vect_loop = false;
4421   auto_vec<gimple *> new_phis;
4422   auto_vec<gimple *> inner_phis;
4423   enum vect_def_type dt = vect_unknown_def_type;
4424   int j, i;
4425   auto_vec<tree> scalar_results;
4426   unsigned int group_size = 1, k, ratio;
4427   auto_vec<tree> vec_initial_defs;
4428   auto_vec<gimple *> phis;
4429   bool slp_reduc = false;
4430   tree new_phi_result;
4431   gimple *inner_phi = NULL;
4432   tree induction_index = NULL_TREE;
4433
4434   if (slp_node)
4435     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4436
4437   if (nested_in_vect_loop_p (loop, stmt))
4438     {
4439       outer_loop = loop;
4440       loop = loop->inner;
4441       nested_in_vect_loop = true;
4442       gcc_assert (!slp_node);
4443     }
4444
4445   vectype = STMT_VINFO_VECTYPE (stmt_info);
4446   gcc_assert (vectype);
4447   mode = TYPE_MODE (vectype);
4448
4449   /* 1. Create the reduction def-use cycle:
4450      Set the arguments of REDUCTION_PHIS, i.e., transform
4451
4452         loop:
4453           vec_def = phi <null, null>            # REDUCTION_PHI
4454           VECT_DEF = vector_stmt                # vectorized form of STMT
4455           ...
4456
4457      into:
4458
4459         loop:
4460           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4461           VECT_DEF = vector_stmt                # vectorized form of STMT
4462           ...
4463
4464      (in case of SLP, do it for all the phis). */
4465
4466   /* Get the loop-entry arguments.  */
4467   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4468   if (slp_node)
4469     {
4470       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4471       vec_initial_defs.reserve (vec_num);
4472       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4473                                       &vec_initial_defs, vec_num, code,
4474                                       GROUP_FIRST_ELEMENT (stmt_info));
4475     }
4476   else
4477     {
4478       /* Get at the scalar def before the loop, that defines the initial value
4479          of the reduction variable.  */
4480       gimple *def_stmt;
4481       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4482                                            loop_preheader_edge (loop));
4483       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4484       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4485                                                        &adjustment_def);
4486       vec_initial_defs.create (1);
4487       vec_initial_defs.quick_push (vec_initial_def);
4488     }
4489
4490   /* Set phi nodes arguments.  */
4491   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4492     {
4493       tree vec_init_def, def;
4494       gimple_seq stmts;
4495       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4496                                            true, NULL_TREE);
4497       if (stmts)
4498         gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4499
4500       def = vect_defs[i];
4501       for (j = 0; j < ncopies; j++)
4502         {
4503           if (j != 0)
4504             {
4505               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4506               if (nested_in_vect_loop)
4507                 vec_init_def
4508                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4509                                                     vec_init_def);
4510             }
4511
4512           /* Set the loop-entry arg of the reduction-phi.  */
4513
4514           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4515               == INTEGER_INDUC_COND_REDUCTION)
4516             {
4517               /* Initialise the reduction phi to zero.  This prevents initial
4518                  values of non-zero interferring with the reduction op.  */
4519               gcc_assert (ncopies == 1);
4520               gcc_assert (i == 0);
4521
4522               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4523               tree zero_vec = build_zero_cst (vec_init_def_type);
4524
4525               add_phi_arg (as_a <gphi *> (phi), zero_vec,
4526                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4527             }
4528           else
4529             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4530                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4531
4532           /* Set the loop-latch arg for the reduction-phi.  */
4533           if (j > 0)
4534             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4535
4536           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4537                        UNKNOWN_LOCATION);
4538
4539           if (dump_enabled_p ())
4540             {
4541               dump_printf_loc (MSG_NOTE, vect_location,
4542                                "transform reduction: created def-use cycle: ");
4543               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4544               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4545             }
4546         }
4547     }
4548
4549   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4550      which is updated with the current index of the loop for every match of
4551      the original loop's cond_expr (VEC_STMT).  This results in a vector
4552      containing the last time the condition passed for that vector lane.
4553      The first match will be a 1 to allow 0 to be used for non-matching
4554      indexes.  If there are no matches at all then the vector will be all
4555      zeroes.  */
4556   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4557     {
4558       tree indx_before_incr, indx_after_incr;
4559       int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4560       int k;
4561
4562       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4563       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4564
4565       int scalar_precision
4566         = GET_MODE_PRECISION (TYPE_MODE (TREE_TYPE (vectype)));
4567       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4568       tree cr_index_vector_type = build_vector_type
4569         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4570
4571       /* First we create a simple vector induction variable which starts
4572          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4573          vector size (STEP).  */
4574
4575       /* Create a {1,2,3,...} vector.  */
4576       tree *vtemp = XALLOCAVEC (tree, nunits_out);
4577       for (k = 0; k < nunits_out; ++k)
4578         vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
4579       tree series_vect = build_vector (cr_index_vector_type, vtemp);
4580
4581       /* Create a vector of the step value.  */
4582       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4583       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4584
4585       /* Create an induction variable.  */
4586       gimple_stmt_iterator incr_gsi;
4587       bool insert_after;
4588       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4589       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4590                  insert_after, &indx_before_incr, &indx_after_incr);
4591
4592       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4593          filled with zeros (VEC_ZERO).  */
4594
4595       /* Create a vector of 0s.  */
4596       tree zero = build_zero_cst (cr_index_scalar_type);
4597       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4598
4599       /* Create a vector phi node.  */
4600       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4601       new_phi = create_phi_node (new_phi_tree, loop->header);
4602       set_vinfo_for_stmt (new_phi,
4603                           new_stmt_vec_info (new_phi, loop_vinfo));
4604       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4605                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4606
4607       /* Now take the condition from the loops original cond_expr
4608          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4609          every match uses values from the induction variable
4610          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4611          (NEW_PHI_TREE).
4612          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4613          the new cond_expr (INDEX_COND_EXPR).  */
4614
4615       /* Duplicate the condition from vec_stmt.  */
4616       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4617
4618       /* Create a conditional, where the condition is taken from vec_stmt
4619          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4620          else is the phi (NEW_PHI_TREE).  */
4621       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4622                                      ccompare, indx_before_incr,
4623                                      new_phi_tree);
4624       induction_index = make_ssa_name (cr_index_vector_type);
4625       gimple *index_condition = gimple_build_assign (induction_index,
4626                                                      index_cond_expr);
4627       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4628       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4629                                                         loop_vinfo);
4630       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4631       set_vinfo_for_stmt (index_condition, index_vec_info);
4632
4633       /* Update the phi with the vec cond.  */
4634       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4635                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4636     }
4637
4638   /* 2. Create epilog code.
4639         The reduction epilog code operates across the elements of the vector
4640         of partial results computed by the vectorized loop.
4641         The reduction epilog code consists of:
4642
4643         step 1: compute the scalar result in a vector (v_out2)
4644         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4645         step 3: adjust the scalar result (s_out3) if needed.
4646
4647         Step 1 can be accomplished using one the following three schemes:
4648           (scheme 1) using reduc_code, if available.
4649           (scheme 2) using whole-vector shifts, if available.
4650           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4651                      combined.
4652
4653           The overall epilog code looks like this:
4654
4655           s_out0 = phi <s_loop>         # original EXIT_PHI
4656           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4657           v_out2 = reduce <v_out1>              # step 1
4658           s_out3 = extract_field <v_out2, 0>    # step 2
4659           s_out4 = adjust_result <s_out3>       # step 3
4660
4661           (step 3 is optional, and steps 1 and 2 may be combined).
4662           Lastly, the uses of s_out0 are replaced by s_out4.  */
4663
4664
4665   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4666          v_out1 = phi <VECT_DEF>
4667          Store them in NEW_PHIS.  */
4668
4669   exit_bb = single_exit (loop)->dest;
4670   prev_phi_info = NULL;
4671   new_phis.create (vect_defs.length ());
4672   FOR_EACH_VEC_ELT (vect_defs, i, def)
4673     {
4674       for (j = 0; j < ncopies; j++)
4675         {
4676           tree new_def = copy_ssa_name (def);
4677           phi = create_phi_node (new_def, exit_bb);
4678           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4679           if (j == 0)
4680             new_phis.quick_push (phi);
4681           else
4682             {
4683               def = vect_get_vec_def_for_stmt_copy (dt, def);
4684               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4685             }
4686
4687           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4688           prev_phi_info = vinfo_for_stmt (phi);
4689         }
4690     }
4691
4692   /* The epilogue is created for the outer-loop, i.e., for the loop being
4693      vectorized.  Create exit phis for the outer loop.  */
4694   if (double_reduc)
4695     {
4696       loop = outer_loop;
4697       exit_bb = single_exit (loop)->dest;
4698       inner_phis.create (vect_defs.length ());
4699       FOR_EACH_VEC_ELT (new_phis, i, phi)
4700         {
4701           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4702           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4703           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4704                            PHI_RESULT (phi));
4705           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4706                                                             loop_vinfo));
4707           inner_phis.quick_push (phi);
4708           new_phis[i] = outer_phi;
4709           prev_phi_info = vinfo_for_stmt (outer_phi);
4710           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4711             {
4712               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4713               new_result = copy_ssa_name (PHI_RESULT (phi));
4714               outer_phi = create_phi_node (new_result, exit_bb);
4715               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4716                                PHI_RESULT (phi));
4717               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4718                                                                 loop_vinfo));
4719               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4720               prev_phi_info = vinfo_for_stmt (outer_phi);
4721             }
4722         }
4723     }
4724
4725   exit_gsi = gsi_after_labels (exit_bb);
4726
4727   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4728          (i.e. when reduc_code is not available) and in the final adjustment
4729          code (if needed).  Also get the original scalar reduction variable as
4730          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4731          represents a reduction pattern), the tree-code and scalar-def are
4732          taken from the original stmt that the pattern-stmt (STMT) replaces.
4733          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4734          are taken from STMT.  */
4735
4736   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4737   if (!orig_stmt)
4738     {
4739       /* Regular reduction  */
4740       orig_stmt = stmt;
4741     }
4742   else
4743     {
4744       /* Reduction pattern  */
4745       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4746       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4747       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4748     }
4749
4750   code = gimple_assign_rhs_code (orig_stmt);
4751   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4752      partial results are added and not subtracted.  */
4753   if (code == MINUS_EXPR)
4754     code = PLUS_EXPR;
4755
4756   scalar_dest = gimple_assign_lhs (orig_stmt);
4757   scalar_type = TREE_TYPE (scalar_dest);
4758   scalar_results.create (group_size);
4759   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4760   bitsize = TYPE_SIZE (scalar_type);
4761
4762   /* In case this is a reduction in an inner-loop while vectorizing an outer
4763      loop - we don't need to extract a single scalar result at the end of the
4764      inner-loop (unless it is double reduction, i.e., the use of reduction is
4765      outside the outer-loop).  The final vector of partial results will be used
4766      in the vectorized outer-loop, or reduced to a scalar result at the end of
4767      the outer-loop.  */
4768   if (nested_in_vect_loop && !double_reduc)
4769     goto vect_finalize_reduction;
4770
4771   /* SLP reduction without reduction chain, e.g.,
4772      # a1 = phi <a2, a0>
4773      # b1 = phi <b2, b0>
4774      a2 = operation (a1)
4775      b2 = operation (b1)  */
4776   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4777
4778   /* In case of reduction chain, e.g.,
4779      # a1 = phi <a3, a0>
4780      a2 = operation (a1)
4781      a3 = operation (a2),
4782
4783      we may end up with more than one vector result.  Here we reduce them to
4784      one vector.  */
4785   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4786     {
4787       tree first_vect = PHI_RESULT (new_phis[0]);
4788       tree tmp;
4789       gassign *new_vec_stmt = NULL;
4790
4791       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4792       for (k = 1; k < new_phis.length (); k++)
4793         {
4794           gimple *next_phi = new_phis[k];
4795           tree second_vect = PHI_RESULT (next_phi);
4796
4797           tmp = build2 (code, vectype,  first_vect, second_vect);
4798           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4799           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4800           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4801           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4802         }
4803
4804       new_phi_result = first_vect;
4805       if (new_vec_stmt)
4806         {
4807           new_phis.truncate (0);
4808           new_phis.safe_push (new_vec_stmt);
4809         }
4810     }
4811   else
4812     new_phi_result = PHI_RESULT (new_phis[0]);
4813
4814   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4815       && reduc_code != ERROR_MARK)
4816     {
4817       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4818          various data values where the condition matched and another vector
4819          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4820          need to extract the last matching index (which will be the index with
4821          highest value) and use this to index into the data vector.
4822          For the case where there were no matches, the data vector will contain
4823          all default values and the index vector will be all zeros.  */
4824
4825       /* Get various versions of the type of the vector of indexes.  */
4826       tree index_vec_type = TREE_TYPE (induction_index);
4827       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4828       tree index_scalar_type = TREE_TYPE (index_vec_type);
4829       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4830         (index_vec_type);
4831
4832       /* Get an unsigned integer version of the type of the data vector.  */
4833       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4834       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4835       tree vectype_unsigned = build_vector_type
4836         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4837
4838       /* First we need to create a vector (ZERO_VEC) of zeros and another
4839          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4840          can create using a MAX reduction and then expanding.
4841          In the case where the loop never made any matches, the max index will
4842          be zero.  */
4843
4844       /* Vector of {0, 0, 0,...}.  */
4845       tree zero_vec = make_ssa_name (vectype);
4846       tree zero_vec_rhs = build_zero_cst (vectype);
4847       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4848       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4849
4850       /* Find maximum value from the vector of found indexes.  */
4851       tree max_index = make_ssa_name (index_scalar_type);
4852       gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4853                                                     induction_index);
4854       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4855
4856       /* Vector of {max_index, max_index, max_index,...}.  */
4857       tree max_index_vec = make_ssa_name (index_vec_type);
4858       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4859                                                       max_index);
4860       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4861                                                         max_index_vec_rhs);
4862       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4863
4864       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4865          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4866          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4867          otherwise.  Only one value should match, resulting in a vector
4868          (VEC_COND) with one data value and the rest zeros.
4869          In the case where the loop never made any matches, every index will
4870          match, resulting in a vector with all data values (which will all be
4871          the default value).  */
4872
4873       /* Compare the max index vector to the vector of found indexes to find
4874          the position of the max value.  */
4875       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4876       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4877                                                       induction_index,
4878                                                       max_index_vec);
4879       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4880
4881       /* Use the compare to choose either values from the data vector or
4882          zero.  */
4883       tree vec_cond = make_ssa_name (vectype);
4884       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4885                                                    vec_compare, new_phi_result,
4886                                                    zero_vec);
4887       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4888
4889       /* Finally we need to extract the data value from the vector (VEC_COND)
4890          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
4891          reduction, but because this doesn't exist, we can use a MAX reduction
4892          instead.  The data value might be signed or a float so we need to cast
4893          it first.
4894          In the case where the loop never made any matches, the data values are
4895          all identical, and so will reduce down correctly.  */
4896
4897       /* Make the matched data values unsigned.  */
4898       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4899       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4900                                        vec_cond);
4901       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4902                                                         VIEW_CONVERT_EXPR,
4903                                                         vec_cond_cast_rhs);
4904       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4905
4906       /* Reduce down to a scalar value.  */
4907       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4908       optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4909                                       optab_default);
4910       gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4911                   != CODE_FOR_nothing);
4912       gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4913                                                      REDUC_MAX_EXPR,
4914                                                      vec_cond_cast);
4915       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4916
4917       /* Convert the reduced value back to the result type and set as the
4918          result.  */
4919       gimple_seq stmts = NULL;
4920       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4921                                data_reduc);
4922       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4923       scalar_results.safe_push (new_temp);
4924     }
4925   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4926            && reduc_code == ERROR_MARK)
4927     {
4928       /* Condition redution without supported REDUC_MAX_EXPR.  Generate
4929          idx = 0;
4930          idx_val = induction_index[0];
4931          val = data_reduc[0];
4932          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4933            if (induction_index[i] > idx_val)
4934              val = data_reduc[i], idx_val = induction_index[i];
4935          return val;  */
4936
4937       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4938       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4939       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4940       unsigned HOST_WIDE_INT v_size
4941         = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4942       tree idx_val = NULL_TREE, val = NULL_TREE;
4943       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4944         {
4945           tree old_idx_val = idx_val;
4946           tree old_val = val;
4947           idx_val = make_ssa_name (idx_eltype);
4948           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4949                                              build3 (BIT_FIELD_REF, idx_eltype,
4950                                                      induction_index,
4951                                                      bitsize_int (el_size),
4952                                                      bitsize_int (off)));
4953           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4954           val = make_ssa_name (data_eltype);
4955           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4956                                              build3 (BIT_FIELD_REF,
4957                                                      data_eltype,
4958                                                      new_phi_result,
4959                                                      bitsize_int (el_size),
4960                                                      bitsize_int (off)));
4961           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4962           if (off != 0)
4963             {
4964               tree new_idx_val = idx_val;
4965               tree new_val = val;
4966               if (off != v_size - el_size)
4967                 {
4968                   new_idx_val = make_ssa_name (idx_eltype);
4969                   epilog_stmt = gimple_build_assign (new_idx_val,
4970                                                      MAX_EXPR, idx_val,
4971                                                      old_idx_val);
4972                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4973                 }
4974               new_val = make_ssa_name (data_eltype);
4975               epilog_stmt = gimple_build_assign (new_val,
4976                                                  COND_EXPR,
4977                                                  build2 (GT_EXPR,
4978                                                          boolean_type_node,
4979                                                          idx_val,
4980                                                          old_idx_val),
4981                                                  val, old_val);
4982               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4983               idx_val = new_idx_val;
4984               val = new_val;
4985             }
4986         }
4987       /* Convert the reduced value back to the result type and set as the
4988          result.  */
4989       gimple_seq stmts = NULL;
4990       val = gimple_convert (&stmts, scalar_type, val);
4991       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4992       scalar_results.safe_push (val);
4993     }
4994
4995   /* 2.3 Create the reduction code, using one of the three schemes described
4996          above. In SLP we simply need to extract all the elements from the
4997          vector (without reducing them), so we use scalar shifts.  */
4998   else if (reduc_code != ERROR_MARK && !slp_reduc)
4999     {
5000       tree tmp;
5001       tree vec_elem_type;
5002
5003       /* Case 1:  Create:
5004          v_out2 = reduc_expr <v_out1>  */
5005
5006       if (dump_enabled_p ())
5007         dump_printf_loc (MSG_NOTE, vect_location,
5008                          "Reduce using direct vector reduction.\n");
5009
5010       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
5011       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
5012         {
5013           tree tmp_dest =
5014               vect_create_destination_var (scalar_dest, vec_elem_type);
5015           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
5016           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
5017           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
5018           gimple_assign_set_lhs (epilog_stmt, new_temp);
5019           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5020
5021           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
5022         }
5023       else
5024         tmp = build1 (reduc_code, scalar_type, new_phi_result);
5025
5026       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
5027       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5028       gimple_assign_set_lhs (epilog_stmt, new_temp);
5029       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5030
5031       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5032           == INTEGER_INDUC_COND_REDUCTION)
5033         {
5034           /* Earlier we set the initial value to be zero.  Check the result
5035              and if it is zero then replace with the original initial
5036              value.  */
5037           tree zero = build_zero_cst (scalar_type);
5038           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
5039
5040           tmp = make_ssa_name (new_scalar_dest);
5041           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5042                                              initial_def, new_temp);
5043           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5044           new_temp = tmp;
5045         }
5046
5047       scalar_results.safe_push (new_temp);
5048     }
5049   else
5050     {
5051       bool reduce_with_shift = have_whole_vector_shift (mode);
5052       int element_bitsize = tree_to_uhwi (bitsize);
5053       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5054       tree vec_temp;
5055
5056       /* COND reductions all do the final reduction with MAX_EXPR.  */
5057       if (code == COND_EXPR)
5058         code = MAX_EXPR;
5059
5060       /* Regardless of whether we have a whole vector shift, if we're
5061          emulating the operation via tree-vect-generic, we don't want
5062          to use it.  Only the first round of the reduction is likely
5063          to still be profitable via emulation.  */
5064       /* ??? It might be better to emit a reduction tree code here, so that
5065          tree-vect-generic can expand the first round via bit tricks.  */
5066       if (!VECTOR_MODE_P (mode))
5067         reduce_with_shift = false;
5068       else
5069         {
5070           optab optab = optab_for_tree_code (code, vectype, optab_default);
5071           if (optab_handler (optab, mode) == CODE_FOR_nothing)
5072             reduce_with_shift = false;
5073         }
5074
5075       if (reduce_with_shift && !slp_reduc)
5076         {
5077           int nelements = vec_size_in_bits / element_bitsize;
5078           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
5079
5080           int elt_offset;
5081
5082           tree zero_vec = build_zero_cst (vectype);
5083           /* Case 2: Create:
5084              for (offset = nelements/2; offset >= 1; offset/=2)
5085                 {
5086                   Create:  va' = vec_shift <va, offset>
5087                   Create:  va = vop <va, va'>
5088                 }  */
5089
5090           tree rhs;
5091
5092           if (dump_enabled_p ())
5093             dump_printf_loc (MSG_NOTE, vect_location,
5094                              "Reduce using vector shifts\n");
5095
5096           vec_dest = vect_create_destination_var (scalar_dest, vectype);
5097           new_temp = new_phi_result;
5098           for (elt_offset = nelements / 2;
5099                elt_offset >= 1;
5100                elt_offset /= 2)
5101             {
5102               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
5103               tree mask = vect_gen_perm_mask_any (vectype, sel);
5104               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5105                                                  new_temp, zero_vec, mask);
5106               new_name = make_ssa_name (vec_dest, epilog_stmt);
5107               gimple_assign_set_lhs (epilog_stmt, new_name);
5108               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5109
5110               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5111                                                  new_temp);
5112               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5113               gimple_assign_set_lhs (epilog_stmt, new_temp);
5114               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5115             }
5116
5117           /* 2.4  Extract the final scalar result.  Create:
5118              s_out3 = extract_field <v_out2, bitpos>  */
5119
5120           if (dump_enabled_p ())
5121             dump_printf_loc (MSG_NOTE, vect_location,
5122                              "extract scalar result\n");
5123
5124           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5125                         bitsize, bitsize_zero_node);
5126           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5127           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5128           gimple_assign_set_lhs (epilog_stmt, new_temp);
5129           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5130           scalar_results.safe_push (new_temp);
5131         }
5132       else
5133         {
5134           /* Case 3: Create:
5135              s = extract_field <v_out2, 0>
5136              for (offset = element_size;
5137                   offset < vector_size;
5138                   offset += element_size;)
5139                {
5140                  Create:  s' = extract_field <v_out2, offset>
5141                  Create:  s = op <s, s'>  // For non SLP cases
5142                }  */
5143
5144           if (dump_enabled_p ())
5145             dump_printf_loc (MSG_NOTE, vect_location,
5146                              "Reduce using scalar code.\n");
5147
5148           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5149           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5150             {
5151               int bit_offset;
5152               if (gimple_code (new_phi) == GIMPLE_PHI)
5153                 vec_temp = PHI_RESULT (new_phi);
5154               else
5155                 vec_temp = gimple_assign_lhs (new_phi);
5156               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5157                             bitsize_zero_node);
5158               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5159               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5160               gimple_assign_set_lhs (epilog_stmt, new_temp);
5161               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5162
5163               /* In SLP we don't need to apply reduction operation, so we just
5164                  collect s' values in SCALAR_RESULTS.  */
5165               if (slp_reduc)
5166                 scalar_results.safe_push (new_temp);
5167
5168               for (bit_offset = element_bitsize;
5169                    bit_offset < vec_size_in_bits;
5170                    bit_offset += element_bitsize)
5171                 {
5172                   tree bitpos = bitsize_int (bit_offset);
5173                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5174                                      bitsize, bitpos);
5175
5176                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5177                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5178                   gimple_assign_set_lhs (epilog_stmt, new_name);
5179                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5180
5181                   if (slp_reduc)
5182                     {
5183                       /* In SLP we don't need to apply reduction operation, so
5184                          we just collect s' values in SCALAR_RESULTS.  */
5185                       new_temp = new_name;
5186                       scalar_results.safe_push (new_name);
5187                     }
5188                   else
5189                     {
5190                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5191                                                          new_name, new_temp);
5192                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5193                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5194                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5195                     }
5196                 }
5197             }
5198
5199           /* The only case where we need to reduce scalar results in SLP, is
5200              unrolling.  If the size of SCALAR_RESULTS is greater than
5201              GROUP_SIZE, we reduce them combining elements modulo
5202              GROUP_SIZE.  */
5203           if (slp_reduc)
5204             {
5205               tree res, first_res, new_res;
5206               gimple *new_stmt;
5207
5208               /* Reduce multiple scalar results in case of SLP unrolling.  */
5209               for (j = group_size; scalar_results.iterate (j, &res);
5210                    j++)
5211                 {
5212                   first_res = scalar_results[j % group_size];
5213                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5214                                                   first_res, res);
5215                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5216                   gimple_assign_set_lhs (new_stmt, new_res);
5217                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5218                   scalar_results[j % group_size] = new_res;
5219                 }
5220             }
5221           else
5222             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5223             scalar_results.safe_push (new_temp);
5224         }
5225
5226       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5227           == INTEGER_INDUC_COND_REDUCTION)
5228         {
5229           /* Earlier we set the initial value to be zero.  Check the result
5230              and if it is zero then replace with the original initial
5231              value.  */
5232           tree zero = build_zero_cst (scalar_type);
5233           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
5234
5235           tree tmp = make_ssa_name (new_scalar_dest);
5236           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5237                                              initial_def, new_temp);
5238           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5239           scalar_results[0] = tmp;
5240         }
5241     }
5242
5243 vect_finalize_reduction:
5244
5245   if (double_reduc)
5246     loop = loop->inner;
5247
5248   /* 2.5 Adjust the final result by the initial value of the reduction
5249          variable. (When such adjustment is not needed, then
5250          'adjustment_def' is zero).  For example, if code is PLUS we create:
5251          new_temp = loop_exit_def + adjustment_def  */
5252
5253   if (adjustment_def)
5254     {
5255       gcc_assert (!slp_reduc);
5256       if (nested_in_vect_loop)
5257         {
5258           new_phi = new_phis[0];
5259           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5260           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5261           new_dest = vect_create_destination_var (scalar_dest, vectype);
5262         }
5263       else
5264         {
5265           new_temp = scalar_results[0];
5266           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5267           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5268           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5269         }
5270
5271       epilog_stmt = gimple_build_assign (new_dest, expr);
5272       new_temp = make_ssa_name (new_dest, epilog_stmt);
5273       gimple_assign_set_lhs (epilog_stmt, new_temp);
5274       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5275       if (nested_in_vect_loop)
5276         {
5277           set_vinfo_for_stmt (epilog_stmt,
5278                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5279           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5280                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5281
5282           if (!double_reduc)
5283             scalar_results.quick_push (new_temp);
5284           else
5285             scalar_results[0] = new_temp;
5286         }
5287       else
5288         scalar_results[0] = new_temp;
5289
5290       new_phis[0] = epilog_stmt;
5291     }
5292
5293   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5294           phis with new adjusted scalar results, i.e., replace use <s_out0>
5295           with use <s_out4>.
5296
5297      Transform:
5298         loop_exit:
5299           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5300           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5301           v_out2 = reduce <v_out1>
5302           s_out3 = extract_field <v_out2, 0>
5303           s_out4 = adjust_result <s_out3>
5304           use <s_out0>
5305           use <s_out0>
5306
5307      into:
5308
5309         loop_exit:
5310           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5311           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5312           v_out2 = reduce <v_out1>
5313           s_out3 = extract_field <v_out2, 0>
5314           s_out4 = adjust_result <s_out3>
5315           use <s_out4>
5316           use <s_out4> */
5317
5318
5319   /* In SLP reduction chain we reduce vector results into one vector if
5320      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5321      the last stmt in the reduction chain, since we are looking for the loop
5322      exit phi node.  */
5323   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5324     {
5325       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5326       /* Handle reduction patterns.  */
5327       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5328         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5329
5330       scalar_dest = gimple_assign_lhs (dest_stmt);
5331       group_size = 1;
5332     }
5333
5334   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5335      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5336      need to match SCALAR_RESULTS with corresponding statements.  The first
5337      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5338      the first vector stmt, etc.
5339      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5340   if (group_size > new_phis.length ())
5341     {
5342       ratio = group_size / new_phis.length ();
5343       gcc_assert (!(group_size % new_phis.length ()));
5344     }
5345   else
5346     ratio = 1;
5347
5348   for (k = 0; k < group_size; k++)
5349     {
5350       if (k % ratio == 0)
5351         {
5352           epilog_stmt = new_phis[k / ratio];
5353           reduction_phi = reduction_phis[k / ratio];
5354           if (double_reduc)
5355             inner_phi = inner_phis[k / ratio];
5356         }
5357
5358       if (slp_reduc)
5359         {
5360           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5361
5362           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5363           /* SLP statements can't participate in patterns.  */
5364           gcc_assert (!orig_stmt);
5365           scalar_dest = gimple_assign_lhs (current_stmt);
5366         }
5367
5368       phis.create (3);
5369       /* Find the loop-closed-use at the loop exit of the original scalar
5370          result.  (The reduction result is expected to have two immediate uses -
5371          one at the latch block, and one at the loop exit).  */
5372       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5373         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5374             && !is_gimple_debug (USE_STMT (use_p)))
5375           phis.safe_push (USE_STMT (use_p));
5376
5377       /* While we expect to have found an exit_phi because of loop-closed-ssa
5378          form we can end up without one if the scalar cycle is dead.  */
5379
5380       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5381         {
5382           if (outer_loop)
5383             {
5384               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5385               gphi *vect_phi;
5386
5387               /* FORNOW. Currently not supporting the case that an inner-loop
5388                  reduction is not used in the outer-loop (but only outside the
5389                  outer-loop), unless it is double reduction.  */
5390               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5391                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5392                           || double_reduc);
5393
5394               if (double_reduc)
5395                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5396               else
5397                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5398               if (!double_reduc
5399                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5400                       != vect_double_reduction_def)
5401                 continue;
5402
5403               /* Handle double reduction:
5404
5405                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5406                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5407                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5408                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5409
5410                  At that point the regular reduction (stmt2 and stmt3) is
5411                  already vectorized, as well as the exit phi node, stmt4.
5412                  Here we vectorize the phi node of double reduction, stmt1, and
5413                  update all relevant statements.  */
5414
5415               /* Go through all the uses of s2 to find double reduction phi
5416                  node, i.e., stmt1 above.  */
5417               orig_name = PHI_RESULT (exit_phi);
5418               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5419                 {
5420                   stmt_vec_info use_stmt_vinfo;
5421                   stmt_vec_info new_phi_vinfo;
5422                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5423                   basic_block bb = gimple_bb (use_stmt);
5424                   gimple *use;
5425
5426                   /* Check that USE_STMT is really double reduction phi
5427                      node.  */
5428                   if (gimple_code (use_stmt) != GIMPLE_PHI
5429                       || gimple_phi_num_args (use_stmt) != 2
5430                       || bb->loop_father != outer_loop)
5431                     continue;
5432                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5433                   if (!use_stmt_vinfo
5434                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5435                           != vect_double_reduction_def)
5436                     continue;
5437
5438                   /* Create vector phi node for double reduction:
5439                      vs1 = phi <vs0, vs2>
5440                      vs1 was created previously in this function by a call to
5441                        vect_get_vec_def_for_operand and is stored in
5442                        vec_initial_def;
5443                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5444                      vs0 is created here.  */
5445
5446                   /* Create vector phi node.  */
5447                   vect_phi = create_phi_node (vec_initial_def, bb);
5448                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5449                                     loop_vec_info_for_loop (outer_loop));
5450                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5451
5452                   /* Create vs0 - initial def of the double reduction phi.  */
5453                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5454                                              loop_preheader_edge (outer_loop));
5455                   init_def = get_initial_def_for_reduction (stmt,
5456                                                           preheader_arg, NULL);
5457                   vect_phi_init = vect_init_vector (use_stmt, init_def,
5458                                                     vectype, NULL);
5459
5460                   /* Update phi node arguments with vs0 and vs2.  */
5461                   add_phi_arg (vect_phi, vect_phi_init,
5462                                loop_preheader_edge (outer_loop),
5463                                UNKNOWN_LOCATION);
5464                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5465                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5466                   if (dump_enabled_p ())
5467                     {
5468                       dump_printf_loc (MSG_NOTE, vect_location,
5469                                        "created double reduction phi node: ");
5470                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5471                     }
5472
5473                   vect_phi_res = PHI_RESULT (vect_phi);
5474
5475                   /* Replace the use, i.e., set the correct vs1 in the regular
5476                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5477                      loop is redundant.  */
5478                   use = reduction_phi;
5479                   for (j = 0; j < ncopies; j++)
5480                     {
5481                       edge pr_edge = loop_preheader_edge (loop);
5482                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5483                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5484                     }
5485                 }
5486             }
5487         }
5488
5489       phis.release ();
5490       if (nested_in_vect_loop)
5491         {
5492           if (double_reduc)
5493             loop = outer_loop;
5494           else
5495             continue;
5496         }
5497
5498       phis.create (3);
5499       /* Find the loop-closed-use at the loop exit of the original scalar
5500          result.  (The reduction result is expected to have two immediate uses,
5501          one at the latch block, and one at the loop exit).  For double
5502          reductions we are looking for exit phis of the outer loop.  */
5503       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5504         {
5505           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5506             {
5507               if (!is_gimple_debug (USE_STMT (use_p)))
5508                 phis.safe_push (USE_STMT (use_p));
5509             }
5510           else
5511             {
5512               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5513                 {
5514                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5515
5516                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5517                     {
5518                       if (!flow_bb_inside_loop_p (loop,
5519                                              gimple_bb (USE_STMT (phi_use_p)))
5520                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5521                         phis.safe_push (USE_STMT (phi_use_p));
5522                     }
5523                 }
5524             }
5525         }
5526
5527       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5528         {
5529           /* Replace the uses:  */
5530           orig_name = PHI_RESULT (exit_phi);
5531           scalar_result = scalar_results[k];
5532           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5533             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5534               SET_USE (use_p, scalar_result);
5535         }
5536
5537       phis.release ();
5538     }
5539 }
5540
5541
5542 /* Function is_nonwrapping_integer_induction.
5543
5544    Check if STMT (which is part of loop LOOP) both increments and
5545    does not cause overflow.  */
5546
5547 static bool
5548 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5549 {
5550   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5551   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5552   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5553   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5554   widest_int ni, max_loop_value, lhs_max;
5555   bool overflow = false;
5556
5557   /* Make sure the loop is integer based.  */
5558   if (TREE_CODE (base) != INTEGER_CST
5559       || TREE_CODE (step) != INTEGER_CST)
5560     return false;
5561
5562   /* Check that the induction increments.  */
5563   if (tree_int_cst_sgn (step) == -1)
5564     return false;
5565
5566   /* Check that the max size of the loop will not wrap.  */
5567
5568   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5569     return true;
5570
5571   if (! max_stmt_executions (loop, &ni))
5572     return false;
5573
5574   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5575                             &overflow);
5576   if (overflow)
5577     return false;
5578
5579   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5580                             TYPE_SIGN (lhs_type), &overflow);
5581   if (overflow)
5582     return false;
5583
5584   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5585           <= TYPE_PRECISION (lhs_type));
5586 }
5587
5588 /* Function vectorizable_reduction.
5589
5590    Check if STMT performs a reduction operation that can be vectorized.
5591    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5592    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5593    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5594
5595    This function also handles reduction idioms (patterns) that have been
5596    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5597    of this form:
5598      X = pattern_expr (arg0, arg1, ..., X)
5599    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
5600    sequence that had been detected and replaced by the pattern-stmt (STMT).
5601
5602    This function also handles reduction of condition expressions, for example:
5603      for (int i = 0; i < N; i++)
5604        if (a[i] < value)
5605          last = a[i];
5606    This is handled by vectorising the loop and creating an additional vector
5607    containing the loop indexes for which "a[i] < value" was true.  In the
5608    function epilogue this is reduced to a single max value and then used to
5609    index into the vector of results.
5610
5611    In some cases of reduction patterns, the type of the reduction variable X is
5612    different than the type of the other arguments of STMT.
5613    In such cases, the vectype that is used when transforming STMT into a vector
5614    stmt is different than the vectype that is used to determine the
5615    vectorization factor, because it consists of a different number of elements
5616    than the actual number of elements that are being operated upon in parallel.
5617
5618    For example, consider an accumulation of shorts into an int accumulator.
5619    On some targets it's possible to vectorize this pattern operating on 8
5620    shorts at a time (hence, the vectype for purposes of determining the
5621    vectorization factor should be V8HI); on the other hand, the vectype that
5622    is used to create the vector form is actually V4SI (the type of the result).
5623
5624    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5625    indicates what is the actual level of parallelism (V8HI in the example), so
5626    that the right vectorization factor would be derived.  This vectype
5627    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5628    be used to create the vectorized stmt.  The right vectype for the vectorized
5629    stmt is obtained from the type of the result X:
5630         get_vectype_for_scalar_type (TREE_TYPE (X))
5631
5632    This means that, contrary to "regular" reductions (or "regular" stmts in
5633    general), the following equation:
5634       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5635    does *NOT* necessarily hold for reduction patterns.  */
5636
5637 bool
5638 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5639                         gimple **vec_stmt, slp_tree slp_node,
5640                         slp_instance slp_node_instance)
5641 {
5642   tree vec_dest;
5643   tree scalar_dest;
5644   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5645   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5646   tree vectype_in = NULL_TREE;
5647   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5648   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5649   enum tree_code code, orig_code, epilog_reduc_code;
5650   machine_mode vec_mode;
5651   int op_type;
5652   optab optab, reduc_optab;
5653   tree new_temp = NULL_TREE;
5654   gimple *def_stmt;
5655   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5656   tree scalar_type;
5657   bool is_simple_use;
5658   gimple *orig_stmt;
5659   stmt_vec_info orig_stmt_info = NULL;
5660   int i;
5661   int ncopies;
5662   int epilog_copies;
5663   stmt_vec_info prev_stmt_info, prev_phi_info;
5664   bool single_defuse_cycle = false;
5665   gimple *new_stmt = NULL;
5666   int j;
5667   tree ops[3];
5668   enum vect_def_type dts[3];
5669   bool nested_cycle = false, found_nested_cycle_def = false;
5670   bool double_reduc = false;
5671   basic_block def_bb;
5672   struct loop * def_stmt_loop, *outer_loop = NULL;
5673   tree def_arg;
5674   gimple *def_arg_stmt;
5675   auto_vec<tree> vec_oprnds0;
5676   auto_vec<tree> vec_oprnds1;
5677   auto_vec<tree> vec_oprnds2;
5678   auto_vec<tree> vect_defs;
5679   auto_vec<gimple *> phis;
5680   int vec_num;
5681   tree def0, tem;
5682   bool first_p = true;
5683   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5684   tree cond_reduc_val = NULL_TREE;
5685
5686   /* Make sure it was already recognized as a reduction computation.  */
5687   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5688       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5689     return false;
5690
5691   if (nested_in_vect_loop_p (loop, stmt))
5692     {
5693       outer_loop = loop;
5694       loop = loop->inner;
5695       nested_cycle = true;
5696     }
5697
5698   /* In case of reduction chain we switch to the first stmt in the chain, but
5699      we don't update STMT_INFO, since only the last stmt is marked as reduction
5700      and has reduction properties.  */
5701   if (GROUP_FIRST_ELEMENT (stmt_info)
5702       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5703     {
5704       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5705       first_p = false;
5706     }
5707
5708   if (gimple_code (stmt) == GIMPLE_PHI)
5709     {
5710       /* Analysis is fully done on the reduction stmt invocation.  */
5711       if (! vec_stmt)
5712         {
5713           if (slp_node)
5714             slp_node_instance->reduc_phis = slp_node;
5715
5716           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5717           return true;
5718         }
5719
5720       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5721       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5722         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5723
5724       gcc_assert (is_gimple_assign (reduc_stmt));
5725       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5726         {
5727           tree op = gimple_op (reduc_stmt, k);
5728           if (op == gimple_phi_result (stmt))
5729             continue;
5730           if (k == 1
5731               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5732             continue;
5733           tem = get_vectype_for_scalar_type (TREE_TYPE (op));
5734           if (! vectype_in
5735               || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
5736             vectype_in = tem;
5737           break;
5738         }
5739       gcc_assert (vectype_in);
5740
5741       if (slp_node)
5742         ncopies = 1;
5743       else
5744         ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5745                    / TYPE_VECTOR_SUBPARTS (vectype_in));
5746
5747       use_operand_p use_p;
5748       gimple *use_stmt;
5749       if (ncopies > 1
5750           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5751               <= vect_used_only_live)
5752           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5753           && (use_stmt == reduc_stmt
5754               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5755                   == reduc_stmt)))
5756         single_defuse_cycle = true;
5757
5758       /* Create the destination vector  */
5759       scalar_dest = gimple_assign_lhs (reduc_stmt);
5760       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5761
5762       if (slp_node)
5763         /* The size vect_schedule_slp_instance computes is off for us.  */
5764         vec_num = ((LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5765                     * SLP_TREE_SCALAR_STMTS (slp_node).length ())
5766                    / TYPE_VECTOR_SUBPARTS (vectype_in));
5767       else
5768         vec_num = 1;
5769
5770       /* Generate the reduction PHIs upfront.  */
5771       prev_phi_info = NULL;
5772       for (j = 0; j < ncopies; j++)
5773         {
5774           if (j == 0 || !single_defuse_cycle)
5775             {
5776               for (i = 0; i < vec_num; i++)
5777                 {
5778                   /* Create the reduction-phi that defines the reduction
5779                      operand.  */
5780                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
5781                   set_vinfo_for_stmt (new_phi,
5782                                       new_stmt_vec_info (new_phi, loop_vinfo));
5783
5784                   if (slp_node)
5785                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5786                   else
5787                     {
5788                       if (j == 0)
5789                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5790                       else
5791                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5792                       prev_phi_info = vinfo_for_stmt (new_phi);
5793                     }
5794                 }
5795             }
5796         }
5797
5798       return true;
5799     }
5800
5801   /* 1. Is vectorizable reduction?  */
5802   /* Not supportable if the reduction variable is used in the loop, unless
5803      it's a reduction chain.  */
5804   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5805       && !GROUP_FIRST_ELEMENT (stmt_info))
5806     return false;
5807
5808   /* Reductions that are not used even in an enclosing outer-loop,
5809      are expected to be "live" (used out of the loop).  */
5810   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5811       && !STMT_VINFO_LIVE_P (stmt_info))
5812     return false;
5813
5814   /* 2. Has this been recognized as a reduction pattern?
5815
5816      Check if STMT represents a pattern that has been recognized
5817      in earlier analysis stages.  For stmts that represent a pattern,
5818      the STMT_VINFO_RELATED_STMT field records the last stmt in
5819      the original sequence that constitutes the pattern.  */
5820
5821   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5822   if (orig_stmt)
5823     {
5824       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5825       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5826       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5827     }
5828
5829   /* 3. Check the operands of the operation.  The first operands are defined
5830         inside the loop body. The last operand is the reduction variable,
5831         which is defined by the loop-header-phi.  */
5832
5833   gcc_assert (is_gimple_assign (stmt));
5834
5835   /* Flatten RHS.  */
5836   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5837     {
5838     case GIMPLE_BINARY_RHS:
5839       code = gimple_assign_rhs_code (stmt);
5840       op_type = TREE_CODE_LENGTH (code);
5841       gcc_assert (op_type == binary_op);
5842       ops[0] = gimple_assign_rhs1 (stmt);
5843       ops[1] = gimple_assign_rhs2 (stmt);
5844       break;
5845
5846     case GIMPLE_TERNARY_RHS:
5847       code = gimple_assign_rhs_code (stmt);
5848       op_type = TREE_CODE_LENGTH (code);
5849       gcc_assert (op_type == ternary_op);
5850       ops[0] = gimple_assign_rhs1 (stmt);
5851       ops[1] = gimple_assign_rhs2 (stmt);
5852       ops[2] = gimple_assign_rhs3 (stmt);
5853       break;
5854
5855     case GIMPLE_UNARY_RHS:
5856       return false;
5857
5858     default:
5859       gcc_unreachable ();
5860     }
5861
5862   if (code == COND_EXPR && slp_node)
5863     return false;
5864
5865   scalar_dest = gimple_assign_lhs (stmt);
5866   scalar_type = TREE_TYPE (scalar_dest);
5867   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5868       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5869     return false;
5870
5871   /* Do not try to vectorize bit-precision reductions.  */
5872   if ((TYPE_PRECISION (scalar_type)
5873        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5874     return false;
5875
5876   /* All uses but the last are expected to be defined in the loop.
5877      The last use is the reduction variable.  In case of nested cycle this
5878      assumption is not true: we use reduc_index to record the index of the
5879      reduction variable.  */
5880   gimple *reduc_def_stmt = NULL;
5881   int reduc_index = -1;
5882   for (i = 0; i < op_type; i++)
5883     {
5884       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5885       if (i == 0 && code == COND_EXPR)
5886         continue;
5887
5888       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5889                                           &def_stmt, &dts[i], &tem);
5890       dt = dts[i];
5891       gcc_assert (is_simple_use);
5892       if (dt == vect_reduction_def)
5893         {
5894           reduc_def_stmt = def_stmt;
5895           reduc_index = i;
5896           continue;
5897         }
5898       else
5899         {
5900           if (!vectype_in)
5901             vectype_in = tem;
5902         }
5903
5904       if (dt != vect_internal_def
5905           && dt != vect_external_def
5906           && dt != vect_constant_def
5907           && dt != vect_induction_def
5908           && !(dt == vect_nested_cycle && nested_cycle))
5909         return false;
5910
5911       if (dt == vect_nested_cycle)
5912         {
5913           found_nested_cycle_def = true;
5914           reduc_def_stmt = def_stmt;
5915           reduc_index = i;
5916         }
5917
5918       if (i == 1 && code == COND_EXPR)
5919         {
5920           /* Record how value of COND_EXPR is defined.  */
5921           if (dt == vect_constant_def)
5922             {
5923               cond_reduc_dt = dt;
5924               cond_reduc_val = ops[i];
5925             }
5926           if (dt == vect_induction_def && def_stmt != NULL
5927               && is_nonwrapping_integer_induction (def_stmt, loop))
5928             cond_reduc_dt = dt;
5929         }
5930     }
5931
5932   if (!vectype_in)
5933     vectype_in = vectype_out;
5934
5935   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5936      directy used in stmt.  */
5937   if (reduc_index == -1)
5938     {
5939       if (orig_stmt)
5940         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5941       else
5942         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5943     }
5944
5945   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5946     return false;
5947
5948   if (!(reduc_index == -1
5949         || dts[reduc_index] == vect_reduction_def
5950         || dts[reduc_index] == vect_nested_cycle
5951         || ((dts[reduc_index] == vect_internal_def
5952              || dts[reduc_index] == vect_external_def
5953              || dts[reduc_index] == vect_constant_def
5954              || dts[reduc_index] == vect_induction_def)
5955             && nested_cycle && found_nested_cycle_def)))
5956     {
5957       /* For pattern recognized stmts, orig_stmt might be a reduction,
5958          but some helper statements for the pattern might not, or
5959          might be COND_EXPRs with reduction uses in the condition.  */
5960       gcc_assert (orig_stmt);
5961       return false;
5962     }
5963
5964   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5965   enum vect_reduction_type v_reduc_type
5966     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5967   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5968
5969   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5970   /* If we have a condition reduction, see if we can simplify it further.  */
5971   if (v_reduc_type == COND_REDUCTION)
5972     {
5973       if (cond_reduc_dt == vect_induction_def)
5974         {
5975           if (dump_enabled_p ())
5976             dump_printf_loc (MSG_NOTE, vect_location,
5977                              "condition expression based on "
5978                              "integer induction.\n");
5979           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5980             = INTEGER_INDUC_COND_REDUCTION;
5981         }
5982
5983       /* Loop peeling modifies initial value of reduction PHI, which
5984          makes the reduction stmt to be transformed different to the
5985          original stmt analyzed.  We need to record reduction code for
5986          CONST_COND_REDUCTION type reduction at analyzing stage, thus
5987          it can be used directly at transform stage.  */
5988       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
5989           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
5990         {
5991           /* Also set the reduction type to CONST_COND_REDUCTION.  */
5992           gcc_assert (cond_reduc_dt == vect_constant_def);
5993           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
5994         }
5995       else if (cond_reduc_dt == vect_constant_def)
5996         {
5997           enum vect_def_type cond_initial_dt;
5998           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5999           tree cond_initial_val
6000             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
6001
6002           gcc_assert (cond_reduc_val != NULL_TREE);
6003           vect_is_simple_use (cond_initial_val, loop_vinfo,
6004                               &def_stmt, &cond_initial_dt);
6005           if (cond_initial_dt == vect_constant_def
6006               && types_compatible_p (TREE_TYPE (cond_initial_val),
6007                                      TREE_TYPE (cond_reduc_val)))
6008             {
6009               tree e = fold_binary (LE_EXPR, boolean_type_node,
6010                                     cond_initial_val, cond_reduc_val);
6011               if (e && (integer_onep (e) || integer_zerop (e)))
6012                 {
6013                   if (dump_enabled_p ())
6014                     dump_printf_loc (MSG_NOTE, vect_location,
6015                                      "condition expression based on "
6016                                      "compile time constant.\n");
6017                   /* Record reduction code at analysis stage.  */
6018                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
6019                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
6020                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6021                     = CONST_COND_REDUCTION;
6022                 }
6023             }
6024         }
6025     }
6026
6027   if (orig_stmt)
6028     gcc_assert (tmp == orig_stmt
6029                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
6030   else
6031     /* We changed STMT to be the first stmt in reduction chain, hence we
6032        check that in this case the first element in the chain is STMT.  */
6033     gcc_assert (stmt == tmp
6034                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
6035
6036   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
6037     return false;
6038
6039   if (slp_node)
6040     ncopies = 1;
6041   else
6042     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6043                / TYPE_VECTOR_SUBPARTS (vectype_in));
6044
6045   gcc_assert (ncopies >= 1);
6046
6047   vec_mode = TYPE_MODE (vectype_in);
6048
6049   if (code == COND_EXPR)
6050     {
6051       /* Only call during the analysis stage, otherwise we'll lose
6052          STMT_VINFO_TYPE.  */
6053       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
6054                                                 ops[reduc_index], 0, NULL))
6055         {
6056           if (dump_enabled_p ())
6057             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6058                              "unsupported condition in reduction\n");
6059           return false;
6060         }
6061     }
6062   else
6063     {
6064       /* 4. Supportable by target?  */
6065
6066       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6067           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6068         {
6069           /* Shifts and rotates are only supported by vectorizable_shifts,
6070              not vectorizable_reduction.  */
6071           if (dump_enabled_p ())
6072             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6073                              "unsupported shift or rotation.\n");
6074           return false;
6075         }
6076
6077       /* 4.1. check support for the operation in the loop  */
6078       optab = optab_for_tree_code (code, vectype_in, optab_default);
6079       if (!optab)
6080         {
6081           if (dump_enabled_p ())
6082             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6083                              "no optab.\n");
6084
6085           return false;
6086         }
6087
6088       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6089         {
6090           if (dump_enabled_p ())
6091             dump_printf (MSG_NOTE, "op not supported by target.\n");
6092
6093           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6094               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6095                   < vect_min_worthwhile_factor (code))
6096             return false;
6097
6098           if (dump_enabled_p ())
6099             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6100         }
6101
6102       /* Worthwhile without SIMD support?  */
6103       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6104           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6105              < vect_min_worthwhile_factor (code))
6106         {
6107           if (dump_enabled_p ())
6108             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6109                              "not worthwhile without SIMD support.\n");
6110
6111           return false;
6112         }
6113     }
6114
6115   /* 4.2. Check support for the epilog operation.
6116
6117           If STMT represents a reduction pattern, then the type of the
6118           reduction variable may be different than the type of the rest
6119           of the arguments.  For example, consider the case of accumulation
6120           of shorts into an int accumulator; The original code:
6121                         S1: int_a = (int) short_a;
6122           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6123
6124           was replaced with:
6125                         STMT: int_acc = widen_sum <short_a, int_acc>
6126
6127           This means that:
6128           1. The tree-code that is used to create the vector operation in the
6129              epilog code (that reduces the partial results) is not the
6130              tree-code of STMT, but is rather the tree-code of the original
6131              stmt from the pattern that STMT is replacing.  I.e, in the example
6132              above we want to use 'widen_sum' in the loop, but 'plus' in the
6133              epilog.
6134           2. The type (mode) we use to check available target support
6135              for the vector operation to be created in the *epilog*, is
6136              determined by the type of the reduction variable (in the example
6137              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6138              However the type (mode) we use to check available target support
6139              for the vector operation to be created *inside the loop*, is
6140              determined by the type of the other arguments to STMT (in the
6141              example we'd check this: optab_handler (widen_sum_optab,
6142              vect_short_mode)).
6143
6144           This is contrary to "regular" reductions, in which the types of all
6145           the arguments are the same as the type of the reduction variable.
6146           For "regular" reductions we can therefore use the same vector type
6147           (and also the same tree-code) when generating the epilog code and
6148           when generating the code inside the loop.  */
6149
6150   if (orig_stmt)
6151     {
6152       /* This is a reduction pattern: get the vectype from the type of the
6153          reduction variable, and get the tree-code from orig_stmt.  */
6154       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6155                   == TREE_CODE_REDUCTION);
6156       orig_code = gimple_assign_rhs_code (orig_stmt);
6157       gcc_assert (vectype_out);
6158       vec_mode = TYPE_MODE (vectype_out);
6159     }
6160   else
6161     {
6162       /* Regular reduction: use the same vectype and tree-code as used for
6163          the vector code inside the loop can be used for the epilog code. */
6164       orig_code = code;
6165
6166       if (code == MINUS_EXPR)
6167         orig_code = PLUS_EXPR;
6168
6169       /* For simple condition reductions, replace with the actual expression
6170          we want to base our reduction around.  */
6171       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6172         {
6173           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6174           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6175         }
6176       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6177                  == INTEGER_INDUC_COND_REDUCTION)
6178         orig_code = MAX_EXPR;
6179     }
6180
6181   if (nested_cycle)
6182     {
6183       def_bb = gimple_bb (reduc_def_stmt);
6184       def_stmt_loop = def_bb->loop_father;
6185       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6186                                        loop_preheader_edge (def_stmt_loop));
6187       if (TREE_CODE (def_arg) == SSA_NAME
6188           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6189           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6190           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6191           && vinfo_for_stmt (def_arg_stmt)
6192           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6193               == vect_double_reduction_def)
6194         double_reduc = true;
6195     }
6196
6197   epilog_reduc_code = ERROR_MARK;
6198
6199   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6200     {
6201       if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
6202         {
6203           reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
6204                                          optab_default);
6205           if (!reduc_optab)
6206             {
6207               if (dump_enabled_p ())
6208                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6209                                  "no optab for reduction.\n");
6210
6211               epilog_reduc_code = ERROR_MARK;
6212             }
6213           else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
6214             {
6215               if (dump_enabled_p ())
6216                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6217                                  "reduc op not supported by target.\n");
6218
6219               epilog_reduc_code = ERROR_MARK;
6220             }
6221         }
6222       else
6223         {
6224           if (!nested_cycle || double_reduc)
6225             {
6226               if (dump_enabled_p ())
6227                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6228                                  "no reduc code for scalar code.\n");
6229
6230               return false;
6231             }
6232         }
6233     }
6234   else
6235     {
6236       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
6237       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6238       cr_index_vector_type = build_vector_type
6239         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
6240
6241       optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
6242                                    optab_default);
6243       if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
6244           != CODE_FOR_nothing)
6245         epilog_reduc_code = REDUC_MAX_EXPR;
6246     }
6247
6248   if ((double_reduc
6249        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6250       && ncopies > 1)
6251     {
6252       if (dump_enabled_p ())
6253         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6254                          "multiple types in double reduction or condition "
6255                          "reduction.\n");
6256       return false;
6257     }
6258
6259   /* In case of widenning multiplication by a constant, we update the type
6260      of the constant to be the type of the other operand.  We check that the
6261      constant fits the type in the pattern recognition pass.  */
6262   if (code == DOT_PROD_EXPR
6263       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6264     {
6265       if (TREE_CODE (ops[0]) == INTEGER_CST)
6266         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6267       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6268         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6269       else
6270         {
6271           if (dump_enabled_p ())
6272             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6273                              "invalid types in dot-prod\n");
6274
6275           return false;
6276         }
6277     }
6278
6279   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6280     {
6281       widest_int ni;
6282
6283       if (! max_loop_iterations (loop, &ni))
6284         {
6285           if (dump_enabled_p ())
6286             dump_printf_loc (MSG_NOTE, vect_location,
6287                              "loop count not known, cannot create cond "
6288                              "reduction.\n");
6289           return false;
6290         }
6291       /* Convert backedges to iterations.  */
6292       ni += 1;
6293
6294       /* The additional index will be the same type as the condition.  Check
6295          that the loop can fit into this less one (because we'll use up the
6296          zero slot for when there are no matches).  */
6297       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6298       if (wi::geu_p (ni, wi::to_widest (max_index)))
6299         {
6300           if (dump_enabled_p ())
6301             dump_printf_loc (MSG_NOTE, vect_location,
6302                              "loop size is greater than data size.\n");
6303           return false;
6304         }
6305     }
6306
6307   /* In case the vectorization factor (VF) is bigger than the number
6308      of elements that we can fit in a vectype (nunits), we have to generate
6309      more than one vector stmt - i.e - we need to "unroll" the
6310      vector stmt by a factor VF/nunits.  For more details see documentation
6311      in vectorizable_operation.  */
6312
6313   /* If the reduction is used in an outer loop we need to generate
6314      VF intermediate results, like so (e.g. for ncopies=2):
6315         r0 = phi (init, r0)
6316         r1 = phi (init, r1)
6317         r0 = x0 + r0;
6318         r1 = x1 + r1;
6319     (i.e. we generate VF results in 2 registers).
6320     In this case we have a separate def-use cycle for each copy, and therefore
6321     for each copy we get the vector def for the reduction variable from the
6322     respective phi node created for this copy.
6323
6324     Otherwise (the reduction is unused in the loop nest), we can combine
6325     together intermediate results, like so (e.g. for ncopies=2):
6326         r = phi (init, r)
6327         r = x0 + r;
6328         r = x1 + r;
6329    (i.e. we generate VF/2 results in a single register).
6330    In this case for each copy we get the vector def for the reduction variable
6331    from the vectorized reduction operation generated in the previous iteration.
6332
6333    This only works when we see both the reduction PHI and its only consumer
6334    in vectorizable_reduction and there are no intermediate stmts
6335    participating.  */
6336   use_operand_p use_p;
6337   gimple *use_stmt;
6338   if (ncopies > 1
6339       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6340       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6341       && (use_stmt == stmt
6342           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6343     {
6344       single_defuse_cycle = true;
6345       epilog_copies = 1;
6346     }
6347   else
6348     epilog_copies = ncopies;
6349
6350   /* If the reduction stmt is one of the patterns that have lane
6351      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6352   if ((ncopies > 1
6353        && ! single_defuse_cycle)
6354       && (code == DOT_PROD_EXPR
6355           || code == WIDEN_SUM_EXPR
6356           || code == SAD_EXPR))
6357     {
6358       if (dump_enabled_p ())
6359         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6360                          "multi def-use cycle not possible for lane-reducing "
6361                          "reduction operation\n");
6362       return false;
6363     }
6364
6365   if (!vec_stmt) /* transformation not required.  */
6366     {
6367       if (first_p)
6368         vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
6369       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6370       return true;
6371     }
6372
6373   /* Transform.  */
6374
6375   if (dump_enabled_p ())
6376     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6377
6378   /* FORNOW: Multiple types are not supported for condition.  */
6379   if (code == COND_EXPR)
6380     gcc_assert (ncopies == 1);
6381
6382   /* Create the destination vector  */
6383   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6384
6385   prev_stmt_info = NULL;
6386   prev_phi_info = NULL;
6387   if (slp_node)
6388     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6389   else
6390     {
6391       vec_num = 1;
6392       vec_oprnds0.create (1);
6393       vec_oprnds1.create (1);
6394       if (op_type == ternary_op)
6395         vec_oprnds2.create (1);
6396     }
6397
6398   phis.create (vec_num);
6399   vect_defs.create (vec_num);
6400   if (!slp_node)
6401     vect_defs.quick_push (NULL_TREE);
6402
6403   if (slp_node)
6404     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6405   else
6406     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6407
6408   for (j = 0; j < ncopies; j++)
6409     {
6410       if (code == COND_EXPR)
6411         {
6412           gcc_assert (!slp_node);
6413           vectorizable_condition (stmt, gsi, vec_stmt,
6414                                   PHI_RESULT (phis[0]),
6415                                   reduc_index, NULL);
6416           /* Multiple types are not supported for condition.  */
6417           break;
6418         }
6419
6420       /* Handle uses.  */
6421       if (j == 0)
6422         {
6423           if (slp_node)
6424             {
6425               /* Get vec defs for all the operands except the reduction index,
6426                  ensuring the ordering of the ops in the vector is kept.  */
6427               auto_vec<tree, 3> slp_ops;
6428               auto_vec<vec<tree>, 3> vec_defs;
6429
6430               slp_ops.quick_push (ops[0]);
6431               slp_ops.quick_push (ops[1]);
6432               if (op_type == ternary_op)
6433                 slp_ops.quick_push (ops[2]);
6434
6435               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6436
6437               vec_oprnds0.safe_splice (vec_defs[0]);
6438               vec_defs[0].release ();
6439               vec_oprnds1.safe_splice (vec_defs[1]);
6440               vec_defs[1].release ();
6441               if (op_type == ternary_op)
6442                 {
6443                   vec_oprnds2.safe_splice (vec_defs[2]);
6444                   vec_defs[2].release ();
6445                 }
6446             }
6447           else
6448             {
6449               vec_oprnds0.quick_push
6450                 (vect_get_vec_def_for_operand (ops[0], stmt));
6451               vec_oprnds1.quick_push
6452                 (vect_get_vec_def_for_operand (ops[1], stmt));
6453               if (op_type == ternary_op)
6454                 vec_oprnds2.quick_push
6455                   (vect_get_vec_def_for_operand (ops[2], stmt));
6456             }
6457         }
6458       else
6459         {
6460           if (!slp_node)
6461             {
6462               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6463
6464               if (single_defuse_cycle && reduc_index == 0)
6465                 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6466               else
6467                 vec_oprnds0[0]
6468                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6469               if (single_defuse_cycle && reduc_index == 1)
6470                 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6471               else
6472                 vec_oprnds1[0]
6473                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6474               if (op_type == ternary_op)
6475                 {
6476                   if (single_defuse_cycle && reduc_index == 2)
6477                     vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6478                   else
6479                     vec_oprnds2[0]
6480                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6481                 }
6482             }
6483         }
6484
6485       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6486         {
6487           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6488           if (op_type == ternary_op)
6489             vop[2] = vec_oprnds2[i];
6490
6491           new_temp = make_ssa_name (vec_dest, new_stmt);
6492           new_stmt = gimple_build_assign (new_temp, code,
6493                                           vop[0], vop[1], vop[2]);
6494           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6495
6496           if (slp_node)
6497             {
6498               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6499               vect_defs.quick_push (new_temp);
6500             }
6501           else
6502             vect_defs[0] = new_temp;
6503         }
6504
6505       if (slp_node)
6506         continue;
6507
6508       if (j == 0)
6509         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6510       else
6511         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6512
6513       prev_stmt_info = vinfo_for_stmt (new_stmt);
6514     }
6515
6516   /* Finalize the reduction-phi (set its arguments) and create the
6517      epilog reduction code.  */
6518   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6519     vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6520
6521   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6522                                     epilog_copies,
6523                                     epilog_reduc_code, phis,
6524                                     double_reduc, slp_node, slp_node_instance);
6525
6526   return true;
6527 }
6528
6529 /* Function vect_min_worthwhile_factor.
6530
6531    For a loop where we could vectorize the operation indicated by CODE,
6532    return the minimum vectorization factor that makes it worthwhile
6533    to use generic vectors.  */
6534 int
6535 vect_min_worthwhile_factor (enum tree_code code)
6536 {
6537   switch (code)
6538     {
6539     case PLUS_EXPR:
6540     case MINUS_EXPR:
6541     case NEGATE_EXPR:
6542       return 4;
6543
6544     case BIT_AND_EXPR:
6545     case BIT_IOR_EXPR:
6546     case BIT_XOR_EXPR:
6547     case BIT_NOT_EXPR:
6548       return 2;
6549
6550     default:
6551       return INT_MAX;
6552     }
6553 }
6554
6555
6556 /* Function vectorizable_induction
6557
6558    Check if PHI performs an induction computation that can be vectorized.
6559    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6560    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6561    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6562
6563 bool
6564 vectorizable_induction (gimple *phi,
6565                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6566                         gimple **vec_stmt, slp_tree slp_node)
6567 {
6568   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6569   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6570   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6571   unsigned ncopies;
6572   bool nested_in_vect_loop = false;
6573   struct loop *iv_loop;
6574   tree vec_def;
6575   edge pe = loop_preheader_edge (loop);
6576   basic_block new_bb;
6577   tree new_vec, vec_init, vec_step, t;
6578   tree new_name;
6579   gimple *new_stmt;
6580   gphi *induction_phi;
6581   tree induc_def, vec_dest;
6582   tree init_expr, step_expr;
6583   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6584   unsigned i;
6585   tree expr;
6586   gimple_seq stmts;
6587   imm_use_iterator imm_iter;
6588   use_operand_p use_p;
6589   gimple *exit_phi;
6590   edge latch_e;
6591   tree loop_arg;
6592   gimple_stmt_iterator si;
6593   basic_block bb = gimple_bb (phi);
6594
6595   if (gimple_code (phi) != GIMPLE_PHI)
6596     return false;
6597
6598   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6599     return false;
6600
6601   /* Make sure it was recognized as induction computation.  */
6602   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6603     return false;
6604
6605   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6606   unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6607
6608   if (slp_node)
6609     ncopies = 1;
6610   else
6611     ncopies = vf / nunits;
6612   gcc_assert (ncopies >= 1);
6613
6614   /* FORNOW. These restrictions should be relaxed.  */
6615   if (nested_in_vect_loop_p (loop, phi))
6616     {
6617       imm_use_iterator imm_iter;
6618       use_operand_p use_p;
6619       gimple *exit_phi;
6620       edge latch_e;
6621       tree loop_arg;
6622
6623       if (ncopies > 1)
6624         {
6625           if (dump_enabled_p ())
6626             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6627                              "multiple types in nested loop.\n");
6628           return false;
6629         }
6630
6631       /* FORNOW: outer loop induction with SLP not supported.  */
6632       if (STMT_SLP_TYPE (stmt_info))
6633         return false;
6634
6635       exit_phi = NULL;
6636       latch_e = loop_latch_edge (loop->inner);
6637       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6638       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6639         {
6640           gimple *use_stmt = USE_STMT (use_p);
6641           if (is_gimple_debug (use_stmt))
6642             continue;
6643
6644           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6645             {
6646               exit_phi = use_stmt;
6647               break;
6648             }
6649         }
6650       if (exit_phi)
6651         {
6652           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6653           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6654                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6655             {
6656               if (dump_enabled_p ())
6657                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6658                                  "inner-loop induction only used outside "
6659                                  "of the outer vectorized loop.\n");
6660               return false;
6661             }
6662         }
6663
6664       nested_in_vect_loop = true;
6665       iv_loop = loop->inner;
6666     }
6667   else
6668     iv_loop = loop;
6669   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6670
6671   if (!vec_stmt) /* transformation not required.  */
6672     {
6673       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6674       if (dump_enabled_p ())
6675         dump_printf_loc (MSG_NOTE, vect_location,
6676                          "=== vectorizable_induction ===\n");
6677       vect_model_induction_cost (stmt_info, ncopies);
6678       return true;
6679     }
6680
6681   /* Transform.  */
6682
6683   /* Compute a vector variable, initialized with the first VF values of
6684      the induction variable.  E.g., for an iv with IV_PHI='X' and
6685      evolution S, for a vector of 4 units, we want to compute:
6686      [X, X + S, X + 2*S, X + 3*S].  */
6687
6688   if (dump_enabled_p ())
6689     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6690
6691   latch_e = loop_latch_edge (iv_loop);
6692   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6693
6694   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6695   gcc_assert (step_expr != NULL_TREE);
6696
6697   pe = loop_preheader_edge (iv_loop);
6698   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6699                                      loop_preheader_edge (iv_loop));
6700
6701   /* Convert the step to the desired type.  */
6702   stmts = NULL;
6703   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6704   if (stmts)
6705     {
6706       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6707       gcc_assert (!new_bb);
6708     }
6709
6710   /* Find the first insertion point in the BB.  */
6711   si = gsi_after_labels (bb);
6712
6713   /* For SLP induction we have to generate several IVs as for example
6714      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6715      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
6716      [VF*S, VF*S, VF*S, VF*S] for all.  */
6717   if (slp_node)
6718     {
6719       /* Convert the init to the desired type.  */
6720       stmts = NULL;
6721       init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6722       if (stmts)
6723         {
6724           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6725           gcc_assert (!new_bb);
6726         }
6727
6728       /* Generate [VF*S, VF*S, ... ].  */
6729       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6730         {
6731           expr = build_int_cst (integer_type_node, vf);
6732           expr = fold_convert (TREE_TYPE (step_expr), expr);
6733         }
6734       else
6735         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6736       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6737                               expr, step_expr);
6738       if (! CONSTANT_CLASS_P (new_name))
6739         new_name = vect_init_vector (phi, new_name,
6740                                      TREE_TYPE (step_expr), NULL);
6741       new_vec = build_vector_from_val (vectype, new_name);
6742       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6743
6744       /* Now generate the IVs.  */
6745       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6746       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6747       unsigned elts = nunits * nvects;
6748       unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6749       gcc_assert (elts % group_size == 0);
6750       tree elt = init_expr;
6751       unsigned ivn;
6752       for (ivn = 0; ivn < nivs; ++ivn)
6753         {
6754           tree *elts = XALLOCAVEC (tree, nunits);
6755           bool constant_p = true;
6756           for (unsigned eltn = 0; eltn < nunits; ++eltn)
6757             {
6758               if (ivn*nunits + eltn >= group_size
6759                   && (ivn*nunits + eltn) % group_size == 0)
6760                 {
6761                   stmts = NULL;
6762                   elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6763                                       elt, step_expr);
6764                   if (stmts)
6765                     {
6766                       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6767                       gcc_assert (!new_bb);
6768                     }
6769                 }
6770               if (! CONSTANT_CLASS_P (elt))
6771                 constant_p = false;
6772               elts[eltn] = elt;
6773             }
6774           if (constant_p)
6775             new_vec = build_vector (vectype, elts);
6776           else
6777             {
6778               vec<constructor_elt, va_gc> *v;
6779               vec_alloc (v, nunits);
6780               for (i = 0; i < nunits; ++i)
6781                 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
6782               new_vec = build_constructor (vectype, v);
6783             }
6784           vec_init = vect_init_vector (phi, new_vec, vectype, NULL);
6785
6786           /* Create the induction-phi that defines the induction-operand.  */
6787           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6788           induction_phi = create_phi_node (vec_dest, iv_loop->header);
6789           set_vinfo_for_stmt (induction_phi,
6790                               new_stmt_vec_info (induction_phi, loop_vinfo));
6791           induc_def = PHI_RESULT (induction_phi);
6792
6793           /* Create the iv update inside the loop  */
6794           vec_def = make_ssa_name (vec_dest);
6795           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6796           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6797           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6798
6799           /* Set the arguments of the phi node:  */
6800           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6801           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6802                        UNKNOWN_LOCATION);
6803
6804           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6805         }
6806
6807       /* Re-use IVs when we can.  */
6808       if (ivn < nvects)
6809         {
6810           unsigned vfp
6811             = least_common_multiple (group_size, nunits) / group_size;
6812           /* Generate [VF'*S, VF'*S, ... ].  */
6813           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6814             {
6815               expr = build_int_cst (integer_type_node, vfp);
6816               expr = fold_convert (TREE_TYPE (step_expr), expr);
6817             }
6818           else
6819             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6820           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6821                                   expr, step_expr);
6822           if (! CONSTANT_CLASS_P (new_name))
6823             new_name = vect_init_vector (phi, new_name,
6824                                          TREE_TYPE (step_expr), NULL);
6825           new_vec = build_vector_from_val (vectype, new_name);
6826           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6827           for (; ivn < nvects; ++ivn)
6828             {
6829               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6830               tree def;
6831               if (gimple_code (iv) == GIMPLE_PHI)
6832                 def = gimple_phi_result (iv);
6833               else
6834                 def = gimple_assign_lhs (iv);
6835               new_stmt = gimple_build_assign (make_ssa_name (vectype),
6836                                               PLUS_EXPR,
6837                                               def, vec_step);
6838               if (gimple_code (iv) == GIMPLE_PHI)
6839                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6840               else
6841                 {
6842                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6843                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6844                 }
6845               set_vinfo_for_stmt (new_stmt,
6846                                   new_stmt_vec_info (new_stmt, loop_vinfo));
6847               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6848             }
6849         }
6850
6851       return true;
6852     }
6853
6854   /* Create the vector that holds the initial_value of the induction.  */
6855   if (nested_in_vect_loop)
6856     {
6857       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
6858          been created during vectorization of previous stmts.  We obtain it
6859          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
6860       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6861       /* If the initial value is not of proper type, convert it.  */
6862       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6863         {
6864           new_stmt
6865             = gimple_build_assign (vect_get_new_ssa_name (vectype,
6866                                                           vect_simple_var,
6867                                                           "vec_iv_"),
6868                                    VIEW_CONVERT_EXPR,
6869                                    build1 (VIEW_CONVERT_EXPR, vectype,
6870                                            vec_init));
6871           vec_init = gimple_assign_lhs (new_stmt);
6872           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6873                                                  new_stmt);
6874           gcc_assert (!new_bb);
6875           set_vinfo_for_stmt (new_stmt,
6876                               new_stmt_vec_info (new_stmt, loop_vinfo));
6877         }
6878     }
6879   else
6880     {
6881       vec<constructor_elt, va_gc> *v;
6882
6883       /* iv_loop is the loop to be vectorized. Create:
6884          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
6885       stmts = NULL;
6886       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6887
6888       vec_alloc (v, nunits);
6889       bool constant_p = is_gimple_min_invariant (new_name);
6890       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
6891       for (i = 1; i < nunits; i++)
6892         {
6893           /* Create: new_name_i = new_name + step_expr  */
6894           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6895                                    new_name, step_expr);
6896           if (!is_gimple_min_invariant (new_name))
6897             constant_p = false;
6898           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
6899         }
6900       if (stmts)
6901         {
6902           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6903           gcc_assert (!new_bb);
6904         }
6905
6906       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
6907       if (constant_p)
6908         new_vec = build_vector_from_ctor (vectype, v);
6909       else
6910         new_vec = build_constructor (vectype, v);
6911       vec_init = vect_init_vector (phi, new_vec, vectype, NULL);
6912     }
6913
6914
6915   /* Create the vector that holds the step of the induction.  */
6916   if (nested_in_vect_loop)
6917     /* iv_loop is nested in the loop to be vectorized. Generate:
6918        vec_step = [S, S, S, S]  */
6919     new_name = step_expr;
6920   else
6921     {
6922       /* iv_loop is the loop to be vectorized. Generate:
6923           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
6924       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6925         {
6926           expr = build_int_cst (integer_type_node, vf);
6927           expr = fold_convert (TREE_TYPE (step_expr), expr);
6928         }
6929       else
6930         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6931       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6932                               expr, step_expr);
6933       if (TREE_CODE (step_expr) == SSA_NAME)
6934         new_name = vect_init_vector (phi, new_name,
6935                                      TREE_TYPE (step_expr), NULL);
6936     }
6937
6938   t = unshare_expr (new_name);
6939   gcc_assert (CONSTANT_CLASS_P (new_name)
6940               || TREE_CODE (new_name) == SSA_NAME);
6941   new_vec = build_vector_from_val (vectype, t);
6942   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6943
6944
6945   /* Create the following def-use cycle:
6946      loop prolog:
6947          vec_init = ...
6948          vec_step = ...
6949      loop:
6950          vec_iv = PHI <vec_init, vec_loop>
6951          ...
6952          STMT
6953          ...
6954          vec_loop = vec_iv + vec_step;  */
6955
6956   /* Create the induction-phi that defines the induction-operand.  */
6957   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6958   induction_phi = create_phi_node (vec_dest, iv_loop->header);
6959   set_vinfo_for_stmt (induction_phi,
6960                       new_stmt_vec_info (induction_phi, loop_vinfo));
6961   induc_def = PHI_RESULT (induction_phi);
6962
6963   /* Create the iv update inside the loop  */
6964   vec_def = make_ssa_name (vec_dest);
6965   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6966   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6967   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6968
6969   /* Set the arguments of the phi node:  */
6970   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6971   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6972                UNKNOWN_LOCATION);
6973
6974   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
6975
6976   /* In case that vectorization factor (VF) is bigger than the number
6977      of elements that we can fit in a vectype (nunits), we have to generate
6978      more than one vector stmt - i.e - we need to "unroll" the
6979      vector stmt by a factor VF/nunits.  For more details see documentation
6980      in vectorizable_operation.  */
6981
6982   if (ncopies > 1)
6983     {
6984       stmt_vec_info prev_stmt_vinfo;
6985       /* FORNOW. This restriction should be relaxed.  */
6986       gcc_assert (!nested_in_vect_loop);
6987
6988       /* Create the vector that holds the step of the induction.  */
6989       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6990         {
6991           expr = build_int_cst (integer_type_node, nunits);
6992           expr = fold_convert (TREE_TYPE (step_expr), expr);
6993         }
6994       else
6995         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
6996       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6997                               expr, step_expr);
6998       if (TREE_CODE (step_expr) == SSA_NAME)
6999         new_name = vect_init_vector (phi, new_name,
7000                                      TREE_TYPE (step_expr), NULL);
7001       t = unshare_expr (new_name);
7002       gcc_assert (CONSTANT_CLASS_P (new_name)
7003                   || TREE_CODE (new_name) == SSA_NAME);
7004       new_vec = build_vector_from_val (vectype, t);
7005       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
7006
7007       vec_def = induc_def;
7008       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
7009       for (i = 1; i < ncopies; i++)
7010         {
7011           /* vec_i = vec_prev + vec_step  */
7012           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
7013                                           vec_def, vec_step);
7014           vec_def = make_ssa_name (vec_dest, new_stmt);
7015           gimple_assign_set_lhs (new_stmt, vec_def);
7016
7017           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
7018           set_vinfo_for_stmt (new_stmt,
7019                               new_stmt_vec_info (new_stmt, loop_vinfo));
7020           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
7021           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
7022         }
7023     }
7024
7025   if (nested_in_vect_loop)
7026     {
7027       /* Find the loop-closed exit-phi of the induction, and record
7028          the final vector of induction results:  */
7029       exit_phi = NULL;
7030       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
7031         {
7032           gimple *use_stmt = USE_STMT (use_p);
7033           if (is_gimple_debug (use_stmt))
7034             continue;
7035
7036           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
7037             {
7038               exit_phi = use_stmt;
7039               break;
7040             }
7041         }
7042       if (exit_phi)
7043         {
7044           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
7045           /* FORNOW. Currently not supporting the case that an inner-loop induction
7046              is not used in the outer-loop (i.e. only outside the outer-loop).  */
7047           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
7048                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
7049
7050           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
7051           if (dump_enabled_p ())
7052             {
7053               dump_printf_loc (MSG_NOTE, vect_location,
7054                                "vector of inductions after inner-loop:");
7055               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
7056             }
7057         }
7058     }
7059
7060
7061   if (dump_enabled_p ())
7062     {
7063       dump_printf_loc (MSG_NOTE, vect_location,
7064                        "transform induction: created def-use cycle: ");
7065       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7066       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7067                         SSA_NAME_DEF_STMT (vec_def), 0);
7068     }
7069
7070   return true;
7071 }
7072
7073 /* Function vectorizable_live_operation.
7074
7075    STMT computes a value that is used outside the loop.  Check if
7076    it can be supported.  */
7077
7078 bool
7079 vectorizable_live_operation (gimple *stmt,
7080                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7081                              slp_tree slp_node, int slp_index,
7082                              gimple **vec_stmt)
7083 {
7084   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7085   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7086   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7087   imm_use_iterator imm_iter;
7088   tree lhs, lhs_type, bitsize, vec_bitsize;
7089   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7090   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
7091   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
7092   gimple *use_stmt;
7093   auto_vec<tree> vec_oprnds;
7094
7095   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7096
7097   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7098     return false;
7099
7100   /* FORNOW.  CHECKME.  */
7101   if (nested_in_vect_loop_p (loop, stmt))
7102     return false;
7103
7104   /* If STMT is not relevant and it is a simple assignment and its inputs are
7105      invariant then it can remain in place, unvectorized.  The original last
7106      scalar value that it computes will be used.  */
7107   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7108     {
7109       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7110       if (dump_enabled_p ())
7111         dump_printf_loc (MSG_NOTE, vect_location,
7112                          "statement is simple and uses invariant.  Leaving in "
7113                          "place.\n");
7114       return true;
7115     }
7116
7117   if (!vec_stmt)
7118     /* No transformation required.  */
7119     return true;
7120
7121   /* If stmt has a related stmt, then use that for getting the lhs.  */
7122   if (is_pattern_stmt_p (stmt_info))
7123     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7124
7125   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7126         : gimple_get_lhs (stmt);
7127   lhs_type = TREE_TYPE (lhs);
7128
7129   bitsize = TYPE_SIZE (TREE_TYPE (vectype));
7130   vec_bitsize = TYPE_SIZE (vectype);
7131
7132   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7133   tree vec_lhs, bitstart;
7134   if (slp_node)
7135     {
7136       gcc_assert (slp_index >= 0);
7137
7138       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7139       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7140
7141       /* Get the last occurrence of the scalar index from the concatenation of
7142          all the slp vectors. Calculate which slp vector it is and the index
7143          within.  */
7144       int pos = (num_vec * nunits) - num_scalar + slp_index;
7145       int vec_entry = pos / nunits;
7146       int vec_index = pos % nunits;
7147
7148       /* Get the correct slp vectorized stmt.  */
7149       vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7150
7151       /* Get entry to use.  */
7152       bitstart = build_int_cst (unsigned_type_node, vec_index);
7153       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7154     }
7155   else
7156     {
7157       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7158       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7159
7160       /* For multiple copies, get the last copy.  */
7161       for (int i = 1; i < ncopies; ++i)
7162         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7163                                                   vec_lhs);
7164
7165       /* Get the last lane in the vector.  */
7166       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7167     }
7168
7169   /* Create a new vectorized stmt for the uses of STMT and insert outside the
7170      loop.  */
7171   gimple_seq stmts = NULL;
7172   tree bftype = TREE_TYPE (vectype);
7173   if (VECTOR_BOOLEAN_TYPE_P (vectype))
7174     bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7175   tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7176   new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7177                                    true, NULL_TREE);
7178   if (stmts)
7179     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7180
7181   /* Replace use of lhs with newly computed result.  If the use stmt is a
7182      single arg PHI, just replace all uses of PHI result.  It's necessary
7183      because lcssa PHI defining lhs may be before newly inserted stmt.  */
7184   use_operand_p use_p;
7185   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7186     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7187         && !is_gimple_debug (use_stmt))
7188     {
7189       if (gimple_code (use_stmt) == GIMPLE_PHI
7190           && gimple_phi_num_args (use_stmt) == 1)
7191         {
7192           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7193         }
7194       else
7195         {
7196           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7197             SET_USE (use_p, new_tree);
7198         }
7199       update_stmt (use_stmt);
7200     }
7201
7202   return true;
7203 }
7204
7205 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
7206
7207 static void
7208 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7209 {
7210   ssa_op_iter op_iter;
7211   imm_use_iterator imm_iter;
7212   def_operand_p def_p;
7213   gimple *ustmt;
7214
7215   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7216     {
7217       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7218         {
7219           basic_block bb;
7220
7221           if (!is_gimple_debug (ustmt))
7222             continue;
7223
7224           bb = gimple_bb (ustmt);
7225
7226           if (!flow_bb_inside_loop_p (loop, bb))
7227             {
7228               if (gimple_debug_bind_p (ustmt))
7229                 {
7230                   if (dump_enabled_p ())
7231                     dump_printf_loc (MSG_NOTE, vect_location,
7232                                      "killing debug use\n");
7233
7234                   gimple_debug_bind_reset_value (ustmt);
7235                   update_stmt (ustmt);
7236                 }
7237               else
7238                 gcc_unreachable ();
7239             }
7240         }
7241     }
7242 }
7243
7244 /* Given loop represented by LOOP_VINFO, return true if computation of
7245    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7246    otherwise.  */
7247
7248 static bool
7249 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7250 {
7251   /* Constant case.  */
7252   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7253     {
7254       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7255       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7256
7257       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7258       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7259       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7260         return true;
7261     }
7262
7263   widest_int max;
7264   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7265   /* Check the upper bound of loop niters.  */
7266   if (get_max_loop_iterations (loop, &max))
7267     {
7268       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7269       signop sgn = TYPE_SIGN (type);
7270       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7271       if (max < type_max)
7272         return true;
7273     }
7274   return false;
7275 }
7276
7277 /* Scale profiling counters by estimation for LOOP which is vectorized
7278    by factor VF.  */
7279
7280 static void
7281 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7282 {
7283   edge preheader = loop_preheader_edge (loop);
7284   /* Reduce loop iterations by the vectorization factor.  */
7285   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7286   profile_count freq_h = loop->header->count, freq_e = preheader->count;
7287
7288   /* Use frequency only if counts are zero.  */
7289   if (!(freq_h > 0) && !(freq_e > 0))
7290     {
7291       freq_h = profile_count::from_gcov_type (loop->header->frequency);
7292       freq_e = profile_count::from_gcov_type (EDGE_FREQUENCY (preheader));
7293     }
7294   if (freq_h > 0)
7295     {
7296       profile_probability p;
7297
7298       /* Avoid dropping loop body profile counter to 0 because of zero count
7299          in loop's preheader.  */
7300       if (!(freq_e > profile_count::from_gcov_type (1)))
7301        freq_e = profile_count::from_gcov_type (1);
7302       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7303       scale_loop_frequencies (loop, p);
7304     }
7305
7306   basic_block exit_bb = single_pred (loop->latch);
7307   edge exit_e = single_exit (loop);
7308   exit_e->count = loop_preheader_edge (loop)->count;
7309   exit_e->probability = profile_probability::always ()
7310                                  .apply_scale (1, new_est_niter + 1);
7311
7312   edge exit_l = single_pred_edge (loop->latch);
7313   profile_probability prob = exit_l->probability;
7314   exit_l->probability = exit_e->probability.invert ();
7315   exit_l->count = exit_bb->count - exit_e->count;
7316   if (prob.initialized_p () && exit_l->probability.initialized_p ())
7317     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7318 }
7319
7320 /* Function vect_transform_loop.
7321
7322    The analysis phase has determined that the loop is vectorizable.
7323    Vectorize the loop - created vectorized stmts to replace the scalar
7324    stmts in the loop, and update the loop exit condition.
7325    Returns scalar epilogue loop if any.  */
7326
7327 struct loop *
7328 vect_transform_loop (loop_vec_info loop_vinfo)
7329 {
7330   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7331   struct loop *epilogue = NULL;
7332   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7333   int nbbs = loop->num_nodes;
7334   int i;
7335   tree niters_vector = NULL;
7336   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7337   bool grouped_store;
7338   bool slp_scheduled = false;
7339   gimple *stmt, *pattern_stmt;
7340   gimple_seq pattern_def_seq = NULL;
7341   gimple_stmt_iterator pattern_def_si = gsi_none ();
7342   bool transform_pattern_stmt = false;
7343   bool check_profitability = false;
7344   int th;
7345
7346   if (dump_enabled_p ())
7347     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
7348
7349   /* Use the more conservative vectorization threshold.  If the number
7350      of iterations is constant assume the cost check has been performed
7351      by our caller.  If the threshold makes all loops profitable that
7352      run at least the vectorization factor number of times checking
7353      is pointless, too.  */
7354   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7355   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7356       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7357     {
7358       if (dump_enabled_p ())
7359         dump_printf_loc (MSG_NOTE, vect_location,
7360                          "Profitability threshold is %d loop iterations.\n",
7361                          th);
7362       check_profitability = true;
7363     }
7364
7365   /* Make sure there exists a single-predecessor exit bb.  Do this before
7366      versioning.   */
7367   edge e = single_exit (loop);
7368   if (! single_pred_p (e->dest))
7369     {
7370       split_loop_exit_edge (e);
7371       if (dump_enabled_p ())
7372         dump_printf (MSG_NOTE, "split exit edge\n");
7373     }
7374
7375   /* Version the loop first, if required, so the profitability check
7376      comes first.  */
7377
7378   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7379     {
7380       vect_loop_versioning (loop_vinfo, th, check_profitability);
7381       check_profitability = false;
7382     }
7383
7384   /* Make sure there exists a single-predecessor exit bb also on the
7385      scalar loop copy.  Do this after versioning but before peeling
7386      so CFG structure is fine for both scalar and if-converted loop
7387      to make slpeel_duplicate_current_defs_from_edges face matched
7388      loop closed PHI nodes on the exit.  */
7389   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7390     {
7391       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7392       if (! single_pred_p (e->dest))
7393         {
7394           split_loop_exit_edge (e);
7395           if (dump_enabled_p ())
7396             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7397         }
7398     }
7399
7400   tree niters = vect_build_loop_niters (loop_vinfo);
7401   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7402   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7403   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7404   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
7405                               check_profitability, niters_no_overflow);
7406   if (niters_vector == NULL_TREE)
7407     {
7408       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7409         niters_vector
7410           = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7411                            LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
7412       else
7413         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7414                                      niters_no_overflow);
7415     }
7416
7417   /* 1) Make sure the loop header has exactly two entries
7418      2) Make sure we have a preheader basic block.  */
7419
7420   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7421
7422   split_edge (loop_preheader_edge (loop));
7423
7424   /* FORNOW: the vectorizer supports only loops which body consist
7425      of one basic block (header + empty latch). When the vectorizer will
7426      support more involved loop forms, the order by which the BBs are
7427      traversed need to be reconsidered.  */
7428
7429   for (i = 0; i < nbbs; i++)
7430     {
7431       basic_block bb = bbs[i];
7432       stmt_vec_info stmt_info;
7433
7434       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7435            gsi_next (&si))
7436         {
7437           gphi *phi = si.phi ();
7438           if (dump_enabled_p ())
7439             {
7440               dump_printf_loc (MSG_NOTE, vect_location,
7441                                "------>vectorizing phi: ");
7442               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7443             }
7444           stmt_info = vinfo_for_stmt (phi);
7445           if (!stmt_info)
7446             continue;
7447
7448           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7449             vect_loop_kill_debug_uses (loop, phi);
7450
7451           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7452               && !STMT_VINFO_LIVE_P (stmt_info))
7453             continue;
7454
7455           if (STMT_VINFO_VECTYPE (stmt_info)
7456               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7457                   != (unsigned HOST_WIDE_INT) vf)
7458               && dump_enabled_p ())
7459             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7460
7461           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7462                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7463                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7464               && ! PURE_SLP_STMT (stmt_info))
7465             {
7466               if (dump_enabled_p ())
7467                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7468               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7469             }
7470         }
7471
7472       pattern_stmt = NULL;
7473       for (gimple_stmt_iterator si = gsi_start_bb (bb);
7474            !gsi_end_p (si) || transform_pattern_stmt;)
7475         {
7476           bool is_store;
7477
7478           if (transform_pattern_stmt)
7479             stmt = pattern_stmt;
7480           else
7481             {
7482               stmt = gsi_stmt (si);
7483               /* During vectorization remove existing clobber stmts.  */
7484               if (gimple_clobber_p (stmt))
7485                 {
7486                   unlink_stmt_vdef (stmt);
7487                   gsi_remove (&si, true);
7488                   release_defs (stmt);
7489                   continue;
7490                 }
7491             }
7492
7493           if (dump_enabled_p ())
7494             {
7495               dump_printf_loc (MSG_NOTE, vect_location,
7496                                "------>vectorizing statement: ");
7497               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7498             }
7499
7500           stmt_info = vinfo_for_stmt (stmt);
7501
7502           /* vector stmts created in the outer-loop during vectorization of
7503              stmts in an inner-loop may not have a stmt_info, and do not
7504              need to be vectorized.  */
7505           if (!stmt_info)
7506             {
7507               gsi_next (&si);
7508               continue;
7509             }
7510
7511           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7512             vect_loop_kill_debug_uses (loop, stmt);
7513
7514           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7515               && !STMT_VINFO_LIVE_P (stmt_info))
7516             {
7517               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7518                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7519                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7520                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7521                 {
7522                   stmt = pattern_stmt;
7523                   stmt_info = vinfo_for_stmt (stmt);
7524                 }
7525               else
7526                 {
7527                   gsi_next (&si);
7528                   continue;
7529                 }
7530             }
7531           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7532                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7533                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7534                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7535             transform_pattern_stmt = true;
7536
7537           /* If pattern statement has def stmts, vectorize them too.  */
7538           if (is_pattern_stmt_p (stmt_info))
7539             {
7540               if (pattern_def_seq == NULL)
7541                 {
7542                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7543                   pattern_def_si = gsi_start (pattern_def_seq);
7544                 }
7545               else if (!gsi_end_p (pattern_def_si))
7546                 gsi_next (&pattern_def_si);
7547               if (pattern_def_seq != NULL)
7548                 {
7549                   gimple *pattern_def_stmt = NULL;
7550                   stmt_vec_info pattern_def_stmt_info = NULL;
7551
7552                   while (!gsi_end_p (pattern_def_si))
7553                     {
7554                       pattern_def_stmt = gsi_stmt (pattern_def_si);
7555                       pattern_def_stmt_info
7556                         = vinfo_for_stmt (pattern_def_stmt);
7557                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7558                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7559                         break;
7560                       gsi_next (&pattern_def_si);
7561                     }
7562
7563                   if (!gsi_end_p (pattern_def_si))
7564                     {
7565                       if (dump_enabled_p ())
7566                         {
7567                           dump_printf_loc (MSG_NOTE, vect_location,
7568                                            "==> vectorizing pattern def "
7569                                            "stmt: ");
7570                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7571                                             pattern_def_stmt, 0);
7572                         }
7573
7574                       stmt = pattern_def_stmt;
7575                       stmt_info = pattern_def_stmt_info;
7576                     }
7577                   else
7578                     {
7579                       pattern_def_si = gsi_none ();
7580                       transform_pattern_stmt = false;
7581                     }
7582                 }
7583               else
7584                 transform_pattern_stmt = false;
7585             }
7586
7587           if (STMT_VINFO_VECTYPE (stmt_info))
7588             {
7589               unsigned int nunits
7590                 = (unsigned int)
7591                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7592               if (!STMT_SLP_TYPE (stmt_info)
7593                   && nunits != (unsigned int) vf
7594                   && dump_enabled_p ())
7595                   /* For SLP VF is set according to unrolling factor, and not
7596                      to vector size, hence for SLP this print is not valid.  */
7597                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7598             }
7599
7600           /* SLP. Schedule all the SLP instances when the first SLP stmt is
7601              reached.  */
7602           if (STMT_SLP_TYPE (stmt_info))
7603             {
7604               if (!slp_scheduled)
7605                 {
7606                   slp_scheduled = true;
7607
7608                   if (dump_enabled_p ())
7609                     dump_printf_loc (MSG_NOTE, vect_location,
7610                                      "=== scheduling SLP instances ===\n");
7611
7612                   vect_schedule_slp (loop_vinfo);
7613                 }
7614
7615               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
7616               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7617                 {
7618                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7619                     {
7620                       pattern_def_seq = NULL;
7621                       gsi_next (&si);
7622                     }
7623                   continue;
7624                 }
7625             }
7626
7627           /* -------- vectorize statement ------------ */
7628           if (dump_enabled_p ())
7629             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7630
7631           grouped_store = false;
7632           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7633           if (is_store)
7634             {
7635               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7636                 {
7637                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7638                      interleaving chain was completed - free all the stores in
7639                      the chain.  */
7640                   gsi_next (&si);
7641                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7642                 }
7643               else
7644                 {
7645                   /* Free the attached stmt_vec_info and remove the stmt.  */
7646                   gimple *store = gsi_stmt (si);
7647                   free_stmt_vec_info (store);
7648                   unlink_stmt_vdef (store);
7649                   gsi_remove (&si, true);
7650                   release_defs (store);
7651                 }
7652
7653               /* Stores can only appear at the end of pattern statements.  */
7654               gcc_assert (!transform_pattern_stmt);
7655               pattern_def_seq = NULL;
7656             }
7657           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7658             {
7659               pattern_def_seq = NULL;
7660               gsi_next (&si);
7661             }
7662         }                       /* stmts in BB */
7663     }                           /* BBs in loop */
7664
7665   slpeel_make_loop_iterate_ntimes (loop, niters_vector);
7666
7667   scale_profile_for_vect_loop (loop, vf);
7668
7669   /* The minimum number of iterations performed by the epilogue.  This
7670      is 1 when peeling for gaps because we always need a final scalar
7671      iteration.  */
7672   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7673   /* +1 to convert latch counts to loop iteration counts,
7674      -min_epilogue_iters to remove iterations that cannot be performed
7675        by the vector code.  */
7676   int bias = 1 - min_epilogue_iters;
7677   /* In these calculations the "- 1" converts loop iteration counts
7678      back to latch counts.  */
7679   if (loop->any_upper_bound)
7680     loop->nb_iterations_upper_bound
7681       = wi::udiv_floor (loop->nb_iterations_upper_bound + bias, vf) - 1;
7682   if (loop->any_likely_upper_bound)
7683     loop->nb_iterations_likely_upper_bound
7684       = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, vf) - 1;
7685   if (loop->any_estimate)
7686     loop->nb_iterations_estimate
7687       = wi::udiv_floor (loop->nb_iterations_estimate + bias, vf) - 1;
7688
7689   if (dump_enabled_p ())
7690     {
7691       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7692         {
7693           dump_printf_loc (MSG_NOTE, vect_location,
7694                            "LOOP VECTORIZED\n");
7695           if (loop->inner)
7696             dump_printf_loc (MSG_NOTE, vect_location,
7697                              "OUTER LOOP VECTORIZED\n");
7698           dump_printf (MSG_NOTE, "\n");
7699         }
7700       else
7701         dump_printf_loc (MSG_NOTE, vect_location,
7702                          "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
7703                          current_vector_size);
7704     }
7705
7706   /* Free SLP instances here because otherwise stmt reference counting
7707      won't work.  */
7708   slp_instance instance;
7709   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7710     vect_free_slp_instance (instance);
7711   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7712   /* Clear-up safelen field since its value is invalid after vectorization
7713      since vectorized loop can have loop-carried dependencies.  */
7714   loop->safelen = 0;
7715
7716   /* Don't vectorize epilogue for epilogue.  */
7717   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7718     epilogue = NULL;
7719
7720   if (epilogue)
7721     {
7722         unsigned int vector_sizes
7723           = targetm.vectorize.autovectorize_vector_sizes ();
7724         vector_sizes &= current_vector_size - 1;
7725
7726         if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7727           epilogue = NULL;
7728         else if (!vector_sizes)
7729           epilogue = NULL;
7730         else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7731                  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
7732           {
7733             int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
7734             int ratio = current_vector_size / smallest_vec_size;
7735             int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
7736               - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
7737             eiters = eiters % vf;
7738
7739             epilogue->nb_iterations_upper_bound = eiters - 1;
7740
7741             if (eiters < vf / ratio)
7742               epilogue = NULL;
7743             }
7744     }
7745
7746   if (epilogue)
7747     {
7748       epilogue->force_vectorize = loop->force_vectorize;
7749       epilogue->safelen = loop->safelen;
7750       epilogue->dont_vectorize = false;
7751
7752       /* We may need to if-convert epilogue to vectorize it.  */
7753       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7754         tree_if_conversion (epilogue);
7755     }
7756
7757   return epilogue;
7758 }
7759
7760 /* The code below is trying to perform simple optimization - revert
7761    if-conversion for masked stores, i.e. if the mask of a store is zero
7762    do not perform it and all stored value producers also if possible.
7763    For example,
7764      for (i=0; i<n; i++)
7765        if (c[i])
7766         {
7767           p1[i] += 1;
7768           p2[i] = p3[i] +2;
7769         }
7770    this transformation will produce the following semi-hammock:
7771
7772    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7773      {
7774        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7775        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7776        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7777        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7778        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7779        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7780      }
7781 */
7782
7783 void
7784 optimize_mask_stores (struct loop *loop)
7785 {
7786   basic_block *bbs = get_loop_body (loop);
7787   unsigned nbbs = loop->num_nodes;
7788   unsigned i;
7789   basic_block bb;
7790   struct loop *bb_loop;
7791   gimple_stmt_iterator gsi;
7792   gimple *stmt;
7793   auto_vec<gimple *> worklist;
7794
7795   vect_location = find_loop_location (loop);
7796   /* Pick up all masked stores in loop if any.  */
7797   for (i = 0; i < nbbs; i++)
7798     {
7799       bb = bbs[i];
7800       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7801            gsi_next (&gsi))
7802         {
7803           stmt = gsi_stmt (gsi);
7804           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7805             worklist.safe_push (stmt);
7806         }
7807     }
7808
7809   free (bbs);
7810   if (worklist.is_empty ())
7811     return;
7812
7813   /* Loop has masked stores.  */
7814   while (!worklist.is_empty ())
7815     {
7816       gimple *last, *last_store;
7817       edge e, efalse;
7818       tree mask;
7819       basic_block store_bb, join_bb;
7820       gimple_stmt_iterator gsi_to;
7821       tree vdef, new_vdef;
7822       gphi *phi;
7823       tree vectype;
7824       tree zero;
7825
7826       last = worklist.pop ();
7827       mask = gimple_call_arg (last, 2);
7828       bb = gimple_bb (last);
7829       /* Create then_bb and if-then structure in CFG, then_bb belongs to
7830          the same loop as if_bb.  It could be different to LOOP when two
7831          level loop-nest is vectorized and mask_store belongs to the inner
7832          one.  */
7833       e = split_block (bb, last);
7834       bb_loop = bb->loop_father;
7835       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7836       join_bb = e->dest;
7837       store_bb = create_empty_bb (bb);
7838       add_bb_to_loop (store_bb, bb_loop);
7839       e->flags = EDGE_TRUE_VALUE;
7840       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7841       /* Put STORE_BB to likely part.  */
7842       efalse->probability = profile_probability::unlikely ();
7843       store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
7844       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7845       if (dom_info_available_p (CDI_DOMINATORS))
7846         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7847       if (dump_enabled_p ())
7848         dump_printf_loc (MSG_NOTE, vect_location,
7849                          "Create new block %d to sink mask stores.",
7850                          store_bb->index);
7851       /* Create vector comparison with boolean result.  */
7852       vectype = TREE_TYPE (mask);
7853       zero = build_zero_cst (vectype);
7854       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7855       gsi = gsi_last_bb (bb);
7856       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7857       /* Create new PHI node for vdef of the last masked store:
7858          .MEM_2 = VDEF <.MEM_1>
7859          will be converted to
7860          .MEM.3 = VDEF <.MEM_1>
7861          and new PHI node will be created in join bb
7862          .MEM_2 = PHI <.MEM_1, .MEM_3>
7863       */
7864       vdef = gimple_vdef (last);
7865       new_vdef = make_ssa_name (gimple_vop (cfun), last);
7866       gimple_set_vdef (last, new_vdef);
7867       phi = create_phi_node (vdef, join_bb);
7868       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7869
7870       /* Put all masked stores with the same mask to STORE_BB if possible.  */
7871       while (true)
7872         {
7873           gimple_stmt_iterator gsi_from;
7874           gimple *stmt1 = NULL;
7875
7876           /* Move masked store to STORE_BB.  */
7877           last_store = last;
7878           gsi = gsi_for_stmt (last);
7879           gsi_from = gsi;
7880           /* Shift GSI to the previous stmt for further traversal.  */
7881           gsi_prev (&gsi);
7882           gsi_to = gsi_start_bb (store_bb);
7883           gsi_move_before (&gsi_from, &gsi_to);
7884           /* Setup GSI_TO to the non-empty block start.  */
7885           gsi_to = gsi_start_bb (store_bb);
7886           if (dump_enabled_p ())
7887             {
7888               dump_printf_loc (MSG_NOTE, vect_location,
7889                                "Move stmt to created bb\n");
7890               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7891             }
7892           /* Move all stored value producers if possible.  */
7893           while (!gsi_end_p (gsi))
7894             {
7895               tree lhs;
7896               imm_use_iterator imm_iter;
7897               use_operand_p use_p;
7898               bool res;
7899
7900               /* Skip debug statements.  */
7901               if (is_gimple_debug (gsi_stmt (gsi)))
7902                 {
7903                   gsi_prev (&gsi);
7904                   continue;
7905                 }
7906               stmt1 = gsi_stmt (gsi);
7907               /* Do not consider statements writing to memory or having
7908                  volatile operand.  */
7909               if (gimple_vdef (stmt1)
7910                   || gimple_has_volatile_ops (stmt1))
7911                 break;
7912               gsi_from = gsi;
7913               gsi_prev (&gsi);
7914               lhs = gimple_get_lhs (stmt1);
7915               if (!lhs)
7916                 break;
7917
7918               /* LHS of vectorized stmt must be SSA_NAME.  */
7919               if (TREE_CODE (lhs) != SSA_NAME)
7920                 break;
7921
7922               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7923                 {
7924                   /* Remove dead scalar statement.  */
7925                   if (has_zero_uses (lhs))
7926                     {
7927                       gsi_remove (&gsi_from, true);
7928                       continue;
7929                     }
7930                 }
7931
7932               /* Check that LHS does not have uses outside of STORE_BB.  */
7933               res = true;
7934               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7935                 {
7936                   gimple *use_stmt;
7937                   use_stmt = USE_STMT (use_p);
7938                   if (is_gimple_debug (use_stmt))
7939                     continue;
7940                   if (gimple_bb (use_stmt) != store_bb)
7941                     {
7942                       res = false;
7943                       break;
7944                     }
7945                 }
7946               if (!res)
7947                 break;
7948
7949               if (gimple_vuse (stmt1)
7950                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
7951                 break;
7952
7953               /* Can move STMT1 to STORE_BB.  */
7954               if (dump_enabled_p ())
7955                 {
7956                   dump_printf_loc (MSG_NOTE, vect_location,
7957                                    "Move stmt to created bb\n");
7958                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7959                 }
7960               gsi_move_before (&gsi_from, &gsi_to);
7961               /* Shift GSI_TO for further insertion.  */
7962               gsi_prev (&gsi_to);
7963             }
7964           /* Put other masked stores with the same mask to STORE_BB.  */
7965           if (worklist.is_empty ()
7966               || gimple_call_arg (worklist.last (), 2) != mask
7967               || worklist.last () != stmt1)
7968             break;
7969           last = worklist.pop ();
7970         }
7971       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
7972     }
7973 }