gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53
  54 /* Loop Vectorization Pass.
  55
  56    This pass tries to vectorize loops.
  57
  58    For example, the vectorizer transforms the following simple loop:
  59
  60         short a[N]; short b[N]; short c[N]; int i;
  61
  62         for (i=0; i<N; i++){
  63           a[i] = b[i] + c[i];
  64         }
  65
  66    as if it was manually vectorized by rewriting the source code into:
  67
  68         typedef int __attribute__((mode(V8HI))) v8hi;
  69         short a[N];  short b[N]; short c[N];   int i;
  70         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  71         v8hi va, vb, vc;
  72
  73         for (i=0; i<N/8; i++){
  74           vb = pb[i];
  75           vc = pc[i];
  76           va = vb + vc;
  77           pa[i] = va;
  78         }
  79
  80         The main entry to this pass is vectorize_loops(), in which
  81    the vectorizer applies a set of analyses on a given set of loops,
  82    followed by the actual vectorization transformation for the loops that
  83    had successfully passed the analysis phase.
  84         Throughout this pass we make a distinction between two types of
  85    data: scalars (which are represented by SSA_NAMES), and memory references
  86    ("data-refs").  These two types of data require different handling both
  87    during analysis and transformation. The types of data-refs that the
  88    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  89    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  90    accesses are required to have a simple (consecutive) access pattern.
  91
  92    Analysis phase:
  93    ===============
  94         The driver for the analysis phase is vect_analyze_loop().
  95    It applies a set of analyses, some of which rely on the scalar evolution
  96    analyzer (scev) developed by Sebastian Pop.
  97
  98         During the analysis phase the vectorizer records some information
  99    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 100    loop, as well as general information about the loop as a whole, which is
 101    recorded in a "loop_vec_info" struct attached to each loop.
 102
 103    Transformation phase:
 104    =====================
 105         The loop transformation phase scans all the stmts in the loop, and
 106    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 107    the loop that needs to be vectorized.  It inserts the vector code sequence
 108    just before the scalar stmt S, and records a pointer to the vector code
 109    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 110    attached to S).  This pointer will be used for the vectorization of following
 111    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 112    otherwise, we rely on dead code elimination for removing it.
 113
 114         For example, say stmt S1 was vectorized into stmt VS1:
 115
 116    VS1: vb = px[i];
 117    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 118    S2:  a = b;
 119
 120    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 121    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 122    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 123    resulting sequence would be:
 124
 125    VS1: vb = px[i];
 126    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 127    VS2: va = vb;
 128    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 129
 130         Operands that are not SSA_NAMEs, are data-refs that appear in
 131    load/store operations (like 'x[i]' in S1), and are handled differently.
 132
 133    Target modeling:
 134    =================
 135         Currently the only target specific information that is used is the
 136    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 137    Targets that can support different sizes of vectors, for now will need
 138    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 139    flexibility will be added in the future.
 140
 141         Since we only vectorize operations whose vector form can be
 142    expressed using existing tree codes, to verify that an operation is
 143    supported, the vectorizer checks the relevant optab at the relevant
 144    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 145    the value found is CODE_FOR_nothing, then there's no target support, and
 146    we can't vectorize the stmt.
 147
 148    For additional information on this project see:
 149    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 150 */
 151
 152 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 153
 154 /* Function vect_determine_vectorization_factor
 155
 156    Determine the vectorization factor (VF).  VF is the number of data elements
 157    that are operated upon in parallel in a single iteration of the vectorized
 158    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 159    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 160    elements can fit in a single vector register.
 161
 162    VF is set to the largest number of elements found among the vector types
 163    of the relevant or live PHIs and stmts of the loop and is recorded in
 164    LOOP_VINFO_VECT_FACTOR.  Return false if some stmt cannot be handled or
 165    given suitable vector types, or if the resulting VF is <= 1.
 166
 167    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 168    original loop:
 169         for (i=0; i<N; i++){
 170           a[i] = b[i] + c[i];
 171         }
 172
 173    vectorized loop:
 174         for (i=0; i<N; i+=VF){
 175           a[i:VF] = b[i:VF] + c[i:VF];
 176         }
 177 */
 178
 179 static bool
 180 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 181 {
 182   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 183   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 184   unsigned nbbs = loop->num_nodes;
 185   unsigned int vectorization_factor = 0;
 186   tree scalar_type = NULL_TREE;
 187   gphi *phi;
 188   tree vectype;
 189   unsigned int nunits;
 190   stmt_vec_info stmt_info;
 191   unsigned i;
 192   HOST_WIDE_INT dummy;
 193   gimple *stmt, *pattern_stmt = NULL;
 194   gimple_seq pattern_def_seq = NULL;
 195   gimple_stmt_iterator pattern_def_si = gsi_none ();
 196   bool analyze_pattern_stmt = false;
 197   bool bool_result;
       /* Boolean-producing assignments queued during the scan below; their
          vectypes are chosen in the last loop of this function.  */
 198   auto_vec<stmt_vec_info> mask_producers;
 199
 200   if (dump_enabled_p ())
 201     dump_printf_loc (MSG_NOTE, vect_location,
 202                      "=== vect_determine_vectorization_factor ===\n");
 203
 204   for (i = 0; i < nbbs; i++)
 205     {
 206       basic_block bb = bbs[i];
 207
           /* Scan the PHIs of BB: each relevant or live PHI gets a vectype for
              the type of its result, and the element count of that vectype
              may raise the candidate VF.  */
 208       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 209            gsi_next (&si))
 210         {
 211           phi = si.phi ();
 212           stmt_info = vinfo_for_stmt (phi);
 213           if (dump_enabled_p ())
 214             {
 215               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 216               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 217             }
 218
 219           gcc_assert (stmt_info);
 220
 221           if (STMT_VINFO_RELEVANT_P (stmt_info)
 222               || STMT_VINFO_LIVE_P (stmt_info))
 223             {
 224               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 225               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 226
 227               if (dump_enabled_p ())
 228                 {
 229                   dump_printf_loc (MSG_NOTE, vect_location,
 230                                    "get vectype for scalar type:  ");
 231                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 232                   dump_printf (MSG_NOTE, "\n");
 233                 }
 234
 235               vectype = get_vectype_for_scalar_type (scalar_type);
 236               if (!vectype)
 237                 {
 238                   if (dump_enabled_p ())
 239                     {
 240                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 241                                        "not vectorized: unsupported "
 242                                        "data-type ");
 243                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 244                                          scalar_type);
 245                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 246                     }
 247                   return false;
 248                 }
 249               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 250
 251               if (dump_enabled_p ())
 252                 {
 253                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 254                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 255                   dump_printf (MSG_NOTE, "\n");
 256                 }
 257
 258               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 259               if (dump_enabled_p ())
 260                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 261                                  nunits);
 262
 263               if (!vectorization_factor
 264                   || (nunits > vectorization_factor))
 265                 vectorization_factor = nunits;
 266             }
 267         }
 268
           /* Scan the stmts of BB.  SI is advanced by hand: it stays put
              while ANALYZE_PATTERN_STMT is set or PATTERN_DEF_SI has not
              reached its end, so that a pattern stmt and the relevant or
              live stmts of its pattern def sequence are examined too.  */
 269       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 270            !gsi_end_p (si) || analyze_pattern_stmt;)
 271         {
 272           tree vf_vectype;
 273
 274           if (analyze_pattern_stmt)
 275             stmt = pattern_stmt;
 276           else
 277             stmt = gsi_stmt (si);
 278
 279           stmt_info = vinfo_for_stmt (stmt);
 280
 281           if (dump_enabled_p ())
 282             {
 283               dump_printf_loc (MSG_NOTE, vect_location,
 284                                "==> examining statement: ");
 285               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 286             }
 287
 288           gcc_assert (stmt_info);
 289
 290           /* Skip stmts which do not need to be vectorized.  If such a stmt
                  has a relevant or live pattern stmt, examine that one instead.
                  If the stmt is needed and also has such a pattern stmt, examine
                  the stmt now and its pattern stmt on the next iteration.  */
 291           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 292                && !STMT_VINFO_LIVE_P (stmt_info))
 293               || gimple_clobber_p (stmt))
 294             {
 295               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 296                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 297                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 298                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 299                 {
 300                   stmt = pattern_stmt;
 301                   stmt_info = vinfo_for_stmt (pattern_stmt);
 302                   if (dump_enabled_p ())
 303                     {
 304                       dump_printf_loc (MSG_NOTE, vect_location,
 305                                        "==> examining pattern statement: ");
 306                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 307                     }
 308                 }
 309               else
 310                 {
 311                   if (dump_enabled_p ())
 312                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 313                   gsi_next (&si);
 314                   continue;
 315                 }
 316             }
 317           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 318                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 319                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 320                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 321             analyze_pattern_stmt = true;
 322
 323           /* If a pattern statement has def stmts, analyze them too.  */
 324           if (is_pattern_stmt_p (stmt_info))
 325             {
 326               if (pattern_def_seq == NULL)
 327                 {
 328                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 329                   pattern_def_si = gsi_start (pattern_def_seq);
 330                 }
 331               else if (!gsi_end_p (pattern_def_si))
 332                 gsi_next (&pattern_def_si);
 333               if (pattern_def_seq != NULL)
 334                 {
 335                   gimple *pattern_def_stmt = NULL;
 336                   stmt_vec_info pattern_def_stmt_info = NULL;
 337
 338                   while (!gsi_end_p (pattern_def_si))
 339                     {
 340                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 341                       pattern_def_stmt_info
 342                         = vinfo_for_stmt (pattern_def_stmt);
 343                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 344                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 345                         break;
 346                       gsi_next (&pattern_def_si);
 347                     }
 348
 349                   if (!gsi_end_p (pattern_def_si))
 350                     {
 351                       if (dump_enabled_p ())
 352                         {
 353                           dump_printf_loc (MSG_NOTE, vect_location,
 354                                            "==> examining pattern def stmt: ");
 355                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 356                                             pattern_def_stmt, 0);
 357                         }
 358
 359                       stmt = pattern_def_stmt;
 360                       stmt_info = pattern_def_stmt_info;
 361                     }
 362                   else
 363                     {
 364                       pattern_def_si = gsi_none ();
 365                       analyze_pattern_stmt = false;
 366                     }
 367                 }
 368               else
 369                 analyze_pattern_stmt = false;
 370             }
 371
 372           if (gimple_get_lhs (stmt) == NULL_TREE
 373               /* MASK_STORE has no lhs, but is ok.  */
 374               && (!is_gimple_call (stmt)
 375                   || !gimple_call_internal_p (stmt)
 376                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 377             {
 378               if (is_gimple_call (stmt))
 379                 {
 380                   /* Ignore calls with no lhs.  These must be calls to
 381                      #pragma omp simd functions, and what vectorization factor
 382                      it really needs can't be determined until
 383                      vectorizable_simd_clone_call.  */
 384                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 385                     {
 386                       pattern_def_seq = NULL;
 387                       gsi_next (&si);
 388                     }
 389                   continue;
 390                 }
 391               if (dump_enabled_p ())
 392                 {
 393                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 394                                    "not vectorized: irregular stmt.");
 395                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 396                                     0);
 397                 }
 398               return false;
 399             }
 400
 401           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 402             {
 403               if (dump_enabled_p ())
 404                 {
 405                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 406                                    "not vectorized: vector stmt in loop:");
 407                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 408                 }
 409               return false;
 410             }
 411
 412           bool_result = false;
 413
 414           if (STMT_VINFO_VECTYPE (stmt_info))
 415             {
 416               /* The only case when a vectype had been already set is for stmts
 417                  that contain a dataref, or for "pattern-stmts" (stmts
 418                  generated by the vectorizer to represent/replace a certain
 419                  idiom).  */
 420               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 421                           || is_pattern_stmt_p (stmt_info)
 422                           || !gsi_end_p (pattern_def_si));
 423               vectype = STMT_VINFO_VECTYPE (stmt_info);
 424             }
 425           else
 426             {
 427               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 428               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
 429                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 430               else
 431                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 432
 433               /* Bool ops don't participate in vectorization factor
 434                  computation.  For comparison use compared types to
 435                  compute a factor.  */
 436               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
 437                   && is_gimple_assign (stmt)
 438                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 439                 {
 440                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 441                       || STMT_VINFO_LIVE_P (stmt_info))
 442                     mask_producers.safe_push (stmt_info);
 443                   bool_result = true;
 444
 445                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 446                       == tcc_comparison
 447                       && !VECT_SCALAR_BOOLEAN_TYPE_P
 448                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 449                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 450                   else
 451                     {
 452                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 453                         {
 454                           pattern_def_seq = NULL;
 455                           gsi_next (&si);
 456                         }
 457                       continue;
 458                     }
 459                 }
 460
 461               if (dump_enabled_p ())
 462                 {
 463                   dump_printf_loc (MSG_NOTE, vect_location,
 464                                    "get vectype for scalar type:  ");
 465                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 466                   dump_printf (MSG_NOTE, "\n");
 467                 }
 468               vectype = get_vectype_for_scalar_type (scalar_type);
 469               if (!vectype)
 470                 {
 471                   if (dump_enabled_p ())
 472                     {
 473                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 474                                        "not vectorized: unsupported "
 475                                        "data-type ");
 476                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 477                                          scalar_type);
 478                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 479                     }
 480                   return false;
 481                 }
 482
                   /* A boolean result leaves the stmt's vectype unset here;
                      VECTYPE was derived from the compared operands.  */
 483               if (!bool_result)
 484                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 485
 486               if (dump_enabled_p ())
 487                 {
 488                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 489                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 490                   dump_printf (MSG_NOTE, "\n");
 491                 }
 492             }
 493
 494           /* Don't try to compute VF from scalar types if the stmt
 495              produces a boolean vector.  Use result vectype instead.  */
 496           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 497             vf_vectype = vectype;
 498           else
 499             {
 500               /* The vectorization factor is according to the smallest
 501                  scalar type (or the largest vector size, but we only
 502                  support one vector size per loop).  */
 503               if (!bool_result)
 504                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 505                                                              &dummy);
 506               if (dump_enabled_p ())
 507                 {
 508                   dump_printf_loc (MSG_NOTE, vect_location,
 509                                    "get vectype for scalar type:  ");
 510                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 511                   dump_printf (MSG_NOTE, "\n");
 512                 }
 513               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 514             }
 515           if (!vf_vectype)
 516             {
 517               if (dump_enabled_p ())
 518                 {
 519                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 520                                    "not vectorized: unsupported data-type ");
 521                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 522                                      scalar_type);
 523                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 524                 }
 525               return false;
 526             }
 527
 528           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 529                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 530             {
 531               if (dump_enabled_p ())
 532                 {
 533                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 534                                    "not vectorized: different sized vector "
 535                                    "types in statement, ");
 536                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 537                                      vectype);
 538                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 539                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 540                                      vf_vectype);
 541                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 542                 }
 543               return false;
 544             }
 545
 546           if (dump_enabled_p ())
 547             {
 548               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 549               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 550               dump_printf (MSG_NOTE, "\n");
 551             }
 552
 553           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 554           if (dump_enabled_p ())
 555             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
               /* Keep the largest element count seen so far as the VF.  */
 556           if (!vectorization_factor
 557               || (nunits > vectorization_factor))
 558             vectorization_factor = nunits;
 559
 560           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 561             {
 562               pattern_def_seq = NULL;
 563               gsi_next (&si);
 564             }
 565         }
 566     }
 567
 568   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 569   if (dump_enabled_p ())
 570     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 571                      vectorization_factor);
 572   if (vectorization_factor <= 1)
 573     {
 574       if (dump_enabled_p ())
 575         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 576                          "not vectorized: unsupported data-type\n");
 577       return false;
 578     }
 579   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 580
       /* Choose vectypes for the queued boolean-producing stmts.  A comparison
          of non-boolean operands gets the mask type for the compared scalar
          type; any other stmt takes the vectype of its SSA operands, which
          must agree in element count and in being boolean vectors or not.  */
 581   for (i = 0; i < mask_producers.length (); i++)
 582     {
 583       tree mask_type = NULL;
 584
 585       stmt = STMT_VINFO_STMT (mask_producers[i]);
 586
 587       if (is_gimple_assign (stmt)
 588           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 589           && !VECT_SCALAR_BOOLEAN_TYPE_P
 590                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
 591         {
 592           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 593           mask_type = get_mask_type_for_scalar_type (scalar_type);
 594
 595           if (!mask_type)
 596             {
 597               if (dump_enabled_p ())
 598                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 599                                  "not vectorized: unsupported mask\n");
 600               return false;
 601             }
 602         }
 603       else
 604         {
 605           tree rhs;
 606           ssa_op_iter iter;
 607           gimple *def_stmt;
 608           enum vect_def_type dt;
 609
 610           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 611             {
 612               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 613                                        &def_stmt, &dt, &vectype))
 614                 {
 615                   if (dump_enabled_p ())
 616                     {
 617                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 618                                        "not vectorized: can't compute mask type "
 619                                        "for statement, ");
 620                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 621                                         0);
 622                     }
 623                   return false;
 624                 }
 625
 626               /* No vectype probably means external definition.
 627                  Allow it in case there is another operand which
 628                  allows to determine mask type.  */
 629               if (!vectype)
 630                 continue;
 631
 632               if (!mask_type)
 633                 mask_type = vectype;
 634               else if (TYPE_VECTOR_SUBPARTS (mask_type)
 635                        != TYPE_VECTOR_SUBPARTS (vectype))
 636                 {
 637                   if (dump_enabled_p ())
 638                     {
 639                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 640                                        "not vectorized: different sized masks "
 641                                        "types in statement, ");
 642                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 643                                          mask_type);
 644                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 645                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 646                                          vectype);
 647                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 648                     }
 649                   return false;
 650                 }
 651               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 652                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 653                 {
 654                   if (dump_enabled_p ())
 655                     {
 656                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 657                                        "not vectorized: mixed mask and "
 658                                        "nonmask vector types in statement, ");
 659                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 660                                          mask_type);
 661                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 662                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 663                                          vectype);
 664                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 665                     }
 666                   return false;
 667                 }
 668             }
 669
 670           /* We may compare boolean value loaded as vector of integers.
 671              Fix mask_type in such case.  */
 672           if (mask_type
 673               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 674               && gimple_code (stmt) == GIMPLE_ASSIGN
 675               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 676             mask_type = build_same_sized_truth_vector_type (mask_type);
 677         }
 678
 679       /* No mask_type should mean loop invariant predicate.
 680          This is probably a subject for optimization in
 681          if-conversion.  */
 682       if (!mask_type)
 683         {
 684           if (dump_enabled_p ())
 685             {
 686               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 687                                "not vectorized: can't compute mask type "
 688                                "for statement, ");
 689               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 690                                 0);
 691             }
 692           return false;
 693         }
 694
 695       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 696     }
 697
 698   return true;
 699 }
 700
 701
 702 /* Function vect_is_simple_iv_evolution.
 703
 704    Return true if ACCESS_FN evolves "simply" in the loop numbered LOOP_NB.
 705    *INIT and *STEP receive the unshared initial condition and the step as
 706    soon as the evolution part exists and is not itself a chrec, even if
 707    the step is rejected afterwards.  FORNOW an accepted step is an
 708    INTEGER_CST, a REAL_CST under flag_associative_math, or an SSA_NAME not
 709    defined in a block of the loop whose type is integral (or scalar float
 710    under flag_associative_math).  */
 711
 712 static bool
 713 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 714                              tree * step)
 715 {
 716   tree evolution = evolution_part_in_loop_num (access_fn, loop_nb);
 717
 718   /* A missing evolution, or one that is itself a chrec (a polynomial of
 719      degree >= 2), is not "simple".  */
 720   if (evolution == NULL_TREE || tree_is_chrec (evolution))
 721     return false;
 722
 723   tree start
 724     = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 725   if (dump_enabled_p ())
 726     {
 727       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 728       dump_generic_expr (MSG_NOTE, TDF_SLIM, evolution);
 729       dump_printf (MSG_NOTE, ",  init: ");
 730       dump_generic_expr (MSG_NOTE, TDF_SLIM, start);
 731       dump_printf (MSG_NOTE, "\n");
 732     }
 733
 734   *init = start;
 735   *step = evolution;
 736
 737   const enum tree_code step_code = TREE_CODE (evolution);
 738   if (step_code == INTEGER_CST)
 739     return true;
 740
 741   if (step_code == SSA_NAME)
 742     {
 743       basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (evolution));
 744       bool in_loop
 745         = def_bb && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), def_bb);
 746       tree step_type = TREE_TYPE (evolution);
 747       if (!in_loop
 748           && (INTEGRAL_TYPE_P (step_type)
 749               || (SCALAR_FLOAT_TYPE_P (step_type) && flag_associative_math)))
 750         return true;
 751     }
 752   else if (step_code == REAL_CST && flag_associative_math)
 753     return true;
 754
 755   if (dump_enabled_p ())
 756     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location, "step unknown.\n");
 757   return false;
 758 }
 759
 760 /* Function vect_analyze_scalar_cycles_1.
 761
 762    Classify the cross-iteration def-use cycles of scalar variables in LOOP
 763    by examining the PHIs of its header block: inductions first, then
 764    reductions, double reductions and nested cycles.  LOOP_VINFO represents
 765    the loop being vectorized (LOOP itself, or an outer-loop enclosing LOOP).  */
 766
 767 static void
 768 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 769 {
 770   basic_block bb = loop->header;
 771   tree init, step;
 772   auto_vec<gimple *, 64> worklist;  /* PHIs left for the second pass.  */
 773   gphi_iterator gsi;
 774   bool double_reduc;
 775
 776   if (dump_enabled_p ())
 777     dump_printf_loc (MSG_NOTE, vect_location,
 778                      "=== vect_analyze_scalar_cycles ===\n");
 779
 780   /* First - identify all inductions.  Reduction detection assumes that all the
 781      inductions have been identified, therefore, this order must not be
 782      changed.  */
 783   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 784     {
 785       gphi *phi = gsi.phi ();
 786       tree access_fn = NULL;
 787       tree def = PHI_RESULT (phi);
 788       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 789
 790       if (dump_enabled_p ())
 791         {
 792           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 793           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 794         }
 795
 796       /* Skip virtual phi's.  The data dependences that are associated with
 797          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 798       if (virtual_operand_p (def))
 799         continue;
 800       /* Until recognized below, the cycle counts as unknown.  */
 801       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 802
 803       /* Analyze the evolution function and record its base and step part.  */
 804       access_fn = analyze_scalar_evolution (loop, def);
 805       if (access_fn)
 806         {
 807           STRIP_NOPS (access_fn);
 808           if (dump_enabled_p ())
 809             {
 810               dump_printf_loc (MSG_NOTE, vect_location,
 811                                "Access function of PHI: ");
 812               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 813               dump_printf (MSG_NOTE, "\n");
 814             }
 815           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 816             = initial_condition_in_loop_num (access_fn, loop->num);
 817           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 818             = evolution_part_in_loop_num (access_fn, loop->num);
 819         }
 820       /* Anything but a usable induction is left for the second pass.  */
 821       if (!access_fn
 822           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 823           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 824               && TREE_CODE (step) != INTEGER_CST))
 825         {
 826           worklist.safe_push (phi);
 827           continue;
 828         }
 829
 830       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 831                   != NULL_TREE);
 832       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 833
 834       if (dump_enabled_p ())
 835         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 836       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 837     }
 838
 839
 840   /* Second - identify all reductions and nested cycles.  */
 841   while (worklist.length () > 0)
 842     {
 843       gimple *phi = worklist.pop ();
 844       tree def = PHI_RESULT (phi);
 845       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 846       gimple *reduc_stmt;
 847
 848       if (dump_enabled_p ())
 849         {
 850           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 851           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 852         }
 853       /* Only PHIs that were left unclassified above are expected here.  */
 854       gcc_assert (!virtual_operand_p (def)
 855                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 856
 857       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 858                                                 &double_reduc, false);
 859       if (reduc_stmt)
 860         {
 861           if (double_reduc)
 862             {
 863               if (dump_enabled_p ())
 864                 dump_printf_loc (MSG_NOTE, vect_location,
 865                                  "Detected double reduction.\n");
 866               /* The PHI and the reduction stmt get the same def type.  */
 867               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 868               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 869                                                     vect_double_reduction_def;
 870             }
 871           else
 872             {
 873               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 874                 {
 875                   if (dump_enabled_p ())
 876                     dump_printf_loc (MSG_NOTE, vect_location,
 877                                      "Detected vectorizable nested cycle.\n");
 878
 879                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 880                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 881                                                              vect_nested_cycle;
 882                 }
 883               else
 884                 {
 885                   if (dump_enabled_p ())
 886                     dump_printf_loc (MSG_NOTE, vect_location,
 887                                      "Detected reduction.\n");
 888
 889                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 890                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 891                                                            vect_reduction_def;
 892                   /* Store the reduction cycles for possible vectorization in
 893                      loop-aware SLP if it was not detected as reduction
 894                      chain.  */
 895                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 896                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 897                 }
 898             }
 899         }
 900       else
 901         if (dump_enabled_p ())
 902           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 903                            "Unknown def-use cycle pattern.\n");
 904     }
 905 }
 906
 907
 908 /* Function vect_analyze_scalar_cycles.
 909
 910    Examine the cross iteration def-use cycles of scalar variables by
 911    analyzing the loop-header PHIs, classifying each cycle (induction,
 912    reduction, ... or unknown).  This is done for the loop represented by
 913    LOOP_VINFO first and then for its inner-loop, if there is one.
 914
 915    Examples for scalar cycles:
 916
 917    Example1: reduction:
 918
 919               loop1:
 920               for (i=0; i<N; i++)
 921                  sum += a[i];
 922
 923    Example2: induction:
 924
 925               loop2:
 926               for (i=0; i<N; i++)
 927                  a[i] = i;  */
 928
 929 static void
 930 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 931 {
 932   struct loop *outer = LOOP_VINFO_LOOP (loop_vinfo);
 933
 934   vect_analyze_scalar_cycles_1 (loop_vinfo, outer);
 935
 936   struct loop *nested = outer->inner;
 937   if (!nested)
 938     return;
 939
 940   /* The inner loop of a vectorized outer loop still runs sequentially, so
 941      its reductions differ from those of the vectorized nest:
 942      1. They keep the evaluation order of the original scalar loop, hence
 943         the order of computation must not be changed when vectorizing them.
 944      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 945         current checks are too strict.  */
 946   vect_analyze_scalar_cycles_1 (loop_vinfo, nested);
 947 }
 948
 949 /* Re-create the reduction chain headed by STMT on the related pattern
 950    stmts; the last pattern stmt is marked as vect_reduction_def.  */
 951 static void
 952 vect_fixup_reduc_chain (gimple *stmt)
 953 {
 954   gimple *first_pat = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 955   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (first_pat))
 956               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 957   GROUP_SIZE (vinfo_for_stmt (first_pat)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 958   gimple *pat = first_pat;
 959   for (;;)
 960     {
 961       stmt_vec_info pat_info = vinfo_for_stmt (pat);
 962       GROUP_FIRST_ELEMENT (pat_info) = first_pat;
 963       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 964       if (!stmt)
 965         break;
 966       pat = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 967       GROUP_NEXT_ELEMENT (pat_info) = pat;
 968     }
 969   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (pat)) = vect_reduction_def;
 970 }
 971
 972 /* For every recorded reduction chain whose stmts are all part of a pattern,
 973    rebuild the chain on the pattern stmts and record the pattern stmt of the
 974    chain head in LOOP_VINFO_REDUCTION_CHAINS instead.  */
 975
 976 static void
 977 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 978 {
 979   unsigned ix;
 980   gimple *head;
 981
 982   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), ix, head)
 983     {
 984       /* Walk the chain and stop at the first member, if any, that is not
 985          part of a pattern.  */
 986       gimple *member = head;
 987       while (member && STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (member)))
 988         member = GROUP_NEXT_ELEMENT (vinfo_for_stmt (member));
 989
 990       /* If not all stmts in the chain are patterns, leave the chain alone
 991          so that it is handled without patterns.  */
 992       if (member)
 993         continue;
 994
 995       vect_fixup_reduc_chain (head);
 996       LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[ix]
 997         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (head));
 998     }
 999 }
1000
1001 /* Function vect_get_loop_niters.
1002
1003    Analyze the single exit of LOOP.  Place the number of iterations of the
1004    loop in NUMBER_OF_ITERATIONS, the number of latch iterations in
1005    NUMBER_OF_ITERATIONSM1 and the condition under which the niter
1006    information holds in ASSUMPTIONS.  If the analysis fails the two counts
1007    are left as chrec_dont_know and ASSUMPTIONS as boolean_true_node.
1008
1009    Return the loop exit condition.  */
1010
1011 static gcond *
1012 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1013                       tree *number_of_iterations, tree *number_of_iterationsm1)
1014 {
1015   gcond *exit_cond = get_loop_exit_condition (loop);
1016   edge exit_edge = single_exit (loop);
1017
1018   /* Start from "unknown" so that every early return below is complete.  */
1019   *assumptions = boolean_true_node;
1020   *number_of_iterationsm1 = chrec_dont_know;
1021   *number_of_iterations = chrec_dont_know;
1022   if (dump_enabled_p ())
1023     dump_printf_loc (MSG_NOTE, vect_location,
1024                      "=== get_loop_niters ===\n");
1025
1026   struct tree_niter_desc desc;
1027   if (!exit_edge
1028       || !number_of_iterations_exit_assumptions (loop, exit_edge, &desc, NULL)
1029       || chrec_contains_undetermined (desc.niter))
1030     return exit_cond;
1031
1032   tree valid_if = desc.assumptions;
1033   tree zero_iter_p = desc.may_be_zero;
1034   tree latch_niter = desc.niter;
1035
1036   /* A MAY_BE_ZERO that is absent or constant zero needs no handling.  */
1037   if (zero_iter_p && !integer_zerop (zero_iter_p))
1038     {
1039       if (!COMPARISON_CLASS_P (zero_iter_p))
1040         {
1041           /* Not a comparison.  For a nonzero constant report zero latch
1042              iterations, i.e. a single iteration; for anything else give
1043              up and leave the counts unknown.  */
1044           if (integer_nonzerop (zero_iter_p))
1045             {
1046               tree niter_type = TREE_TYPE (latch_niter);
1047               *number_of_iterationsm1 = build_int_cst (niter_type, 0);
1048               *number_of_iterations = build_int_cst (niter_type, 1);
1049             }
1050           return exit_cond;
1051         }
1052
1053       /* Try to combine may_be_zero with assumptions, this can simplify
1054          computation of niter expression; else fold it into niter.  */
1055       if (valid_if && !integer_nonzerop (valid_if))
1056         {
1057           tree not_zero_iter = fold_build1 (TRUTH_NOT_EXPR, boolean_type_node,
1058                                             zero_iter_p);
1059           valid_if = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1060                                   valid_if, not_zero_iter);
1061         }
1062       else
1063         latch_niter
1064           = fold_build3 (COND_EXPR, TREE_TYPE (latch_niter), zero_iter_p,
1065                          build_int_cst (TREE_TYPE (latch_niter), 0),
1066                          latch_niter);
1067     }
1068
1069   *assumptions = valid_if;
1070   *number_of_iterationsm1 = latch_niter;
1071
1072   /* We want the number of loop header executions which is the number
1073      of latch executions plus one.
1074      ???  For UINT_MAX latch executions this number overflows to zero
1075      for loops like do { n++; } while (n != 0);  */
1076   tree header_niter = latch_niter;
1077   if (latch_niter && !chrec_contains_undetermined (latch_niter))
1078     header_niter
1079       = fold_build2 (PLUS_EXPR, TREE_TYPE (latch_niter),
1080                      unshare_expr (latch_niter),
1081                      build_int_cst (TREE_TYPE (latch_niter), 1));
1082   *number_of_iterations = header_niter;
1083
1084   return exit_cond;
1085 }
1086
1087 /* Function bb_in_loop_p
1088
1089    Predicate for the dfs order traversal of the loop bbs: return true iff
1090    BB is inside the loop that DATA points to.  */
1091
1092 static bool
1093 bb_in_loop_p (const_basic_block bb, const void *data)
1094 {
1095   const struct loop *const enclosing = (const struct loop *) data;
1096
1097   return flow_bb_inside_loop_p (enclosing, bb);
1098 }
1099
1100
1101 /* Function new_loop_vec_info.
1102
1103    Create and initialize a new loop_vec_info struct for LOOP, as well as
1104    stmt_vec_info structs for all the PHIs and stmts in LOOP (whose uids are
1105    reset to 0 first).  The blocks of LOOP are recorded in the order in which
1106    dfs_enumerate_from visits them from the header.  */
1107
1108 static loop_vec_info
1109 new_loop_vec_info (struct loop *loop)
1110 {
1111   loop_vec_info vinfo
1112     = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1113   vinfo->kind = vec_info::loop;
1114   LOOP_VINFO_LOOP (vinfo) = loop;
1115
1116   /* Create/Update stmt_info for all stmts in the loop.  */
1117   basic_block *body = get_loop_body (loop);
1118   for (unsigned int b = 0; b < loop->num_nodes; b++)
1119     {
1120       gimple_stmt_iterator it;
1121       for (it = gsi_start_phis (body[b]); !gsi_end_p (it); gsi_next (&it))
1122         {
1123           gimple *phi = gsi_stmt (it);
1124           gimple_set_uid (phi, 0);
1125           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, vinfo));
1126         }
1127
1128       for (it = gsi_start_bb (body[b]); !gsi_end_p (it); gsi_next (&it))
1129         {
1130           gimple *stmt = gsi_stmt (it);
1131           gimple_set_uid (stmt, 0);
1132           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, vinfo));
1133         }
1134     }
1135   free (body);
1136
1137   /* CHECKME: We want to visit all BBs before their successors (except for
1138      latch blocks, for which this assertion wouldn't hold).  In the simple
1139      case of the loop forms we allow, a dfs order of the BBs would be the same
1140      as reversed postorder traversal, so we are safe.  */
1141   basic_block *dfs_bbs = XCNEWVEC (basic_block, loop->num_nodes);
1142   unsigned int n_found = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1143                                              dfs_bbs, loop->num_nodes, loop);
1144   gcc_assert (n_found == loop->num_nodes);
1145   LOOP_VINFO_BBS (vinfo) = dfs_bbs;
1146
1147   /* Nothing is known about the iteration count yet.  */
1148   LOOP_VINFO_NITERSM1 (vinfo) = NULL;
1149   LOOP_VINFO_NITERS (vinfo) = NULL;
1150   LOOP_VINFO_NITERS_UNCHANGED (vinfo) = NULL;
1151   LOOP_VINFO_NITERS_ASSUMPTIONS (vinfo) = NULL;
1152
1153   /* All containers start out empty.  */
1154   LOOP_VINFO_LOOP_NEST (vinfo) = vNULL;
1155   LOOP_VINFO_DATAREFS (vinfo) = vNULL;
1156   LOOP_VINFO_DDRS (vinfo) = vNULL;
1157   LOOP_VINFO_MAY_MISALIGN_STMTS (vinfo) = vNULL;
1158   LOOP_VINFO_MAY_ALIAS_DDRS (vinfo) = vNULL;
1159   LOOP_VINFO_GROUPED_STORES (vinfo) = vNULL;
1160   LOOP_VINFO_REDUCTIONS (vinfo) = vNULL;
1161   LOOP_VINFO_REDUCTION_CHAINS (vinfo) = vNULL;
1162   LOOP_VINFO_SLP_INSTANCES (vinfo) = vNULL;
1163
1164   /* Remaining scalar fields.  */
1165   LOOP_VINFO_COST_MODEL_THRESHOLD (vinfo) = 0;
1166   LOOP_VINFO_VECTORIZABLE_P (vinfo) = 0;
1167   LOOP_VINFO_PEELING_FOR_ALIGNMENT (vinfo) = 0;
1168   LOOP_VINFO_VECT_FACTOR (vinfo) = 0;
1169   LOOP_VINFO_UNALIGNED_DR (vinfo) = NULL;
1170   LOOP_VINFO_SLP_UNROLLING_FACTOR (vinfo) = 1;
1171   LOOP_VINFO_TARGET_COST_DATA (vinfo) = init_cost (loop);
1172   LOOP_VINFO_PEELING_FOR_GAPS (vinfo) = false;
1173   LOOP_VINFO_PEELING_FOR_NITER (vinfo) = false;
1174   LOOP_VINFO_OPERANDS_SWAPPED (vinfo) = false;
1175   LOOP_VINFO_ORIG_LOOP_INFO (vinfo) = NULL;
1176
1177   return vinfo;
1178 }
1179
1180
1181 /* Function destroy_loop_vec_info.
1182
1183    Free LOOP_VINFO struct (a NULL LOOP_VINFO is ignored) and reset the loop's
1184    aux field.  If CLEAN_STMTS, first free the stmt_vec_info of every stmt.  */
1185
1186 void
1187 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1188 {
1189   struct loop *loop;
1190   basic_block *bbs;
1191   int nbbs;
1192   gimple_stmt_iterator si;
1193   int j;
1194   vec<slp_instance> slp_instances;
1195   slp_instance instance;
1196   bool swapped;
1197
1198   if (!loop_vinfo)
1199     return;
1200
1201   loop = LOOP_VINFO_LOOP (loop_vinfo);
1202
1203   bbs = LOOP_VINFO_BBS (loop_vinfo);
1204   nbbs = clean_stmts ? loop->num_nodes : 0;
1205   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1206   /* With !CLEAN_STMTS, NBBS is 0 and the per-stmt cleanup is skipped.  */
1207   for (j = 0; j < nbbs; j++)
1208     {
1209       basic_block bb = bbs[j];
1210       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1211         free_stmt_vec_info (gsi_stmt (si));
1212
1213       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1214         {
1215           gimple *stmt = gsi_stmt (si);
1216
1217           /* We may have broken canonical form by moving a constant
1218              into RHS1 of a commutative op.  Fix such occurrences.  */
1219           if (swapped && is_gimple_assign (stmt))
1220             {
1221               enum tree_code code = gimple_assign_rhs_code (stmt);
1222
1223               if ((code == PLUS_EXPR
1224                    || code == POINTER_PLUS_EXPR
1225                    || code == MULT_EXPR)
1226                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1227                 swap_ssa_operands (stmt,
1228                                    gimple_assign_rhs1_ptr (stmt),
1229                                    gimple_assign_rhs2_ptr (stmt));
1230               else if (code == COND_EXPR
1231                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1232                 {
1233                   tree cond_expr = gimple_assign_rhs1 (stmt);
1234                   enum tree_code cond_code = TREE_CODE (cond_expr);
1235                   /* If possible, invert the comparison and swap the arms.  */
1236                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1237                     {
1238                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1239                                                                   0));
1240                       cond_code = invert_tree_comparison (cond_code,
1241                                                           honor_nans);
1242                       if (cond_code != ERROR_MARK)
1243                         {
1244                           TREE_SET_CODE (cond_expr, cond_code);
1245                           swap_ssa_operands (stmt,
1246                                              gimple_assign_rhs2_ptr (stmt),
1247                                              gimple_assign_rhs3_ptr (stmt));
1248                         }
1249                     }
1250                 }
1251             }
1252
1253           /* Free stmt_vec_info.  */
1254           free_stmt_vec_info (stmt);
1255           gsi_next (&si);
1256         }
1257     }
1258   /* Release everything LOOP_VINFO owns, then the struct itself.  */
1259   free (LOOP_VINFO_BBS (loop_vinfo));
1260   vect_destroy_datarefs (loop_vinfo);
1261   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1262   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1263   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1264   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
1265   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1266   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1267   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1268     vect_free_slp_instance (instance);
1269
1270   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1271   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1272   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1273   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1274
1275   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1276   loop_vinfo->scalar_cost_vec.release ();
1277
1278   free (loop_vinfo);
1279   loop->aux = NULL;
1280 }
1281
1282
1283 /* Calculate the cost of one scalar iteration of the loop and store it in
1284    LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST.  Only assignments and calls are
1285    counted; each one is recorded as a scalar load, a scalar store or a plain
1286    scalar stmt.  */
1287 static void
1288 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1289 {
1290   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1291   basic_block *blocks = LOOP_VINFO_BBS (loop_vinfo);
1292   int n_blocks = loop->num_nodes;
1293   int total = 0;
1294
1295   /* Count statements in scalar loop.  Using this as scalar cost for a single
1296      iteration for now.
1297
1298      TODO: Add outer loop support.
1299
1300      TODO: Consider assigning different costs to different scalar
1301      statements.  */
1302
1303   /* FORNOW: stmts in blocks of the inner loop, if there is one, are
1304      weighted by a fixed factor of 50.  FIXME  */
1305   int inner_weight = loop->inner ? 50 : 1;
1306
1307   for (int b = 0; b < n_blocks; b++)
1308     {
1309       basic_block bb = blocks[b];
1310       /* Blocks whose loop_father is the inner loop use INNER_WEIGHT.  */
1311       int weight = (bb->loop_father == loop->inner) ? inner_weight : 1;
1312
1313       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1314            gsi_next (&si))
1315         {
1316           gimple *stmt = gsi_stmt (si);
1317
1318           /* Only assignments and calls contribute to the cost.  */
1319           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1320             continue;
1321
1322           /* Skip stmts that are not vectorized inside the loop: those
1323              that are neither relevant, nor part of a pattern, nor a live
1324              stmt whose def type is a vectorizable cycle def.  */
1325           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1326           if (stmt_info
1327               && !STMT_VINFO_RELEVANT_P (stmt_info)
1328               && !STMT_VINFO_IN_PATTERN_P (stmt_info)
1329               && !(STMT_VINFO_LIVE_P (stmt_info)
1330                    && VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info))))
1331             continue;
1332
1333           /* Stmts with a data reference are costed as loads or stores.
1334              NOTE(review): STMT_INFO is used unconditionally from here on
1335              although the test above tolerates a NULL STMT_INFO - confirm
1336              that every stmt in the loop has a stmt_vec_info.  */
1337           vect_cost_for_stmt kind = scalar_stmt;
1338           if (STMT_VINFO_DATA_REF (stmt_info))
1339             kind = (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info))
1340                     ? scalar_load : scalar_store);
1341
1342           total
1343             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1344                                  weight, kind, stmt_info, 0, vect_prologue);
1345         }
1346     }
1347
1348   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo) = total;
1349 }
1350
1351
1352 /* Function vect_analyze_loop_form_1.
1353
1354    Verify that certain CFG restrictions hold for LOOP: it has a pre-header,
1355    a single entry and exit, a simple enough exit condition and a number of
1356    iterations that can be analyzed (possibly under *ASSUMPTIONS).  On success
1357    return true with the exit condition in *LOOP_COND and the latch / header
1358    execution counts in *NUMBER_OF_ITERATIONSM1 / *NUMBER_OF_ITERATIONS; for
1359    a nested loop *INNER_LOOP_COND receives the inner loop's exit condition.  */
1360
1361 bool
1362 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1363                           tree *assumptions, tree *number_of_iterationsm1,
1364                           tree *number_of_iterations, gcond **inner_loop_cond)
1365 {
1366   if (dump_enabled_p ())
1367     dump_printf_loc (MSG_NOTE, vect_location,
1368                      "=== vect_analyze_loop_form ===\n");
1369
1370   /* Different restrictions apply when we are considering an inner-most loop,
1371      vs. an outer (nested) loop.
1372      (FORNOW. May want to relax some of these restrictions in the future).  */
1373
1374   if (!loop->inner)
1375     {
1376       /* Inner-most loop.  We currently require that the number of BBs is
1377          exactly 2 (the header and latch).  Vectorizable inner-most loops
1378          look like this:
1379
1380                         (pre-header)
1381                            |
1382                           header <--------+
1383                            | |            |
1384                            | +--> latch --+
1385                            |
1386                         (exit-bb)  */
1387
1388       if (loop->num_nodes != 2)
1389         {
1390           if (dump_enabled_p ())
1391             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1392                              "not vectorized: control flow in loop.\n");
1393           return false;
1394         }
1395
1396       if (empty_block_p (loop->header))
1397         {
1398           if (dump_enabled_p ())
1399             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1400                              "not vectorized: empty loop.\n");
1401           return false;
1402         }
1403     }
1404   else
1405     {
1406       struct loop *innerloop = loop->inner;
1407       edge entryedge;
1408
1409       /* Nested loop. We currently require that the loop is doubly-nested,
1410          contains a single inner loop, and the number of BBs is exactly 5.
1411          Vectorizable outer-loops look like this:
1412
1413                         (pre-header)
1414                            |
1415                           header <---+
1416                            |         |
1417                           inner-loop |
1418                            |         |
1419                           tail ------+
1420                            |
1421                         (exit-bb)
1422
1423          The inner-loop has the properties expected of inner-most loops
1424          as described above.  */
1425
1426       if ((loop->inner)->inner || (loop->inner)->next)
1427         {
1428           if (dump_enabled_p ())
1429             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1430                              "not vectorized: multiple nested loops.\n");
1431           return false;
1432         }
1433
1434       if (loop->num_nodes != 5)
1435         {
1436           if (dump_enabled_p ())
1437             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1438                              "not vectorized: control flow in loop.\n");
1439           return false;
1440         }
1441       /* Inner loop: entered from LOOP's header, exits to the latch's pred.  */
1442       entryedge = loop_preheader_edge (innerloop);
1443       if (entryedge->src != loop->header
1444           || !single_exit (innerloop)
1445           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1446         {
1447           if (dump_enabled_p ())
1448             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1449                              "not vectorized: unsupported outerloop form.\n");
1450           return false;
1451         }
1452
1453       /* Analyze the inner-loop.  */
1454       tree inner_niterm1, inner_niter, inner_assumptions;
1455       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1456                                       &inner_assumptions, &inner_niterm1,
1457                                       &inner_niter, NULL)
1458           /* Don't support analyzing niter under assumptions for inner
1459              loop.  */
1460           || !integer_onep (inner_assumptions))
1461         {
1462           if (dump_enabled_p ())
1463             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464                              "not vectorized: Bad inner loop.\n");
1465           return false;
1466         }
1467       /* The inner iteration count must be invariant in the outer loop.  */
1468       if (!expr_invariant_in_loop_p (loop, inner_niter))
1469         {
1470           if (dump_enabled_p ())
1471             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1472                              "not vectorized: inner-loop count not"
1473                              " invariant.\n");
1474           return false;
1475         }
1476
1477       if (dump_enabled_p ())
1478         dump_printf_loc (MSG_NOTE, vect_location,
1479                          "Considering outer-loop vectorization.\n");
1480     }
1481   /* Require a single exit and exactly two edges into the header.  */
1482   if (!single_exit (loop)
1483       || EDGE_COUNT (loop->header->preds) != 2)
1484     {
1485       if (dump_enabled_p ())
1486         {
1487           if (!single_exit (loop))
1488             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1489                              "not vectorized: multiple exits.\n");
1490           else if (EDGE_COUNT (loop->header->preds) != 2)
1491             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1492                              "not vectorized: too many incoming edges.\n");
1493         }
1494       return false;
1495     }
1496
1497   /* We assume that the loop exit condition is at the end of the loop. i.e,
1498      that the loop is represented as a do-while (with a proper if-guard
1499      before the loop if needed), where the loop header contains all the
1500      executable statements, and the latch is empty.  */
1501   if (!empty_block_p (loop->latch)
1502       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1503     {
1504       if (dump_enabled_p ())
1505         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1506                          "not vectorized: latch block not empty.\n");
1507       return false;
1508     }
1509
1510   /* Make sure the exit is not abnormal.  */
1511   edge e = single_exit (loop);
1512   if (e->flags & EDGE_ABNORMAL)
1513     {
1514       if (dump_enabled_p ())
1515         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1516                          "not vectorized: abnormal loop exit edge.\n");
1517       return false;
1518     }
1519
1520   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1521                                      number_of_iterationsm1);
1522   if (!*loop_cond)
1523     {
1524       if (dump_enabled_p ())
1525         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1526                          "not vectorized: complicated exit condition.\n");
1527       return false;
1528     }
1529   /* Give up if the assumptions are false or the count is unknown.  */
1530   if (integer_zerop (*assumptions)
1531       || !*number_of_iterations
1532       || chrec_contains_undetermined (*number_of_iterations))
1533     {
1534       if (dump_enabled_p ())
1535         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1536                          "not vectorized: number of iterations cannot be "
1537                          "computed.\n");
1538       return false;
1539     }
1540
1541   if (integer_zerop (*number_of_iterations))
1542     {
1543       if (dump_enabled_p ())
1544         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1545                          "not vectorized: number of iterations = 0.\n");
1546       return false;
1547     }
1548
1549   return true;
1550 }
1551
1552 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1553
1554 loop_vec_info
1555 vect_analyze_loop_form (struct loop *loop)
1556 {
1557   tree assumptions, number_of_iterations, number_of_iterationsm1;
1558   gcond *loop_cond, *inner_loop_cond = NULL;
1559
1560   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1561                                   &assumptions, &number_of_iterationsm1,
1562                                   &number_of_iterations, &inner_loop_cond))
1563     return NULL;
1564
1565   loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1566   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1567   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1568   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1569   if (!integer_onep (assumptions))
1570     {
1571       /* We consider to vectorize this loop by versioning it under
1572          some assumptions.  In order to do this, we need to clear
1573          existing information computed by scev and niter analyzer.  */
1574       scev_reset_htab ();
1575       free_numbers_of_iterations_estimates (loop);
1576       /* Also set flag for this loop so that following scev and niter
1577          analysis are done under the assumptions.  */
1578       loop_constraint_set (loop, LOOP_C_FINITE);
1579       /* Also record the assumptions for versioning.  */
1580       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1581     }
1582
1583   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1584     {
1585       if (dump_enabled_p ())
1586         {
1587           dump_printf_loc (MSG_NOTE, vect_location,
1588                            "Symbolic number of iterations is ");
1589           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1590           dump_printf (MSG_NOTE, "\n");
1591         }
1592     }
1593
1594   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1595   if (inner_loop_cond)
1596     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1597       = loop_exit_ctrl_vec_info_type;
1598
1599   gcc_assert (!loop->aux);
1600   loop->aux = loop_vinfo;
1601   return loop_vinfo;
1602 }
1603
1604
1605
1606 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1607    statements update the vectorization factor.  */
1608
1609 static void
1610 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1611 {
1612   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1613   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1614   int nbbs = loop->num_nodes;
1615   unsigned int vectorization_factor;
1616   int i;
1617
1618   if (dump_enabled_p ())
1619     dump_printf_loc (MSG_NOTE, vect_location,
1620                      "=== vect_update_vf_for_slp ===\n");
1621
1622   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1623   gcc_assert (vectorization_factor != 0);
1624
1625   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1626      vectorization factor of the loop is the unrolling factor required by
1627      the SLP instances.  If that unrolling factor is 1, we say, that we
1628      perform pure SLP on loop - cross iteration parallelism is not
1629      exploited.  */
1630   bool only_slp_in_loop = true;
1631   for (i = 0; i < nbbs; i++)
1632     {
1633       basic_block bb = bbs[i];
1634       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1635            gsi_next (&si))
1636         {
1637           gimple *stmt = gsi_stmt (si);
1638           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1639           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1640               && STMT_VINFO_RELATED_STMT (stmt_info))
1641             {
1642               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1643               stmt_info = vinfo_for_stmt (stmt);
1644             }
1645           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1646                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1647               && !PURE_SLP_STMT (stmt_info))
1648             /* STMT needs both SLP and loop-based vectorization.  */
1649             only_slp_in_loop = false;
1650         }
1651     }
1652
1653   if (only_slp_in_loop)
1654     {
1655       dump_printf_loc (MSG_NOTE, vect_location,
1656                        "Loop contains only SLP stmts\n");
1657       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1658     }
1659   else
1660     {
1661       dump_printf_loc (MSG_NOTE, vect_location,
1662                        "Loop contains SLP and non-SLP stmts\n");
1663       vectorization_factor
1664         = least_common_multiple (vectorization_factor,
1665                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1666     }
1667
1668   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1669   if (dump_enabled_p ())
1670     dump_printf_loc (MSG_NOTE, vect_location,
1671                      "Updating vectorization factor to %d\n",
1672                      vectorization_factor);
1673 }
1674
1675 /* Function vect_analyze_loop_operations.
1676
1677    Scan the loop stmts and make sure they are all vectorizable.  */
1678
1679 static bool
1680 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1681 {
1682   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1683   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1684   int nbbs = loop->num_nodes;
1685   int i;
1686   stmt_vec_info stmt_info;
1687   bool need_to_vectorize = false;
1688   bool ok;
1689
1690   if (dump_enabled_p ())
1691     dump_printf_loc (MSG_NOTE, vect_location,
1692                      "=== vect_analyze_loop_operations ===\n");
1693
1694   for (i = 0; i < nbbs; i++)
1695     {
1696       basic_block bb = bbs[i];
1697
1698       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1699            gsi_next (&si))
1700         {
1701           gphi *phi = si.phi ();
1702           ok = true;
1703
1704           stmt_info = vinfo_for_stmt (phi);
1705           if (dump_enabled_p ())
1706             {
1707               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1708               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1709             }
1710           if (virtual_operand_p (gimple_phi_result (phi)))
1711             continue;
1712
1713           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1714              (i.e., a phi in the tail of the outer-loop).  */
1715           if (! is_loop_header_bb_p (bb))
1716             {
1717               /* FORNOW: we currently don't support the case that these phis
1718                  are not used in the outerloop (unless it is double reduction,
1719                  i.e., this phi is vect_reduction_def), cause this case
1720                  requires to actually do something here.  */
1721               if (STMT_VINFO_LIVE_P (stmt_info)
1722                   && STMT_VINFO_DEF_TYPE (stmt_info)
1723                      != vect_double_reduction_def)
1724                 {
1725                   if (dump_enabled_p ())
1726                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1727                                      "Unsupported loop-closed phi in "
1728                                      "outer-loop.\n");
1729                   return false;
1730                 }
1731
1732               /* If PHI is used in the outer loop, we check that its operand
1733                  is defined in the inner loop.  */
1734               if (STMT_VINFO_RELEVANT_P (stmt_info))
1735                 {
1736                   tree phi_op;
1737                   gimple *op_def_stmt;
1738
1739                   if (gimple_phi_num_args (phi) != 1)
1740                     return false;
1741
1742                   phi_op = PHI_ARG_DEF (phi, 0);
1743                   if (TREE_CODE (phi_op) != SSA_NAME)
1744                     return false;
1745
1746                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1747                   if (gimple_nop_p (op_def_stmt)
1748                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1749                       || !vinfo_for_stmt (op_def_stmt))
1750                     return false;
1751
1752                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1753                         != vect_used_in_outer
1754                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1755                            != vect_used_in_outer_by_reduction)
1756                     return false;
1757                 }
1758
1759               continue;
1760             }
1761
1762           gcc_assert (stmt_info);
1763
1764           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1765                || STMT_VINFO_LIVE_P (stmt_info))
1766               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1767             {
1768               /* A scalar-dependence cycle that we don't support.  */
1769               if (dump_enabled_p ())
1770                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1771                                  "not vectorized: scalar dependence cycle.\n");
1772               return false;
1773             }
1774
1775           if (STMT_VINFO_RELEVANT_P (stmt_info))
1776             {
1777               need_to_vectorize = true;
1778               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1779                   && ! PURE_SLP_STMT (stmt_info))
1780                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1781               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1782                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1783                        && ! PURE_SLP_STMT (stmt_info))
1784                 ok = vectorizable_reduction (phi, NULL, NULL, NULL);
1785             }
1786
1787           if (ok && STMT_VINFO_LIVE_P (stmt_info))
1788             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1789
1790           if (!ok)
1791             {
1792               if (dump_enabled_p ())
1793                 {
1794                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1795                                    "not vectorized: relevant phi not "
1796                                    "supported: ");
1797                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1798                 }
1799               return false;
1800             }
1801         }
1802
1803       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1804            gsi_next (&si))
1805         {
1806           gimple *stmt = gsi_stmt (si);
1807           if (!gimple_clobber_p (stmt)
1808               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1809             return false;
1810         }
1811     } /* bbs */
1812
1813   /* All operations in the loop are either irrelevant (deal with loop
1814      control, or dead), or only used outside the loop and can be moved
1815      out of the loop (e.g. invariants, inductions).  The loop can be
1816      optimized away by scalar optimizations.  We're better off not
1817      touching this loop.  */
1818   if (!need_to_vectorize)
1819     {
1820       if (dump_enabled_p ())
1821         dump_printf_loc (MSG_NOTE, vect_location,
1822                          "All the computation can be taken out of the loop.\n");
1823       if (dump_enabled_p ())
1824         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1825                          "not vectorized: redundant loop. no profit to "
1826                          "vectorize.\n");
1827       return false;
1828     }
1829
1830   return true;
1831 }
1832
1833
1834 /* Function vect_analyze_loop_2.
1835
1836    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1837    for it.  The different analyses will record information in the
1838    loop_vec_info struct.  */
1839 static bool
1840 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1841 {
1842   bool ok;
1843   int max_vf = MAX_VECTORIZATION_FACTOR;
1844   int min_vf = 2;
1845   unsigned int n_stmts = 0;
1846
1847   /* The first group of checks is independent of the vector size.  */
1848   fatal = true;
1849
1850   /* Find all data references in the loop (which correspond to vdefs/vuses)
1851      and analyze their evolution in the loop.  */
1852
1853   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1854
1855   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1856   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1857     {
1858       if (dump_enabled_p ())
1859         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1860                          "not vectorized: loop nest containing two "
1861                          "or more consecutive inner loops cannot be "
1862                          "vectorized\n");
1863       return false;
1864     }
1865
1866   for (unsigned i = 0; i < loop->num_nodes; i++)
1867     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1868          !gsi_end_p (gsi); gsi_next (&gsi))
1869       {
1870         gimple *stmt = gsi_stmt (gsi);
1871         if (is_gimple_debug (stmt))
1872           continue;
1873         ++n_stmts;
1874         if (!find_data_references_in_stmt (loop, stmt,
1875                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1876           {
1877             if (is_gimple_call (stmt) && loop->safelen)
1878               {
1879                 tree fndecl = gimple_call_fndecl (stmt), op;
1880                 if (fndecl != NULL_TREE)
1881                   {
1882                     cgraph_node *node = cgraph_node::get (fndecl);
1883                     if (node != NULL && node->simd_clones != NULL)
1884                       {
1885                         unsigned int j, n = gimple_call_num_args (stmt);
1886                         for (j = 0; j < n; j++)
1887                           {
1888                             op = gimple_call_arg (stmt, j);
1889                             if (DECL_P (op)
1890                                 || (REFERENCE_CLASS_P (op)
1891                                     && get_base_address (op)))
1892                               break;
1893                           }
1894                         op = gimple_call_lhs (stmt);
1895                         /* Ignore #pragma omp declare simd functions
1896                            if they don't have data references in the
1897                            call stmt itself.  */
1898                         if (j == n
1899                             && !(op
1900                                  && (DECL_P (op)
1901                                      || (REFERENCE_CLASS_P (op)
1902                                          && get_base_address (op)))))
1903                           continue;
1904                       }
1905                   }
1906               }
1907             if (dump_enabled_p ())
1908               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1909                                "not vectorized: loop contains function "
1910                                "calls or data references that cannot "
1911                                "be analyzed\n");
1912             return false;
1913           }
1914       }
1915
1916   /* Analyze the data references and also adjust the minimal
1917      vectorization factor according to the loads and stores.  */
1918
1919   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1920   if (!ok)
1921     {
1922       if (dump_enabled_p ())
1923         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1924                          "bad data references.\n");
1925       return false;
1926     }
1927
1928   /* Classify all cross-iteration scalar data-flow cycles.
1929      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1930   vect_analyze_scalar_cycles (loop_vinfo);
1931
1932   vect_pattern_recog (loop_vinfo);
1933
1934   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1935
1936   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1937      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1938
1939   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1940   if (!ok)
1941     {
1942       if (dump_enabled_p ())
1943         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1944                          "bad data access.\n");
1945       return false;
1946     }
1947
1948   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1949
1950   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1951   if (!ok)
1952     {
1953       if (dump_enabled_p ())
1954         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1955                          "unexpected pattern.\n");
1956       return false;
1957     }
1958
1959   /* While the rest of the analysis below depends on it in some way.  */
1960   fatal = false;
1961
1962   /* Analyze data dependences between the data-refs in the loop
1963      and adjust the maximum vectorization factor according to
1964      the dependences.
1965      FORNOW: fail at the first data dependence that we encounter.  */
1966
1967   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1968   if (!ok
1969       || max_vf < min_vf)
1970     {
1971       if (dump_enabled_p ())
1972             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1973                              "bad data dependence.\n");
1974       return false;
1975     }
1976
1977   ok = vect_determine_vectorization_factor (loop_vinfo);
1978   if (!ok)
1979     {
1980       if (dump_enabled_p ())
1981         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1982                          "can't determine vectorization factor.\n");
1983       return false;
1984     }
1985   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1986     {
1987       if (dump_enabled_p ())
1988         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1989                          "bad data dependence.\n");
1990       return false;
1991     }
1992
1993   /* Compute the scalar iteration cost.  */
1994   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1995
1996   int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1997   HOST_WIDE_INT estimated_niter;
1998   unsigned th;
1999   int min_scalar_loop_bound;
2000
2001   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
2002   ok = vect_analyze_slp (loop_vinfo, n_stmts);
2003   if (!ok)
2004     return false;
2005
2006   /* If there are any SLP instances mark them as pure_slp.  */
2007   bool slp = vect_make_slp_decision (loop_vinfo);
2008   if (slp)
2009     {
2010       /* Find stmts that need to be both vectorized and SLPed.  */
2011       vect_detect_hybrid_slp (loop_vinfo);
2012
2013       /* Update the vectorization factor based on the SLP decision.  */
2014       vect_update_vf_for_slp (loop_vinfo);
2015     }
2016
2017   /* This is the point where we can re-start analysis with SLP forced off.  */
2018 start_over:
2019
2020   /* Now the vectorization factor is final.  */
2021   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2022   gcc_assert (vectorization_factor != 0);
2023
2024   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2025     dump_printf_loc (MSG_NOTE, vect_location,
2026                      "vectorization_factor = %d, niters = "
2027                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
2028                      LOOP_VINFO_INT_NITERS (loop_vinfo));
2029
2030   HOST_WIDE_INT max_niter
2031     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2032   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2033        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
2034       || (max_niter != -1
2035           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
2036     {
2037       if (dump_enabled_p ())
2038         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2039                          "not vectorized: iteration count smaller than "
2040                          "vectorization factor.\n");
2041       return false;
2042     }
2043
2044   /* Analyze the alignment of the data-refs in the loop.
2045      Fail if a data reference is found that cannot be vectorized.  */
2046
2047   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2048   if (!ok)
2049     {
2050       if (dump_enabled_p ())
2051         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2052                          "bad data alignment.\n");
2053       return false;
2054     }
2055
2056   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2057      It is important to call pruning after vect_analyze_data_ref_accesses,
2058      since we use grouping information gathered by interleaving analysis.  */
2059   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2060   if (!ok)
2061     return false;
2062
2063   /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2064      vectorization.  */
2065   if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2066     {
2067     /* This pass will decide on using loop versioning and/or loop peeling in
2068        order to enhance the alignment of data references in the loop.  */
2069     ok = vect_enhance_data_refs_alignment (loop_vinfo);
2070     if (!ok)
2071       {
2072         if (dump_enabled_p ())
2073           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2074                            "bad data alignment.\n");
2075         return false;
2076       }
2077     }
2078
2079   if (slp)
2080     {
2081       /* Analyze operations in the SLP instances.  Note this may
2082          remove unsupported SLP instances which makes the above
2083          SLP kind detection invalid.  */
2084       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2085       vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
2086                                    LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2087       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2088         goto again;
2089     }
2090
2091   /* Scan all the remaining operations in the loop that are not subject
2092      to SLP and make sure they are vectorizable.  */
2093   ok = vect_analyze_loop_operations (loop_vinfo);
2094   if (!ok)
2095     {
2096       if (dump_enabled_p ())
2097         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2098                          "bad operation or unsupported loop bound.\n");
2099       return false;
2100     }
2101
2102   /* If epilog loop is required because of data accesses with gaps,
2103      one additional iteration needs to be peeled.  Check if there is
2104      enough iterations for vectorization.  */
2105   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2106       && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2107     {
2108       int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2109       tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2110
2111       if (wi::to_widest (scalar_niters) < vf)
2112         {
2113           if (dump_enabled_p ())
2114             dump_printf_loc (MSG_NOTE, vect_location,
2115                              "loop has no enough iterations to support"
2116                              " peeling for gaps.\n");
2117           return false;
2118         }
2119     }
2120
2121   /* Analyze cost.  Decide if worth while to vectorize.  */
2122   int min_profitable_estimate, min_profitable_iters;
2123   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2124                                       &min_profitable_estimate);
2125
2126   if (min_profitable_iters < 0)
2127     {
2128       if (dump_enabled_p ())
2129         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2130                          "not vectorized: vectorization not profitable.\n");
2131       if (dump_enabled_p ())
2132         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2133                          "not vectorized: vector version will never be "
2134                          "profitable.\n");
2135       goto again;
2136     }
2137
2138   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2139                             * vectorization_factor) - 1);
2140
2141   /* Use the cost model only if it is more conservative than user specified
2142      threshold.  */
2143   th = (unsigned) min_scalar_loop_bound;
2144   if (min_profitable_iters
2145       && (!min_scalar_loop_bound
2146           || min_profitable_iters > min_scalar_loop_bound))
2147     th = (unsigned) min_profitable_iters;
2148
2149   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2150
2151   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2152       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2153     {
2154       if (dump_enabled_p ())
2155         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2156                          "not vectorized: vectorization not profitable.\n");
2157       if (dump_enabled_p ())
2158         dump_printf_loc (MSG_NOTE, vect_location,
2159                          "not vectorized: iteration count smaller than user "
2160                          "specified loop bound parameter or minimum profitable "
2161                          "iterations (whichever is more conservative).\n");
2162       goto again;
2163     }
2164
2165   estimated_niter
2166     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2167   if (estimated_niter == -1)
2168     estimated_niter = max_niter;
2169   if (estimated_niter != -1
2170       && ((unsigned HOST_WIDE_INT) estimated_niter
2171           <= MAX (th, (unsigned)min_profitable_estimate)))
2172     {
2173       if (dump_enabled_p ())
2174         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2175                          "not vectorized: estimated iteration count too "
2176                          "small.\n");
2177       if (dump_enabled_p ())
2178         dump_printf_loc (MSG_NOTE, vect_location,
2179                          "not vectorized: estimated iteration count smaller "
2180                          "than specified loop bound parameter or minimum "
2181                          "profitable iterations (whichever is more "
2182                          "conservative).\n");
2183       goto again;
2184     }
2185
2186   /* Decide whether we need to create an epilogue loop to handle
2187      remaining scalar iterations.  */
2188   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2189         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2190        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2191
2192   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2193       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2194     {
2195       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2196                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2197           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2198         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2199     }
2200   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2201            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2202                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2203                /* In case of versioning, check if the maximum number of
2204                   iterations is greater than th.  If they are identical,
2205                   the epilogue is unnecessary.  */
2206                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2207                    || (unsigned HOST_WIDE_INT) max_niter > th)))
2208     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2209
2210   /* If an epilogue loop is required make sure we can create one.  */
2211   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2212       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2213     {
2214       if (dump_enabled_p ())
2215         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2216       if (!vect_can_advance_ivs_p (loop_vinfo)
2217           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2218                                            single_exit (LOOP_VINFO_LOOP
2219                                                          (loop_vinfo))))
2220         {
2221           if (dump_enabled_p ())
2222             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2223                              "not vectorized: can't create required "
2224                              "epilog loop\n");
2225           goto again;
2226         }
2227     }
2228
2229   /* During peeling, we need to check if number of loop iterations is
2230      enough for both peeled prolog loop and vector loop.  This check
2231      can be merged along with threshold check of loop versioning, so
2232      increase threshold for this case if necessary.  */
2233   if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
2234       && (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2235           || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2236     {
2237       unsigned niters_th;
2238
2239       /* Niters for peeled prolog loop.  */
2240       if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2241         {
2242           struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2243           tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2244
2245           niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2246         }
2247       else
2248         niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2249
2250       /* Niters for at least one iteration of vectorized loop.  */
2251       niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2252       /* One additional iteration because of peeling for gap.  */
2253       if (!LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2254         niters_th++;
2255       if (LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) < niters_th)
2256         LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = niters_th;
2257     }
2258
2259   gcc_assert (vectorization_factor
2260               == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2261
2262   /* Ok to vectorize!  */
2263   return true;
2264
2265 again:
2266   /* Try again with SLP forced off but if we didn't do any SLP there is
2267      no point in re-trying.  */
2268   if (!slp)
2269     return false;
2270
2271   /* If there are reduction chains re-trying will fail anyway.  */
2272   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2273     return false;
2274
2275   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2276      via interleaving or lane instructions.  */
2277   slp_instance instance;
2278   slp_tree node;
2279   unsigned i, j;
2280   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2281     {
2282       stmt_vec_info vinfo;
2283       vinfo = vinfo_for_stmt
2284           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2285       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2286         continue;
2287       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2288       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2289       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2290       if (! vect_store_lanes_supported (vectype, size)
2291           && ! vect_grouped_store_supported (vectype, size))
2292         return false;
2293       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2294         {
2295           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2296           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2297           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2298           size = STMT_VINFO_GROUP_SIZE (vinfo);
2299           vectype = STMT_VINFO_VECTYPE (vinfo);
2300           if (! vect_load_lanes_supported (vectype, size)
2301               && ! vect_grouped_load_supported (vectype, single_element_p,
2302                                                 size))
2303             return false;
2304         }
2305     }
2306
2307   if (dump_enabled_p ())
2308     dump_printf_loc (MSG_NOTE, vect_location,
2309                      "re-trying with SLP disabled\n");
2310
2311   /* Roll back state appropriately.  No SLP this time.  */
2312   slp = false;
2313   /* Restore vectorization factor as it was without SLP.  */
2314   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2315   /* Free the SLP instances.  */
2316   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2317     vect_free_slp_instance (instance);
2318   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2319   /* Reset SLP type to loop_vect on all stmts.  */
2320   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2321     {
2322       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2323       for (gimple_stmt_iterator si = gsi_start_phis (bb);
2324            !gsi_end_p (si); gsi_next (&si))
2325         {
2326           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2327           STMT_SLP_TYPE (stmt_info) = loop_vect;
2328         }
2329       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2330            !gsi_end_p (si); gsi_next (&si))
2331         {
2332           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2333           STMT_SLP_TYPE (stmt_info) = loop_vect;
2334           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2335             {
2336               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2337               STMT_SLP_TYPE (stmt_info) = loop_vect;
2338               for (gimple_stmt_iterator pi
2339                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2340                    !gsi_end_p (pi); gsi_next (&pi))
2341                 {
2342                   gimple *pstmt = gsi_stmt (pi);
2343                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2344                 }
2345             }
2346         }
2347     }
2348   /* Free optimized alias test DDRS.  */
2349   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2350   /* Reset target cost data.  */
2351   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2352   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2353     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2354   /* Reset assorted flags.  */
2355   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2356   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2357   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2358
2359   goto start_over;
2360 }
2361
2362 /* Function vect_analyze_loop.
2363
2364    Run the vectorizer analyses on LOOP.  On success return the
2365    loop_vec_info that records their results, marked vectorizable;
2366    return NULL otherwise.  A non-NULL ORIG_LOOP_VINFO is recorded in
2367    the new loop_vec_info: LOOP is then an epilogue to be vectorized.  */
2368 loop_vec_info
2369 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2370 {
2371   /* Bit-set of the vector sizes that are still worth trying.  */
2372   unsigned int remaining_sizes;
2373
2374   /* Autodetect the first vector size we try.  */
2375   current_vector_size = 0;
2376   remaining_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2377
2378   if (dump_enabled_p ())
2379     dump_printf_loc (MSG_NOTE, vect_location,
2380                      "===== analyze_loop_nest =====\n");
2381
2382   /* Nothing to do when the enclosing loop is already vectorizable.  */
2383   struct loop *outer = loop_outer (loop);
2384   loop_vec_info outer_vinfo = outer ? loop_vec_info_for_loop (outer) : NULL;
2385   if (outer_vinfo && LOOP_VINFO_VECTORIZABLE_P (outer_vinfo))
2386     {
2387       if (dump_enabled_p ())
2388         dump_printf_loc (MSG_NOTE, vect_location,
2389                          "outer-loop already vectorized.\n");
2390       return NULL;
2391     }
2392
2393   for (;;)
2394     {
2395       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2396       loop_vec_info candidate = vect_analyze_loop_form (loop);
2397       if (candidate == NULL)
2398         {
2399           if (dump_enabled_p ())
2400             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2401                              "bad loop form.\n");
2402           return NULL;
2403         }
2404
2405       if (orig_loop_vinfo)
2406         LOOP_VINFO_ORIG_LOOP_INFO (candidate) = orig_loop_vinfo;
2407
2408       /* If FATAL is true after the analysis, no other size is tried.  */
2409       bool fatal = false;
2410       if (vect_analyze_loop_2 (candidate, fatal))
2411         {
2412           LOOP_VINFO_VECTORIZABLE_P (candidate) = 1;
2413           return candidate;
2414         }
2415
2416       destroy_loop_vec_info (candidate, true);
2417
2418       /* Retire the size just tried.  Give up on a fatal failure, when no
2419          size is left, or when CURRENT_VECTOR_SIZE is still zero.  */
2420       remaining_sizes &= ~current_vector_size;
2421       if (fatal || remaining_sizes == 0 || current_vector_size == 0)
2422         return NULL;
2423
2424       /* Try the next biggest vector size.  */
2425       current_vector_size = 1 << floor_log2 (remaining_sizes);
2426       if (dump_enabled_p ())
2427         dump_printf_loc (MSG_NOTE, vect_location,
2428                          "***** Re-trying analysis with "
2429                          "vector size %d\n", current_vector_size);
2430     }
2431 }
2432
2433
2434 /* Function reduction_code_for_scalar_code
2435
2436    Input:
2437    CODE - the tree_code of a scalar reduction operation.
2438
2439    Output:
2440    REDUC_CODE - the tree-code that reduces a vector of partial results
2441       to a single scalar, or ERROR_MARK if CODE is a supported reduction
2442       without such a tree-code.  Left unwritten when FALSE is returned.
2443
2444    Return TRUE if CODE can be vectorized as a reduction, FALSE otherwise.  */
2445
2446 static bool
2447 reduction_code_for_scalar_code (enum tree_code code,
2448                                 enum tree_code *reduc_code)
2449 {
2450   enum tree_code vector_code;
2451   switch (code)
2452     {
2453     case MAX_EXPR:
2454       vector_code = REDUC_MAX_EXPR;
2455       break;
2456     case MIN_EXPR:
2457       vector_code = REDUC_MIN_EXPR;
2458       break;
2459     case PLUS_EXPR:
2460       vector_code = REDUC_PLUS_EXPR;
2461       break;
2462     /* Supported reductions without a dedicated reduction tree-code.  */
2463     case MULT_EXPR:
2464     case MINUS_EXPR:
2465     case BIT_IOR_EXPR:
2466     case BIT_XOR_EXPR:
2467     case BIT_AND_EXPR:
2468       vector_code = ERROR_MARK;
2469       break;
2470     default:
2471       /* Unsupported: fail without touching *REDUC_CODE.  */
2472       return false;
2473     }
2474   *reduc_code = vector_code;
2475   return true;
2476 }
2477
2478
2479 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2480    STMT is printed with a message MSG.

        MSG_TYPE is the dump kind handed unchanged to both dump calls.  MSG is
        emitted first, at vect_location, and is followed by STMT dumped with
        TDF_SLIM.  This helper does not test dump_enabled_p itself; the calls
        visible in this part of the file are all guarded by it.  */
2481
2482 static void
2483 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2484 {
2485   dump_printf_loc (msg_type, vect_location, "%s", msg);
2486   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2487 }
2488
2489
2490 /* Detect SLP reduction of the form:
2491
2492    #a1 = phi <a5, a0>
2493    a2 = operation (a1)
2494    a3 = operation (a2)
2495    a4 = operation (a3)
2496    a5 = operation (a4)
2497
2498    #a = phi <a5>
2499
2500    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2501    FIRST_STMT is the first reduction stmt in the chain
2502    (a2 = operation (a1)).
2503
2504    Return TRUE if a reduction chain was detected.

        LOOP_INFO is the loop_vec_info being analyzed; a chain is only looked
        for when PHI sits directly in LOOP_VINFO_LOOP (LOOP_INFO).  Only the
        rhs code of FIRST_STMT is used here: every statement of the chain must
        be an assignment with that same code.

        Side effects: while walking the chain the statements are linked through
        GROUP_FIRST_ELEMENT / GROUP_NEXT_ELEMENT.  Operands of chain statements
        may be swapped so that the reduction operand comes second, in which
        case LOOP_VINFO_OPERANDS_SWAPPED is set if a constant ends up as rhs1.
        On success the first statement is pushed onto
        LOOP_VINFO_REDUCTION_CHAINS and its GROUP_SIZE is set.

        NOTE(review): the failing returns below leave any GROUP_* links and
        operand swaps already made in place -- confirm that callers cope with
        a half-built chain.  */
2505
2506 static bool
2507 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2508                        gimple *first_stmt)
2509 {
2510   struct loop *loop = (gimple_bb (phi))->loop_father;
2511   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2512   enum tree_code code;
2513   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2514   stmt_vec_info use_stmt_info, current_stmt_info;
2515   tree lhs;
2516   imm_use_iterator imm_iter;
2517   use_operand_p use_p;
2518   int nloop_uses, size = 0, n_out_of_loop_uses;
2519   bool found = false;
2520
       /* Only a PHI located directly in the loop being vectorized is handled.  */
2521   if (loop != vect_loop)
2522     return false;
2523
2524   lhs = PHI_RESULT (phi);
2525   code = gimple_assign_rhs_code (first_stmt);
       /* Starting from PHI's result, follow the single in-loop use of LHS from
          statement to statement until a use by PHI itself is found.  SIZE
          counts the statements added to the chain.  */
2526   while (1)
2527     {
2528       nloop_uses = 0;
2529       n_out_of_loop_uses = 0;
2530       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2531         {
2532           gimple *use_stmt = USE_STMT (use_p);
2533           if (is_gimple_debug (use_stmt))
2534             continue;
2535
2536           /* Check if we got back to the reduction phi.  */
2537           if (use_stmt == phi)
2538             {
2539               loop_use_stmt = use_stmt;
2540               found = true;
2541               break;
2542             }
2543
2544           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2545             {
2546               loop_use_stmt = use_stmt;
2547               nloop_uses++;
2548             }
2549            else
2550              n_out_of_loop_uses++;
2551
2552            /* There can be either a single use in the loop or two uses in
2553               phi nodes.  */
2554            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2555              return false;
2556         }
2557
2558       if (found)
2559         break;
2560
2561       /* We reached a statement with no loop uses.  */
2562       if (nloop_uses == 0)
2563         return false;
2564
2565       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2566       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2567         return false;
2568
           /* The next chain element must be an in-loop assignment using the
              same operation code as FIRST_STMT.  */
2569       if (!is_gimple_assign (loop_use_stmt)
2570           || code != gimple_assign_rhs_code (loop_use_stmt)
2571           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2572         return false;
2573
2574       /* Insert USE_STMT into reduction chain.  */
2575       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2576       if (current_stmt)
2577         {
2578           current_stmt_info = vinfo_for_stmt (current_stmt);
2579           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2580           GROUP_FIRST_ELEMENT (use_stmt_info)
2581             = GROUP_FIRST_ELEMENT (current_stmt_info);
2582         }
2583       else
2584         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2585
2586       lhs = gimple_assign_lhs (loop_use_stmt);
2587       current_stmt = loop_use_stmt;
2588       size++;
2589    }
2590
       /* A chain must lead back to PHI and contain at least two statements.  */
2591   if (!found || loop_use_stmt != phi || size < 2)
2592     return false;
2593
2594   /* Swap the operands, if needed, to make the reduction operand be the second
2595      operand.  */
       /* LHS is the value feeding the current chain element: PHI's result for
          the first element, then the lhs of the previous element.  */
2596   lhs = PHI_RESULT (phi);
2597   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2598   while (next_stmt)
2599     {
           /* The reduction operand is already second: only the other operand
              (rhs1) has to be validated.  */
2600       if (gimple_assign_rhs2 (next_stmt) == lhs)
2601         {
2602           tree op = gimple_assign_rhs1 (next_stmt);
2603           gimple *def_stmt = NULL;
2604
2605           if (TREE_CODE (op) == SSA_NAME)
2606             def_stmt = SSA_NAME_DEF_STMT (op);
2607
2608           /* Check that the other def is either defined in the loop
2609              ("vect_internal_def"), or it's an induction (defined by a
2610              loop-header phi-node).  */
2611           if (def_stmt
2612               && gimple_bb (def_stmt)
2613               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2614               && (is_gimple_assign (def_stmt)
2615                   || is_gimple_call (def_stmt)
2616                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2617                            == vect_induction_def
2618                   || (gimple_code (def_stmt) == GIMPLE_PHI
2619                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2620                                   == vect_internal_def
2621                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2622             {
2623               lhs = gimple_assign_lhs (next_stmt);
2624               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2625               continue;
2626             }
2627
2628           return false;
2629         }
2630       else
2631         {
               /* rhs2 is not the reduction operand: validate rhs2 as the
                  "other" operand and, if acceptable, exchange rhs1 and rhs2.  */
2632           tree op = gimple_assign_rhs2 (next_stmt);
2633           gimple *def_stmt = NULL;
2634
2635           if (TREE_CODE (op) == SSA_NAME)
2636             def_stmt = SSA_NAME_DEF_STMT (op);
2637
2638           /* Check that the other def is either defined in the loop
2639             ("vect_internal_def"), or it's an induction (defined by a
2640             loop-header phi-node).  */
2641           if (def_stmt
2642               && gimple_bb (def_stmt)
2643               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2644               && (is_gimple_assign (def_stmt)
2645                   || is_gimple_call (def_stmt)
2646                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2647                               == vect_induction_def
2648                   || (gimple_code (def_stmt) == GIMPLE_PHI
2649                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2650                                   == vect_internal_def
2651                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2652             {
2653               if (dump_enabled_p ())
2654                 {
2655                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2656                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2657                 }
2658
2659               swap_ssa_operands (next_stmt,
2660                                  gimple_assign_rhs1_ptr (next_stmt),
2661                                  gimple_assign_rhs2_ptr (next_stmt));
2662               update_stmt (next_stmt);
2663
                   /* Record that the swap left a constant as first operand.  */
2664               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2665                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2666             }
2667           else
2668             return false;
2669         }
2670
2671       lhs = gimple_assign_lhs (next_stmt);
2672       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2673     }
2674
2675   /* Save the chain for further analysis in SLP detection.  */
2676   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2677   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2678   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2679
2680   return true;
2681 }
2682
2683
2684 /* Function vect_is_simple_reduction
2685
2686    (1) Detect a cross-iteration def-use cycle that represents a simple
2687    reduction computation.  We look for the following pattern:
2688
2689    loop_header:
2690      a1 = phi < a0, a2 >
2691      a3 = ...
2692      a2 = operation (a3, a1)
2693
2694    or
2695
2696    a3 = ...
2697    loop_header:
2698      a1 = phi < a0, a2 >
2699      a2 = operation (a3, a1)
2700
2701    such that:
2702    1. operation is commutative and associative and it is safe to
2703       change the order of the computation
2704    2. no uses for a2 in the loop (a2 is used out of the loop)
2705    3. no uses of a1 in the loop besides the reduction operation
2706    4. no uses of a1 outside the loop.
2707
2708    Conditions 1,4 are tested here.
2709    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2710
2711    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2712    nested cycles.
2713
2714    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2715    reductions:
2716
2717      a1 = phi < a0, a2 >
2718      inner loop (def of a3)
2719      a2 = phi < a3 >
2720
2721    (4) Detect condition expressions, ie:
2722      for (int i = 0; i < N; i++)
2723        if (a[i] < val)
2724         ret_val = a[i];
2725

        LOOP_INFO is the loop_vec_info being analyzed and PHI the candidate
        reduction phi.  *DOUBLE_REDUC is cleared on entry and set only when a
        double reduction (3) is detected.  *V_REDUC_TYPE is set to
        TREE_CODE_REDUCTION on entry and changed to COND_REDUCTION for a
        COND_EXPR when PHI's loop is not nested in the vectorized loop.  When
        the reordering checks apply and NEED_WRAPPING_INTEGRAL_OVERFLOW is
        set, an integral reduction whose type does not wrap on overflow and
        whose operation can overflow is rejected.

        Return the statement defining PHI's latch argument when a reduction is
        recognized, NULL otherwise.  The function may modify that statement:
        its operands can be swapped so that the reduction variable comes last
        (for a COND_EXPR by inverting the comparison), and
        vect_is_slp_reduction may link and swap statements of a chain.

2726 */
2727
2728 static gimple *
2729 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2730                           bool *double_reduc,
2731                           bool need_wrapping_integral_overflow,
2732                           enum vect_reduction_type *v_reduc_type)
2733 {
2734   struct loop *loop = (gimple_bb (phi))->loop_father;
2735   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2736   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2737   enum tree_code orig_code, code;
2738   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2739   tree type;
2740   int nloop_uses;
2741   tree name;
2742   imm_use_iterator imm_iter;
2743   use_operand_p use_p;
2744   bool phi_def;
2745
2746   *double_reduc = false;
2747   *v_reduc_type = TREE_CODE_REDUCTION;
2748
2749   name = PHI_RESULT (phi);
2750   /* ???  If there are no uses of the PHI result the inner loop reduction
2751      won't be detected as possibly double-reduction by vectorizable_reduction
2752      because that tries to walk the PHI arg from the preheader edge which
2753      can be constant.  See PR60382.  */
2754   if (has_zero_uses (name))
2755     return NULL;
       /* PHI's result may not be used outside LOOP and may have at most one
          non-debug use inside it; that use is remembered in PHI_USE_STMT.  */
2756   nloop_uses = 0;
2757   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2758     {
2759       gimple *use_stmt = USE_STMT (use_p);
2760       if (is_gimple_debug (use_stmt))
2761         continue;
2762
2763       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2764         {
2765           if (dump_enabled_p ())
2766             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2767                              "intermediate value used outside loop.\n");
2768
2769           return NULL;
2770         }
2771
2772       nloop_uses++;
2773       if (nloop_uses > 1)
2774         {
2775           if (dump_enabled_p ())
2776             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2777                              "reduction value used in loop.\n");
2778           return NULL;
2779         }
2780
2781       phi_use_stmt = use_stmt;
2782     }
2783
       /* The value carried around the loop is PHI's argument on the latch
          edge.  It must be an SSA name defined by an assignment or a PHI.  */
2784   edge latch_e = loop_latch_edge (loop);
2785   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2786   if (TREE_CODE (loop_arg) != SSA_NAME)
2787     {
2788       if (dump_enabled_p ())
2789         {
2790           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2791                            "reduction: not ssa_name: ");
2792           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2793           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2794         }
2795       return NULL;
2796     }
2797
2798   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2799   if (is_gimple_assign (def_stmt))
2800     {
2801       name = gimple_assign_lhs (def_stmt);
2802       phi_def = false;
2803     }
2804   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2805     {
2806       name = PHI_RESULT (def_stmt);
2807       phi_def = true;
2808     }
2809   else
2810     {
2811       if (dump_enabled_p ())
2812         {
2813           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2814                            "reduction: unhandled reduction operation: ");
2815           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2816         }
2817       return NULL;
2818     }
2819
       /* When DEF_STMT is inside LOOP, allow at most one in-loop use of the
          value it defines and collect its uses outside LOOP in LCPHIS; the
          as_a <gphi *> below requires those outside uses to be PHIs.  */
2820   nloop_uses = 0;
2821   auto_vec<gphi *, 3> lcphis;
2822   if (flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2823     FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2824       {
2825         gimple *use_stmt = USE_STMT (use_p);
2826         if (is_gimple_debug (use_stmt))
2827           continue;
2828         if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2829           nloop_uses++;
2830         else
2831           /* We can have more than one loop-closed PHI.  */
2832           lcphis.safe_push (as_a <gphi *> (use_stmt));
2833         if (nloop_uses > 1)
2834           {
2835             if (dump_enabled_p ())
2836               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2837                                "reduction used in loop.\n");
2838             return NULL;
2839           }
2840       }
2841
2842   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2843      defined in the inner loop.  */
2844   if (phi_def)
2845     {
2846       op1 = PHI_ARG_DEF (def_stmt, 0);
2847
2848       if (gimple_phi_num_args (def_stmt) != 1
2849           || TREE_CODE (op1) != SSA_NAME)
2850         {
2851           if (dump_enabled_p ())
2852             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2853                              "unsupported phi node definition.\n");
2854
2855           return NULL;
2856         }
2857
           /* A double reduction needs DEF_STMT inside LOOP, its argument
              defined by an assignment in LOOP's inner loop, and PHI's single
              use located in that inner loop as well.  */
2858       def1 = SSA_NAME_DEF_STMT (op1);
2859       if (gimple_bb (def1)
2860           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2861           && loop->inner
2862           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2863           && is_gimple_assign (def1)
2864           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2865         {
2866           if (dump_enabled_p ())
2867             report_vect_op (MSG_NOTE, def_stmt,
2868                             "detected double reduction: ");
2869
2870           *double_reduc = true;
2871           return def_stmt;
2872         }
2873
2874       return NULL;
2875     }
2876
2877   /* If we are vectorizing an inner reduction we are executing that
2878      in the original order only in case we are not dealing with a
2879      double reduction.  */
2880   bool check_reduction = true;
2881   if (flow_loop_nested_p (vect_loop, loop))
2882     {
2883       gphi *lcphi;
2884       unsigned i;
2885       check_reduction = false;
           /* Re-enable the order checks if any collected PHI result has a
              non-debug use outside the vectorized loop.  */
2886       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2887         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2888           {
2889             gimple *use_stmt = USE_STMT (use_p);
2890             if (is_gimple_debug (use_stmt))
2891               continue;
2892             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2893               check_reduction = true;
2894           }
2895     }
2896
2897   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2898   code = orig_code = gimple_assign_rhs_code (def_stmt);
2899
2900   /* We can handle "res -= x[i]", which is non-associative by
2901      simply rewriting this into "res += -x[i]".  Avoid changing
2902      gimple instruction for the first simple tests and only do this
2903      if we're allowed to change code at all.  */
2904   if (code == MINUS_EXPR
2905       && (op1 = gimple_assign_rhs1 (def_stmt))
2906       && TREE_CODE (op1) == SSA_NAME
2907       && SSA_NAME_DEF_STMT (op1) == phi)
2908     code = PLUS_EXPR;
2909
       /* For a COND_EXPR, OP1 and OP2 are the two selected values (rhs2 and
          rhs3); OP3 and OP4 are the operands of the comparison in rhs1, or
          OP3 is rhs1 itself when rhs1 is not a comparison.  */
2910   if (code == COND_EXPR)
2911     {
2912       if (! nested_in_vect_loop)
2913         *v_reduc_type = COND_REDUCTION;
2914
2915       op3 = gimple_assign_rhs1 (def_stmt);
2916       if (COMPARISON_CLASS_P (op3))
2917         {
2918           op4 = TREE_OPERAND (op3, 1);
2919           op3 = TREE_OPERAND (op3, 0);
2920         }
2921
2922       op1 = gimple_assign_rhs2 (def_stmt);
2923       op2 = gimple_assign_rhs3 (def_stmt);
2924     }
2925   else if (!commutative_tree_code (code) || !associative_tree_code (code))
2926     {
2927       if (dump_enabled_p ())
2928         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2929                         "reduction: not commutative/associative: ");
2930       return NULL;
2931     }
2932   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2933     {
2934       op1 = gimple_assign_rhs1 (def_stmt);
2935       op2 = gimple_assign_rhs2 (def_stmt);
2936     }
2937   else
2938     {
2939       if (dump_enabled_p ())
2940         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2941                         "reduction: not handled operation: ");
2942       return NULL;
2943     }
2944
2945   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2946     {
2947       if (dump_enabled_p ())
2948         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2949                         "reduction: both uses not ssa_names: ");
2950
2951       return NULL;
2952     }
2953
       /* Every SSA-name operand must have a type compatible with the type of
          the statement's lhs.
          NOTE(review): this rejection (and "no defs for operands" below) is
          dumped with MSG_NOTE while the other rejections use
          MSG_MISSED_OPTIMIZATION -- confirm that this is intended.  */
2954   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2955   if ((TREE_CODE (op1) == SSA_NAME
2956        && !types_compatible_p (type,TREE_TYPE (op1)))
2957       || (TREE_CODE (op2) == SSA_NAME
2958           && !types_compatible_p (type, TREE_TYPE (op2)))
2959       || (op3 && TREE_CODE (op3) == SSA_NAME
2960           && !types_compatible_p (type, TREE_TYPE (op3)))
2961       || (op4 && TREE_CODE (op4) == SSA_NAME
2962           && !types_compatible_p (type, TREE_TYPE (op4))))
2963     {
2964       if (dump_enabled_p ())
2965         {
2966           dump_printf_loc (MSG_NOTE, vect_location,
2967                            "reduction: multiple types: operation type: ");
2968           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2969           dump_printf (MSG_NOTE, ", operands types: ");
2970           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2971                              TREE_TYPE (op1));
2972           dump_printf (MSG_NOTE, ",");
2973           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2974                              TREE_TYPE (op2));
2975           if (op3)
2976             {
2977               dump_printf (MSG_NOTE, ",");
2978               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2979                                  TREE_TYPE (op3));
2980             }
2981
2982           if (op4)
2983             {
2984               dump_printf (MSG_NOTE, ",");
2985               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2986                                  TREE_TYPE (op4));
2987             }
2988           dump_printf (MSG_NOTE, "\n");
2989         }
2990
2991       return NULL;
2992     }
2993
2994   /* Check that it's ok to change the order of the computation.
2995      Generally, when vectorizing a reduction we change the order of the
2996      computation.  This may change the behavior of the program in some
2997      cases, so we need to check that this is ok.  One exception is when
2998      vectorizing an outer-loop: the inner-loop is executed sequentially,
2999      and therefore vectorizing reductions in the inner-loop during
3000      outer-loop vectorization is safe.  */
3001
3002   if (*v_reduc_type != COND_REDUCTION
3003       && check_reduction)
3004     {
3005       /* CHECKME: check for !flag_finite_math_only too?  */
3006       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
3007         {
3008           /* Changing the order of operations changes the semantics.  */
3009           if (dump_enabled_p ())
3010             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3011                         "reduction: unsafe fp math optimization: ");
3012           return NULL;
3013         }
3014       else if (INTEGRAL_TYPE_P (type))
3015         {
3016           if (!operation_no_trapping_overflow (type, code))
3017             {
3018               /* Changing the order of operations changes the semantics.  */
3019               if (dump_enabled_p ())
3020                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3021                                 "reduction: unsafe int math optimization"
3022                                 " (overflow traps): ");
3023               return NULL;
3024             }
3025           if (need_wrapping_integral_overflow
3026               && !TYPE_OVERFLOW_WRAPS (type)
3027               && operation_can_overflow (code))
3028             {
3029               /* Changing the order of operations changes the semantics.  */
3030               if (dump_enabled_p ())
3031                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3032                                 "reduction: unsafe int math optimization"
3033                                 " (overflow doesn't wrap): ");
3034               return NULL;
3035             }
3036         }
3037       else if (SAT_FIXED_POINT_TYPE_P (type))
3038         {
3039           /* Changing the order of operations changes the semantics.  */
3040           if (dump_enabled_p ())
3041           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3042                           "reduction: unsafe fixed-point math optimization: ");
3043           return NULL;
3044         }
3045     }
3046
3047   /* Reduction is safe. We're dealing with one of the following:
3048      1) integer arithmetic and no trapv
3049      2) floating point arithmetic, and special flags permit this optimization
3050      3) nested cycle (i.e., outer loop vectorization).  */
3051   if (TREE_CODE (op1) == SSA_NAME)
3052     def1 = SSA_NAME_DEF_STMT (op1);
3053
3054   if (TREE_CODE (op2) == SSA_NAME)
3055     def2 = SSA_NAME_DEF_STMT (op2);
3056
3057   if (code != COND_EXPR
3058       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3059     {
3060       if (dump_enabled_p ())
3061         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3062       return NULL;
3063     }
3064
3065   /* Check that one def is the reduction def, defined by PHI,
3066      the other def is either defined in the loop ("vect_internal_def"),
3067      or it's an induction (defined by a loop-header phi-node).  */
3068
       /* Case 1: the reduction variable is already the last operand (OP2),
          so DEF_STMT is returned unchanged.  */
3069   if (def2 && def2 == phi
3070       && (code == COND_EXPR
3071           || !def1 || gimple_nop_p (def1)
3072           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3073           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3074               && (is_gimple_assign (def1)
3075                   || is_gimple_call (def1)
3076                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3077                       == vect_induction_def
3078                   || (gimple_code (def1) == GIMPLE_PHI
3079                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3080                           == vect_internal_def
3081                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3082     {
3083       if (dump_enabled_p ())
3084         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3085       return def_stmt;
3086     }
3087
       /* Case 2: the reduction variable is OP1.  Unless PHI's loop is nested
          in the vectorized loop or the statement was a MINUS_EXPR, the
          operands are exchanged so that it becomes the last one.  */
3088   if (def1 && def1 == phi
3089       && (code == COND_EXPR
3090           || !def2 || gimple_nop_p (def2)
3091           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3092           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3093               && (is_gimple_assign (def2)
3094                   || is_gimple_call (def2)
3095                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3096                        == vect_induction_def
3097                   || (gimple_code (def2) == GIMPLE_PHI
3098                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3099                            == vect_internal_def
3100                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3101     {
3102       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3103         {
3104           /* Check if we can swap operands (just for simplicity - so that
3105              the rest of the code can assume that the reduction variable
3106              is always the last (second) argument).  */
3107           if (code == COND_EXPR)
3108             {
3109               /* Swap cond_expr by inverting the condition.  */
3110               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3111               enum tree_code invert_code = ERROR_MARK;
3112               enum tree_code cond_code = TREE_CODE (cond_expr);
3113
3114               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3115                 {
3116                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3117                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3118                 }
                   /* ERROR_MARK here means the condition is not a comparison
                      or no inverted comparison code was obtained.  */
3119               if (invert_code != ERROR_MARK)
3120                 {
3121                   TREE_SET_CODE (cond_expr, invert_code);
3122                   swap_ssa_operands (def_stmt,
3123                                      gimple_assign_rhs2_ptr (def_stmt),
3124                                      gimple_assign_rhs3_ptr (def_stmt));
3125                 }
3126               else
3127                 {
3128                   if (dump_enabled_p ())
3129                     report_vect_op (MSG_NOTE, def_stmt,
3130                                     "detected reduction: cannot swap operands "
3131                                     "for cond_expr");
3132                   return NULL;
3133                 }
3134             }
3135           else
3136             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3137                                gimple_assign_rhs2_ptr (def_stmt));
3138
3139           if (dump_enabled_p ())
3140             report_vect_op (MSG_NOTE, def_stmt,
3141                             "detected reduction: need to swap operands: ");
3142
               /* Record that the swap left a constant as first operand.  */
3143           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3144             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3145         }
3146       else
3147         {
3148           if (dump_enabled_p ())
3149             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3150         }
3151
3152       return def_stmt;
3153     }
3154
3155   /* Try to find SLP reduction chain.  */
3156   if (! nested_in_vect_loop
3157       && code != COND_EXPR
3158       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3159     {
3160       if (dump_enabled_p ())
3161         report_vect_op (MSG_NOTE, def_stmt,
3162                         "reduction: detected reduction chain: ");
3163
3164       return def_stmt;
3165     }
3166
3167   if (dump_enabled_p ())
3168     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3169                     "reduction: unknown pattern: ");
3170
3171   return NULL;
3172 }
3173
3174 /* Wrapper around vect_is_simple_reduction, which will modify code
3175    in-place if it enables detection of more reductions.  Arguments
3176    as there.  */
3177
3178 gimple *
3179 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3180                              bool *double_reduc,
3181                              bool need_wrapping_integral_overflow)
3182 {
3183   enum vect_reduction_type v_reduc_type;
3184   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3185                                           need_wrapping_integral_overflow,
3186                                           &v_reduc_type);
3187   if (def)
3188     {
3189       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3190       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3191       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3192       reduc_def_info = vinfo_for_stmt (def);
3193       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3194     }
3195   return def;
3196 }
3197
3198 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3199 int
3200 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3201                              int *peel_iters_epilogue,
3202                              stmt_vector_for_cost *scalar_cost_vec,
3203                              stmt_vector_for_cost *prologue_cost_vec,
3204                              stmt_vector_for_cost *epilogue_cost_vec)
3205 {
3206   int retval = 0;
3207   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3208
3209   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3210     {
3211       *peel_iters_epilogue = vf/2;
3212       if (dump_enabled_p ())
3213         dump_printf_loc (MSG_NOTE, vect_location,
3214                          "cost model: epilogue peel iters set to vf/2 "
3215                          "because loop iterations are unknown .\n");
3216
3217       /* If peeled iterations are known but number of scalar loop
3218          iterations are unknown, count a taken branch per peeled loop.  */
3219       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3220                                  NULL, 0, vect_prologue);
3221       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3222                                  NULL, 0, vect_epilogue);
3223     }
3224   else
3225     {
3226       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3227       peel_iters_prologue = niters < peel_iters_prologue ?
3228                             niters : peel_iters_prologue;
3229       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3230       /* If we need to peel for gaps, but no peeling is required, we have to
3231          peel VF iterations.  */
3232       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3233         *peel_iters_epilogue = vf;
3234     }
3235
3236   stmt_info_for_cost *si;
3237   int j;
3238   if (peel_iters_prologue)
3239     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3240         {
3241           stmt_vec_info stmt_info
3242             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3243           retval += record_stmt_cost (prologue_cost_vec,
3244                                       si->count * peel_iters_prologue,
3245                                       si->kind, stmt_info, si->misalign,
3246                                       vect_prologue);
3247         }
3248   if (*peel_iters_epilogue)
3249     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3250         {
3251           stmt_vec_info stmt_info
3252             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3253           retval += record_stmt_cost (epilogue_cost_vec,
3254                                       si->count * *peel_iters_epilogue,
3255                                       si->kind, stmt_info, si->misalign,
3256                                       vect_epilogue);
3257         }
3258
3259   return retval;
3260 }
3261
3262 /* Function vect_estimate_min_profitable_iters
3263
3264    Return the number of iterations required for the vector version of the
3265    loop to be profitable relative to the cost of the scalar version of the
3266    loop.
3267
3268    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3269    of iterations for vectorization.  -1 value means loop vectorization
3270    is not profitable.  This returned value may be used for dynamic
3271    profitability check.
3272
3273    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3274    for static check against estimated number of iterations.  */
3275
3276 static void
3277 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3278                                     int *ret_min_profitable_niters,
3279                                     int *ret_min_profitable_estimate)
3280 {
3281   int min_profitable_iters;
3282   int min_profitable_estimate;
3283   int peel_iters_prologue;
3284   int peel_iters_epilogue;
3285   unsigned vec_inside_cost = 0;
3286   int vec_outside_cost = 0;
3287   unsigned vec_prologue_cost = 0;
3288   unsigned vec_epilogue_cost = 0;
3289   int scalar_single_iter_cost = 0;
3290   int scalar_outside_cost = 0;
3291   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3292   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3293   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3294
3295   /* Cost model disabled.  */
3296   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3297     {
3298       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3299       *ret_min_profitable_niters = 0;
3300       *ret_min_profitable_estimate = 0;
3301       return;
3302     }
3303
3304   /* Requires loop versioning tests to handle misalignment.  */
3305   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3306     {
3307       /*  FIXME: Make cost depend on complexity of individual check.  */
3308       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3309       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3310                             vect_prologue);
3311       dump_printf (MSG_NOTE,
3312                    "cost model: Adding cost of checks for loop "
3313                    "versioning to treat misalignment.\n");
3314     }
3315
3316   /* Requires loop versioning with alias checks.  */
3317   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3318     {
3319       /*  FIXME: Make cost depend on complexity of individual check.  */
3320       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3321       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3322                             vect_prologue);
3323       dump_printf (MSG_NOTE,
3324                    "cost model: Adding cost of checks for loop "
3325                    "versioning aliasing.\n");
3326     }
3327
3328   /* Requires loop versioning with niter checks.  */
3329   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3330     {
3331       /*  FIXME: Make cost depend on complexity of individual check.  */
3332       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3333                             vect_prologue);
3334       dump_printf (MSG_NOTE,
3335                    "cost model: Adding cost of checks for loop "
3336                    "versioning niters.\n");
3337     }
3338
3339   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3340     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3341                           vect_prologue);
3342
3343   /* Count statements in scalar loop.  Using this as scalar cost for a single
3344      iteration for now.
3345
3346      TODO: Add outer loop support.
3347
3348      TODO: Consider assigning different costs to different scalar
3349      statements.  */
3350
3351   scalar_single_iter_cost
3352     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3353
3354   /* Add additional cost for the peeled instructions in prologue and epilogue
3355      loop.
3356
3357      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3358      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3359
3360      TODO: Build an expression that represents peel_iters for prologue and
3361      epilogue to be used in a run-time test.  */
3362
3363   if (npeel  < 0)
3364     {
3365       peel_iters_prologue = vf/2;
3366       dump_printf (MSG_NOTE, "cost model: "
3367                    "prologue peel iters set to vf/2.\n");
3368
3369       /* If peeling for alignment is unknown, loop bound of main loop becomes
3370          unknown.  */
3371       peel_iters_epilogue = vf/2;
3372       dump_printf (MSG_NOTE, "cost model: "
3373                    "epilogue peel iters set to vf/2 because "
3374                    "peeling for alignment is unknown.\n");
3375
3376       /* If peeled iterations are unknown, count a taken branch and a not taken
3377          branch per peeled loop. Even if scalar loop iterations are known,
3378          vector iterations are not known since peeled prologue iterations are
3379          not known. Hence guards remain the same.  */
3380       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3381                             NULL, 0, vect_prologue);
3382       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3383                             NULL, 0, vect_prologue);
3384       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3385                             NULL, 0, vect_epilogue);
3386       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3387                             NULL, 0, vect_epilogue);
3388       stmt_info_for_cost *si;
3389       int j;
3390       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3391         {
3392           struct _stmt_vec_info *stmt_info
3393             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3394           (void) add_stmt_cost (target_cost_data,
3395                                 si->count * peel_iters_prologue,
3396                                 si->kind, stmt_info, si->misalign,
3397                                 vect_prologue);
3398           (void) add_stmt_cost (target_cost_data,
3399                                 si->count * peel_iters_epilogue,
3400                                 si->kind, stmt_info, si->misalign,
3401                                 vect_epilogue);
3402         }
3403     }
3404   else
3405     {
3406       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3407       stmt_info_for_cost *si;
3408       int j;
3409       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3410
3411       prologue_cost_vec.create (2);
3412       epilogue_cost_vec.create (2);
3413       peel_iters_prologue = npeel;
3414
3415       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3416                                           &peel_iters_epilogue,
3417                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3418                                             (loop_vinfo),
3419                                           &prologue_cost_vec,
3420                                           &epilogue_cost_vec);
3421
3422       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3423         {
3424           struct _stmt_vec_info *stmt_info
3425             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3426           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3427                                 si->misalign, vect_prologue);
3428         }
3429
3430       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3431         {
3432           struct _stmt_vec_info *stmt_info
3433             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3434           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3435                                 si->misalign, vect_epilogue);
3436         }
3437
3438       prologue_cost_vec.release ();
3439       epilogue_cost_vec.release ();
3440     }
3441
3442   /* FORNOW: The scalar outside cost is incremented in one of the
3443      following ways:
3444
3445      1. The vectorizer checks for alignment and aliasing and generates
3446      a condition that allows dynamic vectorization.  A cost model
3447      check is ANDED with the versioning condition.  Hence scalar code
3448      path now has the added cost of the versioning check.
3449
3450        if (cost > th & versioning_check)
3451          jmp to vector code
3452
3453      Hence run-time scalar is incremented by not-taken branch cost.
3454
3455      2. The vectorizer then checks if a prologue is required.  If the
3456      cost model check was not done before during versioning, it has to
3457      be done before the prologue check.
3458
3459        if (cost <= th)
3460          prologue = scalar_iters
3461        if (prologue == 0)
3462          jmp to vector code
3463        else
3464          execute prologue
3465        if (prologue == num_iters)
3466          go to exit
3467
3468      Hence the run-time scalar cost is incremented by a taken branch,
3469      plus a not-taken branch, plus a taken branch cost.
3470
3471      3. The vectorizer then checks if an epilogue is required.  If the
3472      cost model check was not done before during prologue check, it
3473      has to be done with the epilogue check.
3474
3475        if (prologue == 0)
3476          jmp to vector code
3477        else
3478          execute prologue
3479        if (prologue == num_iters)
3480          go to exit
3481        vector code:
3482          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3483            jmp to epilogue
3484
3485      Hence the run-time scalar cost should be incremented by 2 taken
3486      branches.
3487
3488      TODO: The back end may reorder the BBS's differently and reverse
3489      conditions/branch directions.  Change the estimates below to
3490      something more reasonable.  */
3491
3492   /* If the number of iterations is known and we do not do versioning, we can
3493      decide whether to vectorize at compile time.  Hence the scalar version
3494      do not carry cost model guard costs.  */
3495   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3496       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3497     {
3498       /* Cost model check occurs at versioning.  */
3499       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3500         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3501       else
3502         {
3503           /* Cost model check occurs at prologue generation.  */
3504           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3505             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3506               + vect_get_stmt_cost (cond_branch_not_taken);
3507           /* Cost model check occurs at epilogue generation.  */
3508           else
3509             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3510         }
3511     }
3512
3513   /* Complete the target-specific cost calculations.  */
3514   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3515                &vec_inside_cost, &vec_epilogue_cost);
3516
3517   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3518
3519   if (dump_enabled_p ())
3520     {
3521       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3522       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3523                    vec_inside_cost);
3524       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3525                    vec_prologue_cost);
3526       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3527                    vec_epilogue_cost);
3528       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3529                    scalar_single_iter_cost);
3530       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3531                    scalar_outside_cost);
3532       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3533                    vec_outside_cost);
3534       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3535                    peel_iters_prologue);
3536       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3537                    peel_iters_epilogue);
3538     }
3539
3540   /* Calculate number of iterations required to make the vector version
3541      profitable, relative to the loop bodies only.  The following condition
3542      must hold true:
3543      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3544      where
3545      SIC = scalar iteration cost, VIC = vector iteration cost,
3546      VOC = vector outside cost, VF = vectorization factor,
3547      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3548      SOC = scalar outside cost for run time cost model check.  */
3549
3550   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3551     {
3552       if (vec_outside_cost <= 0)
3553         min_profitable_iters = 1;
3554       else
3555         {
3556           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3557                                   - vec_inside_cost * peel_iters_prologue
3558                                   - vec_inside_cost * peel_iters_epilogue)
3559                                  / ((scalar_single_iter_cost * vf)
3560                                     - vec_inside_cost);
3561
3562           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3563               <= (((int) vec_inside_cost * min_profitable_iters)
3564                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3565             min_profitable_iters++;
3566         }
3567     }
3568   /* vector version will never be profitable.  */
3569   else
3570     {
3571       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3572         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3573                     "did not happen for a simd loop");
3574
3575       if (dump_enabled_p ())
3576         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3577                          "cost model: the vector iteration cost = %d "
3578                          "divided by the scalar iteration cost = %d "
3579                          "is greater or equal to the vectorization factor = %d"
3580                          ".\n",
3581                          vec_inside_cost, scalar_single_iter_cost, vf);
3582       *ret_min_profitable_niters = -1;
3583       *ret_min_profitable_estimate = -1;
3584       return;
3585     }
3586
3587   dump_printf (MSG_NOTE,
3588                "  Calculated minimum iters for profitability: %d\n",
3589                min_profitable_iters);
3590
3591   min_profitable_iters =
3592         min_profitable_iters < vf ? vf : min_profitable_iters;
3593
3594   /* Because the condition we create is:
3595      if (niters <= min_profitable_iters)
3596        then skip the vectorized loop.  */
3597   min_profitable_iters--;
3598
3599   if (dump_enabled_p ())
3600     dump_printf_loc (MSG_NOTE, vect_location,
3601                      "  Runtime profitability threshold = %d\n",
3602                      min_profitable_iters);
3603
3604   *ret_min_profitable_niters = min_profitable_iters;
3605
3606   /* Calculate number of iterations required to make the vector version
3607      profitable, relative to the loop bodies only.
3608
3609      Non-vectorized variant is SIC * niters and it must win over vector
3610      variant on the expected loop trip count.  The following condition must hold true:
3611      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3612
3613   if (vec_outside_cost <= 0)
3614     min_profitable_estimate = 1;
3615   else
3616     {
3617       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3618                                  - vec_inside_cost * peel_iters_prologue
3619                                  - vec_inside_cost * peel_iters_epilogue)
3620                                  / ((scalar_single_iter_cost * vf)
3621                                    - vec_inside_cost);
3622     }
3623   min_profitable_estimate --;
3624   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3625   if (dump_enabled_p ())
3626     dump_printf_loc (MSG_NOTE, vect_location,
3627                      "  Static estimate profitability threshold = %d\n",
3628                      min_profitable_estimate);
3629
3630   *ret_min_profitable_estimate = min_profitable_estimate;
3631 }
3632
3633 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3634    vector elements (not bits) for a vector of mode MODE.  */
3635 static void
3636 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3637                               unsigned char *sel)
3638 {
3639   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3640
3641   for (i = 0; i < nelt; i++)
3642     sel[i] = (i + offset) & (2*nelt - 1);
3643 }
3644
3645 /* Checks whether the target supports whole-vector shifts for vectors of mode
3646    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3647    it supports vec_perm_const with masks for all necessary shift amounts.  */
3648 static bool
3649 have_whole_vector_shift (enum machine_mode mode)
3650 {
3651   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3652     return true;
3653
3654   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3655     return false;
3656
3657   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3658   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3659
3660   for (i = nelt/2; i >= 1; i/=2)
3661     {
3662       calc_vec_perm_mask_for_shift (mode, i, sel);
3663       if (!can_vec_perm_p (mode, false, sel))
3664         return false;
3665     }
3666   return true;
3667 }
3668
3669 /* Return the reduction operand (with index REDUC_INDEX) of STMT.  */
3670
3671 static tree
3672 get_reduction_op (gimple *stmt, int reduc_index)
3673 {
3674   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3675     {
3676     case GIMPLE_SINGLE_RHS:
3677       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3678                   == ternary_op);
3679       return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3680     case GIMPLE_UNARY_RHS:
3681       return gimple_assign_rhs1 (stmt);
3682     case GIMPLE_BINARY_RHS:
3683       return (reduc_index
3684               ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3685     case GIMPLE_TERNARY_RHS:
3686       return gimple_op (stmt, reduc_index + 1);
3687     default:
3688       gcc_unreachable ();
3689     }
3690 }
3691
3692 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3693    functions. Design better to avoid maintenance issues.  */
3694
3695 /* Function vect_model_reduction_cost.
3696
3697    Models cost for a reduction operation, including the vector ops
3698    generated within the strip-mine loop, the initial definition before
3699    the loop, and the epilogue code that must be generated.  */
3700
3701 static void
3702 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3703                            int ncopies)
3704 {
3705   int prologue_cost = 0, epilogue_cost = 0;
3706   enum tree_code code;
3707   optab optab;
3708   tree vectype;
3709   gimple *orig_stmt;
3710   machine_mode mode;
3711   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3712   struct loop *loop = NULL;
3713   void *target_cost_data;
3714
3715   if (loop_vinfo)
3716     {
3717       loop = LOOP_VINFO_LOOP (loop_vinfo);
3718       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3719     }
3720   else
3721     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3722
3723   /* Condition reductions generate two reductions in the loop.  */
3724   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3725     ncopies *= 2;
3726
3727   /* Cost of reduction op inside loop.  */
3728   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3729                                         stmt_info, 0, vect_body);
3730
3731   vectype = STMT_VINFO_VECTYPE (stmt_info);
3732   mode = TYPE_MODE (vectype);
3733   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3734
3735   if (!orig_stmt)
3736     orig_stmt = STMT_VINFO_STMT (stmt_info);
3737
3738   code = gimple_assign_rhs_code (orig_stmt);
3739
3740   /* Add in cost for initial definition.
3741      For cond reduction we have four vectors: initial index, step, initial
3742      result of the data reduction, initial value of the index reduction.  */
3743   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3744                        == COND_REDUCTION ? 4 : 1;
3745   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3746                                   scalar_to_vec, stmt_info, 0,
3747                                   vect_prologue);
3748
3749   /* Determine cost of epilogue code.
3750
3751      We have a reduction operator that will reduce the vector in one statement.
3752      Also requires scalar extract.  */
3753
3754   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3755     {
3756       if (reduc_code != ERROR_MARK)
3757         {
3758           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3759             {
3760               /* An EQ stmt and an COND_EXPR stmt.  */
3761               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3762                                               vector_stmt, stmt_info, 0,
3763                                               vect_epilogue);
3764               /* Reduction of the max index and a reduction of the found
3765                  values.  */
3766               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3767                                               vec_to_scalar, stmt_info, 0,
3768                                               vect_epilogue);
3769               /* A broadcast of the max value.  */
3770               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3771                                               scalar_to_vec, stmt_info, 0,
3772                                               vect_epilogue);
3773             }
3774           else
3775             {
3776               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3777                                               stmt_info, 0, vect_epilogue);
3778               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3779                                               vec_to_scalar, stmt_info, 0,
3780                                               vect_epilogue);
3781             }
3782         }
3783       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3784         {
3785           unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
3786           /* Extraction of scalar elements.  */
3787           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits,
3788                                           vec_to_scalar, stmt_info, 0,
3789                                           vect_epilogue);
3790           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3791           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits - 3,
3792                                           scalar_stmt, stmt_info, 0,
3793                                           vect_epilogue);
3794         }
3795       else
3796         {
3797           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3798           tree bitsize =
3799             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3800           int element_bitsize = tree_to_uhwi (bitsize);
3801           int nelements = vec_size_in_bits / element_bitsize;
3802
3803           if (code == COND_EXPR)
3804             code = MAX_EXPR;
3805
3806           optab = optab_for_tree_code (code, vectype, optab_default);
3807
3808           /* We have a whole vector shift available.  */
3809           if (optab != unknown_optab
3810               && VECTOR_MODE_P (mode)
3811               && optab_handler (optab, mode) != CODE_FOR_nothing
3812               && have_whole_vector_shift (mode))
3813             {
3814               /* Final reduction via vector shifts and the reduction operator.
3815                  Also requires scalar extract.  */
3816               epilogue_cost += add_stmt_cost (target_cost_data,
3817                                               exact_log2 (nelements) * 2,
3818                                               vector_stmt, stmt_info, 0,
3819                                               vect_epilogue);
3820               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3821                                               vec_to_scalar, stmt_info, 0,
3822                                               vect_epilogue);
3823             }
3824           else
3825             /* Use extracts and reduction op for final reduction.  For N
3826                elements, we have N extracts and N-1 reduction ops.  */
3827             epilogue_cost += add_stmt_cost (target_cost_data,
3828                                             nelements + nelements - 1,
3829                                             vector_stmt, stmt_info, 0,
3830                                             vect_epilogue);
3831         }
3832     }
3833
3834   if (dump_enabled_p ())
3835     dump_printf (MSG_NOTE,
3836                  "vect_model_reduction_cost: inside_cost = %d, "
3837                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3838                  prologue_cost, epilogue_cost);
3839 }
3840
3841
3842 /* Function vect_model_induction_cost.
3843    Models cost for induction operations: STMT_INFO describes the induction
3844    stmt and NCOPIES is the number of vector stmts costed in the loop body.  */
3845
3846 static void
3847 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3848 {
3849   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3850   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3851   unsigned inside_cost, prologue_cost;
3852   /* No cost is recorded by this function for pure-SLP stmts.  */
3853   if (PURE_SLP_STMT (stmt_info))
3854     return;
3855
3856   /* Loop-body cost: NCOPIES vector stmts (the vec_loop copies).  */
3857   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3858                                stmt_info, 0, vect_body);
3859
3860   /* Prologue cost: two scalar_to_vec stmts, for vec_init and vec_step.  */
3861   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3862                                  stmt_info, 0, vect_prologue);
3863   /* The values returned above are only used for the dump message below.  */
3864   if (dump_enabled_p ())
3865     dump_printf_loc (MSG_NOTE, vect_location,
3866                      "vect_model_induction_cost: inside_cost = %d, "
3867                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3868 }
3869
3870
3871
3872 /* Function get_initial_def_for_reduction
3873
3874    Input:
3875    STMT - a stmt that performs a reduction operation in the loop.
3876    INIT_VAL - the initial value of the reduction variable.
3877
3878    Output:
3879    ADJUSTMENT_DEF - if not NULL, set to a tree that holds a value the caller
3880         has to fold into the final result (see below), or to NULL if none.
3881    Return a vector variable, initialized according to the operation that STMT
3882         performs. This vector will be used as the initial value of the
3883         vector of partial results.
3884
3885    Option1 (adjust in epilog): Initialize the vector as follows:
3886      add/sub/bit or/xor: [0,0,...,0,0]  (also widen_sum/dot_prod/sad)
3887      mult:               [1,1,...,1,1]      bit and: [-1,-1,...,-1,-1]
3888      min/max/cond_expr:  [init_val,init_val,..,init_val,init_val]
3889    and when necessary (e.g. add/mult case) let the caller know, via
3890    ADJUSTMENT_DEF, that it needs to adjust the result by init_val.
3891
3892    Option2: Initialize the vector as follows:
3893      add/sub/bit or/xor: [init_val,0,0,...,0]
3894      mult:               [init_val,1,1,...,1]   bit and: [init_val,-1,...,-1]
3895      min/max/cond_expr:  [init_val,init_val,...,init_val]
3896    and no adjustments are needed.
3897
3898    For example, for the following code:
3899
3900    s = init_val;
3901    for (i=0;i<n;i++)
3902      s = s + a[i];
3903
3904    STMT is 's = s + a[i]', and the reduction variable is 's'.
3905    For a vector of 4 units, we want to return either [init_val,0,0,0],
3906    or [0,0,0,0] and let the caller know that it needs to adjust
3907    the result at the end by 'init_val'.
3908
3909    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3910    initialization vector is simpler (same element in all entries), if
3911    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3912    When STMT is nested in the vectorized loop, *ADJUSTMENT_DEF is set to NULL.
3913    A cost model should help decide between these two schemes.  */
3914
3915 tree
3916 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3917                                tree *adjustment_def)
3918 {
3919   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3920   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3921   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3922   tree scalar_type = TREE_TYPE (init_val);
3923   tree vectype = get_vectype_for_scalar_type (scalar_type);
3924   int nunits;
3925   enum tree_code code = gimple_assign_rhs_code (stmt);
3926   tree def_for_init;
3927   tree init_def;
3928   tree *elts;
3929   int i;
3930   bool nested_in_vect_loop = false;
3931   REAL_VALUE_TYPE real_init_val = dconst0;
3932   int int_init_val = 0;
3933   gimple *def_stmt = NULL;
3934   gimple_seq stmts = NULL;
3935   /* The vector type is derived from the type of INIT_VAL and must exist.  */
3936   gcc_assert (vectype);
3937   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3938   /* Only pointer, integral and scalar float element types are handled.  */
3939   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3940               || SCALAR_FLOAT_TYPE_P (scalar_type));
3941   /* STMT is either nested in LOOP or its block must belong to LOOP itself.  */
3942   if (nested_in_vect_loop_p (loop, stmt))
3943     nested_in_vect_loop = true;
3944   else
3945     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3946
3947   /* In case of double reduction we only create a vector variable to be put
3948      in the reduction phi node.  The actual statement creation is done in
3949      vect_create_epilog_for_reduction.  */
3950   if (adjustment_def && nested_in_vect_loop
3951       && TREE_CODE (init_val) == SSA_NAME
3952       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3953       && gimple_code (def_stmt) == GIMPLE_PHI
3954       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3955       && vinfo_for_stmt (def_stmt)
3956       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3957           == vect_double_reduction_def)
3958     {
3959       *adjustment_def = NULL;
3960       return vect_create_destination_var (init_val, vectype);
3961     }
3962
3963   /* In case of a nested reduction do not use an adjustment def as
3964      that case is not supported by the epilogue generation correctly
3965      if ncopies is not one.  */
3966   if (adjustment_def && nested_in_vect_loop)
3967     {
3968       *adjustment_def = NULL;
3969       return vect_get_vec_def_for_operand (init_val, stmt);
3970     }
3971   /* Otherwise build the initial vector according to the reduction CODE.  */
3972   switch (code)
3973     {
3974       case WIDEN_SUM_EXPR:
3975       case DOT_PROD_EXPR:
3976       case SAD_EXPR:
3977       case PLUS_EXPR:
3978       case MINUS_EXPR:
3979       case BIT_IOR_EXPR:
3980       case BIT_XOR_EXPR:
3981       case MULT_EXPR:
3982       case BIT_AND_EXPR:
3983         /* ADJUSTMENT_DEF is NULL when called from
3984            vect_create_epilog_for_reduction to vectorize double reduction.  */
3985         if (adjustment_def)
3986           *adjustment_def = init_val;
3987         /* The neutral value is 0, except 1 for MULT_EXPR ...  */
3988         if (code == MULT_EXPR)
3989           {
3990             real_init_val = dconst1;
3991             int_init_val = 1;
3992           }
3993         /* ... and -1 for BIT_AND_EXPR.  */
3994         if (code == BIT_AND_EXPR)
3995           int_init_val = -1;
3996         /* Materialize the neutral value as a constant of SCALAR_TYPE.  */
3997         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3998           def_for_init = build_real (scalar_type, real_init_val);
3999         else
4000           def_for_init = build_int_cst (scalar_type, int_init_val);
4001
4002         /* Fill elements 1 .. NUNITS-1 with the neutral value.  */
4003         elts = XALLOCAVEC (tree, nunits);
4004         for (i = nunits - 2; i >= 0; --i)
4005           elts[i + 1] = def_for_init;
4006
4007         /* Option1: the first element is the neutral value as well.  */
4008         if (adjustment_def)
4009           {
4010             elts[0] = def_for_init;
4011             init_def = build_vector (vectype, elts);
4012             break;
4013           }
4014
4015         /* Option2: the first element is INIT_VAL (CONSTRUCTOR if not constant).  */
4016         elts[0] = init_val;
4017         if (TREE_CONSTANT (init_val))
4018           init_def = build_vector (vectype, elts);
4019         else
4020           {
4021             vec<constructor_elt, va_gc> *v;
4022             vec_alloc (v, nunits);
4023             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4024             for (i = 1; i < nunits; ++i)
4025               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4026             init_def = build_constructor (vectype, v);
4027           }
4028
4029         break;
4030       /* For these codes the vector is built from INIT_VAL with no adjustment.  */
4031       case MIN_EXPR:
4032       case MAX_EXPR:
4033       case COND_EXPR:
4034         if (adjustment_def)
4035           {
4036             *adjustment_def = NULL_TREE;
4037             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4038               {
4039                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4040                 break;
4041               }
4042           }
4043         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4044         if (! gimple_seq_empty_p (stmts))
4045           gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4046         init_def = build_vector_from_val (vectype, init_val);
4047         break;
4048
4049       default:
4050         gcc_unreachable ();
4051     }
4052
4053   return init_def;
4054 }
4055
4056 /* Get at the initial vector defs for the reduction in SLP_NODE and push them
4057    onto VEC_OPRNDS.  NUMBER_OF_VECTORS is the number of vector defs to create,
4058    REDUC_INDEX the index of the reduction operand in the statements and CODE
     the tree code used to select a neutral element.  */
4059
4060 static void
4061 get_initial_defs_for_reduction (slp_tree slp_node,
4062                                 vec<tree> *vec_oprnds,
4063                                 unsigned int number_of_vectors,
4064                                 int reduc_index, enum tree_code code)
4065 {
4066   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4067   gimple *stmt = stmts[0];
4068   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4069   unsigned nunits;
4070   tree vec_cst;
4071   tree *elts;
4072   unsigned j, number_of_places_left_in_vector;
4073   tree vector_type, scalar_type;
4074   tree vop;
4075   int group_size = stmts.length ();
4076   unsigned int vec_num, i;
4077   unsigned number_of_copies = 1;
4078   vec<tree> voprnds;
4079   voprnds.create (number_of_vectors);
4080   bool constant_p;
4081   tree neutral_op = NULL;
4082   gimple *def_stmt;
4083   struct loop *loop;
4084   gimple_seq ctor_seq = NULL;
4085   /* NOTE(review): CTOR_SEQ is never set to non-NULL in this function.  */
4086   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4087   scalar_type = TREE_TYPE (vector_type);
4088   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4089
4090   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def
4091               && reduc_index != -1);
4092
4093   /* Select NEUTRAL_OP according to CODE; it stays NULL if there is none.  */
4094   /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4095      we need either neutral operands or the original operands.  See
4096      get_initial_def_for_reduction() for details.  */
4097   switch (code)
4098     {
4099     case WIDEN_SUM_EXPR:
4100     case DOT_PROD_EXPR:
4101     case SAD_EXPR:
4102     case PLUS_EXPR:
4103     case MINUS_EXPR:
4104     case BIT_IOR_EXPR:
4105     case BIT_XOR_EXPR:
4106       neutral_op = build_zero_cst (scalar_type);
4107       break;
4108
4109     case MULT_EXPR:
4110       neutral_op = build_one_cst (scalar_type);
4111       break;
4112
4113     case BIT_AND_EXPR:
4114       neutral_op = build_all_ones_cst (scalar_type);
4115       break;
4116
4117     /* For MIN/MAX we don't have an easy neutral operand but
4118        the initial values can be used fine here.  Only for a reduction
4119        chain we force one: the initial value on the preheader edge.  */
4120     case MAX_EXPR:
4121     case MIN_EXPR:
4122       if (!GROUP_FIRST_ELEMENT (stmt_vinfo))
4123         neutral_op = NULL;
4124       else
4125         {
4126           tree op = get_reduction_op (stmts[0], reduc_index);
4127           def_stmt = SSA_NAME_DEF_STMT (op);
4128           loop = (gimple_bb (stmt))->loop_father;
4129           neutral_op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4130                                               loop_preheader_edge (loop));
4131         }
4132       break;
4133     /* Any other code must not be a reduction chain; no neutral element.  */
4134     default:
4135       gcc_assert (!GROUP_FIRST_ELEMENT (stmt_vinfo));
4136       neutral_op = NULL;
4137     }
4138
4139   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4140      created vectors. It is greater than 1 if unrolling is performed.
4141
4142      For example, we have two scalar operands, s1 and s2 (e.g., group of
4143      strided accesses of size two), while NUNITS is four (i.e., four scalars
4144      of this type can be packed in a vector).  The output vector will contain
4145      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4146      will be 2).
4147
4148      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4149      containing the operands.
4150
4151      For example, NUNITS is four as before, and the group size is 8
4152      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4153      {s5, s6, s7, s8}.  */
4154
4155   number_of_copies = nunits * number_of_vectors / group_size;
4156   /* ELTS is filled from the back; the stmts are likewise walked in reverse.  */
4157   number_of_places_left_in_vector = nunits;
4158   constant_p = true;
4159   elts = XALLOCAVEC (tree, nunits);
4160   for (j = 0; j < number_of_copies; j++)
4161     {
4162       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4163         {
4164           tree op = get_reduction_op (stmt, reduc_index);
4165           loop = (gimple_bb (stmt))->loop_father;
4166           def_stmt = SSA_NAME_DEF_STMT (op);
4167
4168           gcc_assert (loop);
4169
4170           /* Get the def before the loop, or NEUTRAL_OP (if any) for all but the
4171              last copy.  In reduction chain we have only one initial value.  */
4172           if ((j != (number_of_copies - 1)
4173                || (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt))
4174                    && i != 0))
4175               && neutral_op)
4176             op = neutral_op;
4177           else
4178             op = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4179                                         loop_preheader_edge (loop));
4180
4181           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4182           number_of_places_left_in_vector--;
4183           elts[number_of_places_left_in_vector] = op;
4184           if (!CONSTANT_CLASS_P (op))
4185             constant_p = false;
4186           /* ELTS is full: build the vector, record it and start a new one.  */
4187           if (number_of_places_left_in_vector == 0)
4188             {
4189               if (constant_p)
4190                 vec_cst = build_vector (vector_type, elts);
4191               else
4192                 {
4193                   vec<constructor_elt, va_gc> *v;
4194                   unsigned k;
4195                   vec_alloc (v, nunits);
4196                   for (k = 0; k < nunits; ++k)
4197                     CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[k]);
4198                   vec_cst = build_constructor (vector_type, v);
4199                 }
4200               tree init;
4201               gimple_stmt_iterator gsi;
4202               init = vect_init_vector (stmt, vec_cst, vector_type, NULL);
4203               if (ctor_seq != NULL)
4204                 {
4205                   gsi = gsi_for_stmt (SSA_NAME_DEF_STMT (init));
4206                   gsi_insert_seq_before_without_update (&gsi, ctor_seq,
4207                                                         GSI_SAME_STMT);
4208                   ctor_seq = NULL;
4209                 }
4210               voprnds.quick_push (init);
4211
4212               number_of_places_left_in_vector = nunits;
4213               constant_p = true;
4214             }
4215         }
4216     }
4217
4218   /* Since the vectors are created in the reverse order, we should invert
4219      them.  */
4220   vec_num = voprnds.length ();
4221   for (j = vec_num; j != 0; j--)
4222     {
4223       vop = voprnds[j - 1];
4224       vec_oprnds->quick_push (vop);
4225     }
4226
4227   voprnds.release ();
4228
4229   /* In case that VF is greater than the unrolling factor needed for the SLP
4230      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4231      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have to
4232      add neutral vectors or, lacking a neutral element, replicate the vectors.  */
4233   while (number_of_vectors > vec_oprnds->length ())
4234     {
4235       tree neutral_vec = NULL;
4236       /* NOTE(review): NEUTRAL_VEC is reset above on every iteration.  */
4237       if (neutral_op)
4238         {
4239           if (!neutral_vec)
4240             neutral_vec = build_vector_from_val (vector_type, neutral_op);
4241
4242           vec_oprnds->quick_push (neutral_vec);
4243         }
4244       else
4245         {
4246           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4247             vec_oprnds->quick_push (vop);
4248         }
4249     }
4250 }
4251
4252
4253 /* Function vect_create_epilog_for_reduction
4254
4255    Create code at the loop-epilog to finalize the result of a reduction
4256    computation.
4257
4258    VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of vector
4259      reduction statements.
4260    STMT is the scalar reduction stmt that is being vectorized.
4261    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4262      number of elements that we can fit in a vectype (nunits).  In this case
4263      we have to generate more than one vector stmt - i.e - we need to "unroll"
4264      the vector stmt by a factor VF/nunits.  For more details see documentation
4265      in vectorizable_operation.
4266    REDUC_CODE is the tree-code for the epilog reduction.
4267    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4268      computation.
4269    REDUC_INDEX is the index of the operand in the right hand side of the
4270      statement that is defined by REDUCTION_PHI.
4271    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4272    SLP_NODE is an SLP node containing a group of reduction statements. The
4273      first one in this group is STMT.
4274
4275    This function:
4276    1. Creates the reduction def-use cycles: sets the arguments for
4277       REDUCTION_PHIS:
4278       The loop-entry argument is the vectorized initial-value of the reduction.
4279       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4280       sums.
4281    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4282       by applying the operation specified by REDUC_CODE if available, or by
4283       other means (whole-vector shifts or a scalar loop).
4284       The function also creates a new phi node at the loop exit to preserve
4285       loop-closed form, as illustrated below.
4286
4287      The flow at the entry to this function:
4288
4289         loop:
4290           vec_def = phi <null, null>            # REDUCTION_PHI
4291           VECT_DEF = vector_stmt                # vectorized form of STMT
4292           s_loop = scalar_stmt                  # (scalar) STMT
4293         loop_exit:
4294           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4295           use <s_out0>
4296           use <s_out0>
4297
4298      The above is transformed by this function into:
4299
4300         loop:
4301           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4302           VECT_DEF = vector_stmt                # vectorized form of STMT
4303           s_loop = scalar_stmt                  # (scalar) STMT
4304         loop_exit:
4305           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4306           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4307           v_out2 = reduce <v_out1>
4308           s_out3 = extract_field <v_out2, 0>
4309           s_out4 = adjust_result <s_out3>
4310           use <s_out4>
4311           use <s_out4>
4312 */
4313
4314 static void
4315 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4316                                   int ncopies, enum tree_code reduc_code,
4317                                   vec<gimple *> reduction_phis,
4318                                   int reduc_index, bool double_reduc,
4319                                   slp_tree slp_node)
4320 {
4321   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4322   stmt_vec_info prev_phi_info;
4323   tree vectype;
4324   machine_mode mode;
4325   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4326   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4327   basic_block exit_bb;
4328   tree scalar_dest;
4329   tree scalar_type;
4330   gimple *new_phi = NULL, *phi;
4331   gimple_stmt_iterator exit_gsi;
4332   tree vec_dest;
4333   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4334   gimple *epilog_stmt = NULL;
4335   enum tree_code code = gimple_assign_rhs_code (stmt);
4336   gimple *exit_phi;
4337   tree bitsize;
4338   tree adjustment_def = NULL;
4339   tree vec_initial_def = NULL;
4340   tree expr, def, initial_def = NULL;
4341   tree orig_name, scalar_result;
4342   imm_use_iterator imm_iter, phi_imm_iter;
4343   use_operand_p use_p, phi_use_p;
4344   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4345   bool nested_in_vect_loop = false;
4346   auto_vec<gimple *> new_phis;
4347   auto_vec<gimple *> inner_phis;
4348   enum vect_def_type dt = vect_unknown_def_type;
4349   int j, i;
4350   auto_vec<tree> scalar_results;
4351   unsigned int group_size = 1, k, ratio;
4352   auto_vec<tree> vec_initial_defs;
4353   auto_vec<gimple *> phis;
4354   bool slp_reduc = false;
4355   tree new_phi_result;
4356   gimple *inner_phi = NULL;
4357   tree induction_index = NULL_TREE;
4358
4359   if (slp_node)
4360     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4361
4362   if (nested_in_vect_loop_p (loop, stmt))
4363     {
4364       outer_loop = loop;
4365       loop = loop->inner;
4366       nested_in_vect_loop = true;
4367       gcc_assert (!slp_node);
4368     }
4369
4370   vectype = STMT_VINFO_VECTYPE (stmt_info);
4371   gcc_assert (vectype);
4372   mode = TYPE_MODE (vectype);
4373
4374   /* 1. Create the reduction def-use cycle:
4375      Set the arguments of REDUCTION_PHIS, i.e., transform
4376
4377         loop:
4378           vec_def = phi <null, null>            # REDUCTION_PHI
4379           VECT_DEF = vector_stmt                # vectorized form of STMT
4380           ...
4381
4382      into:
4383
4384         loop:
4385           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4386           VECT_DEF = vector_stmt                # vectorized form of STMT
4387           ...
4388
4389      (in case of SLP, do it for all the phis). */
4390
4391   /* Get the loop-entry arguments.  */
4392   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4393   if (slp_node)
4394     {
4395       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4396       vec_initial_defs.reserve (vec_num);
4397       get_initial_defs_for_reduction (slp_node, &vec_initial_defs,
4398                                       vec_num, reduc_index, code);
4399     }
4400   else
4401     {
4402       /* Get at the scalar def before the loop, that defines the initial value
4403          of the reduction variable.  */
4404       tree reduction_op = get_reduction_op (stmt, reduc_index);
4405       gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4406       initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4407                                            loop_preheader_edge (loop));
4408       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4409       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4410                                                        &adjustment_def);
4411       vec_initial_defs.create (1);
4412       vec_initial_defs.quick_push (vec_initial_def);
4413     }
4414
4415   /* Set phi nodes arguments.  */
4416   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4417     {
4418       tree vec_init_def, def;
4419       gimple_seq stmts;
4420       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4421                                            true, NULL_TREE);
4422       if (stmts)
4423         gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4424
4425       def = vect_defs[i];
4426       for (j = 0; j < ncopies; j++)
4427         {
4428           if (j != 0)
4429             {
4430               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4431               if (nested_in_vect_loop)
4432                 vec_init_def
4433                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4434                                                     vec_init_def);
4435             }
4436
4437           /* Set the loop-entry arg of the reduction-phi.  */
4438
4439           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4440               == INTEGER_INDUC_COND_REDUCTION)
4441             {
4442               /* Initialise the reduction phi to zero.  This prevents non-zero
4443                  initial values from interfering with the reduction op.  */
4444               gcc_assert (ncopies == 1);
4445               gcc_assert (i == 0);
4446
4447               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4448               tree zero_vec = build_zero_cst (vec_init_def_type);
4449
4450               add_phi_arg (as_a <gphi *> (phi), zero_vec,
4451                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4452             }
4453           else
4454             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4455                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4456
4457           /* Set the loop-latch arg for the reduction-phi.  */
4458           if (j > 0)
4459             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4460
4461           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4462                        UNKNOWN_LOCATION);
4463
4464           if (dump_enabled_p ())
4465             {
4466               dump_printf_loc (MSG_NOTE, vect_location,
4467                                "transform reduction: created def-use cycle: ");
4468               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4469               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4470             }
4471         }
4472     }
4473
4474   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4475      which is updated with the current index of the loop for every match of
4476      the original loop's cond_expr (VEC_STMT).  This results in a vector
4477      containing the last time the condition passed for that vector lane.
4478      The first match will be a 1 to allow 0 to be used for non-matching
4479      indexes.  If there are no matches at all then the vector will be all
4480      zeroes.  */
4481   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4482     {
4483       tree indx_before_incr, indx_after_incr;
4484       int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4485       int k;
4486
4487       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4488       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4489
4490       int scalar_precision
4491         = GET_MODE_PRECISION (TYPE_MODE (TREE_TYPE (vectype)));
4492       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4493       tree cr_index_vector_type = build_vector_type
4494         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4495
4496       /* First we create a simple vector induction variable which starts
4497          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4498          vector size (STEP).  */
4499
4500       /* Create a {1,2,3,...} vector.  */
4501       tree *vtemp = XALLOCAVEC (tree, nunits_out);
4502       for (k = 0; k < nunits_out; ++k)
4503         vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
4504       tree series_vect = build_vector (cr_index_vector_type, vtemp);
4505
4506       /* Create a vector of the step value.  */
4507       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4508       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4509
4510       /* Create an induction variable.  */
4511       gimple_stmt_iterator incr_gsi;
4512       bool insert_after;
4513       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4514       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4515                  insert_after, &indx_before_incr, &indx_after_incr);
4516
4517       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4518          filled with zeros (VEC_ZERO).  */
4519
4520       /* Create a vector of 0s.  */
4521       tree zero = build_zero_cst (cr_index_scalar_type);
4522       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4523
4524       /* Create a vector phi node.  */
4525       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4526       new_phi = create_phi_node (new_phi_tree, loop->header);
4527       set_vinfo_for_stmt (new_phi,
4528                           new_stmt_vec_info (new_phi, loop_vinfo));
4529       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4530                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4531
4532       /* Now take the condition from the loops original cond_expr
4533          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4534          every match uses values from the induction variable
4535          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4536          (NEW_PHI_TREE).
4537          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4538          the new cond_expr (INDEX_COND_EXPR).  */
4539
4540       /* Duplicate the condition from vec_stmt.  */
4541       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4542
4543       /* Create a conditional, where the condition is taken from vec_stmt
4544          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4545          else is the phi (NEW_PHI_TREE).  */
4546       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4547                                      ccompare, indx_before_incr,
4548                                      new_phi_tree);
4549       induction_index = make_ssa_name (cr_index_vector_type);
4550       gimple *index_condition = gimple_build_assign (induction_index,
4551                                                      index_cond_expr);
4552       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4553       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4554                                                         loop_vinfo);
4555       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4556       set_vinfo_for_stmt (index_condition, index_vec_info);
4557
4558       /* Update the phi with the vec cond.  */
4559       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4560                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4561     }
4562
4563   /* 2. Create epilog code.
4564         The reduction epilog code operates across the elements of the vector
4565         of partial results computed by the vectorized loop.
4566         The reduction epilog code consists of:
4567
4568         step 1: compute the scalar result in a vector (v_out2)
4569         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4570         step 3: adjust the scalar result (s_out3) if needed.
4571
4572         Step 1 can be accomplished using one of the following three schemes:
4573           (scheme 1) using reduc_code, if available.
4574           (scheme 2) using whole-vector shifts, if available.
4575           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4576                      combined.
4577
4578           The overall epilog code looks like this:
4579
4580           s_out0 = phi <s_loop>         # original EXIT_PHI
4581           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4582           v_out2 = reduce <v_out1>              # step 1
4583           s_out3 = extract_field <v_out2, 0>    # step 2
4584           s_out4 = adjust_result <s_out3>       # step 3
4585
4586           (step 3 is optional, and steps 1 and 2 may be combined).
4587           Lastly, the uses of s_out0 are replaced by s_out4.  */
4588
4589
4590   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4591          v_out1 = phi <VECT_DEF>
4592          Store them in NEW_PHIS.  */
4593
4594   exit_bb = single_exit (loop)->dest;
4595   prev_phi_info = NULL;
4596   new_phis.create (vect_defs.length ());
4597   FOR_EACH_VEC_ELT (vect_defs, i, def)
4598     {
4599       for (j = 0; j < ncopies; j++)
4600         {
4601           tree new_def = copy_ssa_name (def);
4602           phi = create_phi_node (new_def, exit_bb);
4603           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4604           if (j == 0)
4605             new_phis.quick_push (phi);
4606           else
4607             {
4608               def = vect_get_vec_def_for_stmt_copy (dt, def);
4609               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4610             }
4611
4612           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4613           prev_phi_info = vinfo_for_stmt (phi);
4614         }
4615     }
4616
4617   /* The epilogue is created for the outer-loop, i.e., for the loop being
4618      vectorized.  Create exit phis for the outer loop.  */
4619   if (double_reduc)
4620     {
4621       loop = outer_loop;
4622       exit_bb = single_exit (loop)->dest;
4623       inner_phis.create (vect_defs.length ());
4624       FOR_EACH_VEC_ELT (new_phis, i, phi)
4625         {
4626           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4627           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4628           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4629                            PHI_RESULT (phi));
4630           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4631                                                             loop_vinfo));
4632           inner_phis.quick_push (phi);
4633           new_phis[i] = outer_phi;
4634           prev_phi_info = vinfo_for_stmt (outer_phi);
4635           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4636             {
4637               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4638               new_result = copy_ssa_name (PHI_RESULT (phi));
4639               outer_phi = create_phi_node (new_result, exit_bb);
4640               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4641                                PHI_RESULT (phi));
4642               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4643                                                                 loop_vinfo));
4644               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4645               prev_phi_info = vinfo_for_stmt (outer_phi);
4646             }
4647         }
4648     }
4649
4650   exit_gsi = gsi_after_labels (exit_bb);
4651
4652   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4653          (i.e. when reduc_code is not available) and in the final adjustment
4654          code (if needed).  Also get the original scalar reduction variable as
4655          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4656          represents a reduction pattern), the tree-code and scalar-def are
4657          taken from the original stmt that the pattern-stmt (STMT) replaces.
4658          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4659          are taken from STMT.  */
4660
4661   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4662   if (!orig_stmt)
4663     {
4664       /* Regular reduction  */
4665       orig_stmt = stmt;
4666     }
4667   else
4668     {
4669       /* Reduction pattern  */
4670       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4671       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4672       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4673     }
4674
4675   code = gimple_assign_rhs_code (orig_stmt);
4676   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4677      partial results are added and not subtracted.  */
4678   if (code == MINUS_EXPR)
4679     code = PLUS_EXPR;
4680
4681   scalar_dest = gimple_assign_lhs (orig_stmt);
4682   scalar_type = TREE_TYPE (scalar_dest);
4683   scalar_results.create (group_size);
4684   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4685   bitsize = TYPE_SIZE (scalar_type);
4686
4687   /* In case this is a reduction in an inner-loop while vectorizing an outer
4688      loop - we don't need to extract a single scalar result at the end of the
4689      inner-loop (unless it is double reduction, i.e., the use of reduction is
4690      outside the outer-loop).  The final vector of partial results will be used
4691      in the vectorized outer-loop, or reduced to a scalar result at the end of
4692      the outer-loop.  */
4693   if (nested_in_vect_loop && !double_reduc)
4694     goto vect_finalize_reduction;
4695
4696   /* SLP reduction without reduction chain, e.g.,
4697      # a1 = phi <a2, a0>
4698      # b1 = phi <b2, b0>
4699      a2 = operation (a1)
4700      b2 = operation (b1)  */
4701   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4702
4703   /* In case of reduction chain, e.g.,
4704      # a1 = phi <a3, a0>
4705      a2 = operation (a1)
4706      a3 = operation (a2),
4707
4708      we may end up with more than one vector result.  Here we reduce them to
4709      one vector.  */
4710   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4711     {
4712       tree first_vect = PHI_RESULT (new_phis[0]);
4713       tree tmp;
4714       gassign *new_vec_stmt = NULL;
4715
4716       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4717       for (k = 1; k < new_phis.length (); k++)
4718         {
4719           gimple *next_phi = new_phis[k];
4720           tree second_vect = PHI_RESULT (next_phi);
4721
4722           tmp = build2 (code, vectype,  first_vect, second_vect);
4723           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4724           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4725           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4726           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4727         }
4728
4729       new_phi_result = first_vect;
4730       if (new_vec_stmt)
4731         {
4732           new_phis.truncate (0);
4733           new_phis.safe_push (new_vec_stmt);
4734         }
4735     }
4736   else
4737     new_phi_result = PHI_RESULT (new_phis[0]);
4738
4739   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4740       && reduc_code != ERROR_MARK)
4741     {
4742       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4743          various data values where the condition matched and another vector
4744          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4745          need to extract the last matching index (which will be the index with
4746          highest value) and use this to index into the data vector.
4747          For the case where there were no matches, the data vector will contain
4748          all default values and the index vector will be all zeros.  */
4749
4750       /* Get various versions of the type of the vector of indexes.  */
4751       tree index_vec_type = TREE_TYPE (induction_index);
4752       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4753       tree index_scalar_type = TREE_TYPE (index_vec_type);
4754       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4755         (index_vec_type);
4756
4757       /* Get an unsigned integer version of the type of the data vector.  */
4758       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4759       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4760       tree vectype_unsigned = build_vector_type
4761         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4762
4763       /* First we need to create a vector (ZERO_VEC) of zeros and another
4764          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4765          can create using a MAX reduction and then expanding.
4766          In the case where the loop never made any matches, the max index will
4767          be zero.  */
4768
4769       /* Vector of {0, 0, 0,...}.  */
4770       tree zero_vec = make_ssa_name (vectype);
4771       tree zero_vec_rhs = build_zero_cst (vectype);
4772       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4773       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4774
4775       /* Find maximum value from the vector of found indexes.  */
4776       tree max_index = make_ssa_name (index_scalar_type);
4777       gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4778                                                     induction_index);
4779       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4780
4781       /* Vector of {max_index, max_index, max_index,...}.  */
4782       tree max_index_vec = make_ssa_name (index_vec_type);
4783       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4784                                                       max_index);
4785       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4786                                                         max_index_vec_rhs);
4787       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4788
4789       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4790          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4791          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4792          otherwise.  Only one value should match, resulting in a vector
4793          (VEC_COND) with one data value and the rest zeros.
4794          In the case where the loop never made any matches, every index will
4795          match, resulting in a vector with all data values (which will all be
4796          the default value).  */
4797
4798       /* Compare the max index vector to the vector of found indexes to find
4799          the position of the max value.  */
4800       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4801       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4802                                                       induction_index,
4803                                                       max_index_vec);
4804       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4805
4806       /* Use the compare to choose either values from the data vector or
4807          zero.  */
4808       tree vec_cond = make_ssa_name (vectype);
4809       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4810                                                    vec_compare, new_phi_result,
4811                                                    zero_vec);
4812       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4813
4814       /* Finally we need to extract the data value from the vector (VEC_COND)
4815          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4816          reduction, but because this doesn't exist, we can use a MAX reduction
4817          instead.  The data value might be signed or a float so we need to cast
4818          it first.
4819          In the case where the loop never made any matches, the data values are
4820          all identical, and so will reduce down correctly.  */
4821
4822       /* Make the matched data values unsigned.  */
4823       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4824       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4825                                        vec_cond);
4826       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4827                                                         VIEW_CONVERT_EXPR,
4828                                                         vec_cond_cast_rhs);
4829       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4830
4831       /* Reduce down to a scalar value.  */
4832       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4833       optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4834                                       optab_default);
4835       gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4836                   != CODE_FOR_nothing);
4837       gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4838                                                      REDUC_MAX_EXPR,
4839                                                      vec_cond_cast);
4840       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4841
4842       /* Convert the reduced value back to the result type and set as the
4843          result.  */
4844       gimple_seq stmts = NULL;
4845       new_temp = gimple_convert (&stmts, scalar_type, data_reduc);
4846       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4847       scalar_results.safe_push (new_temp);
4848     }
4849   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4850            && reduc_code == ERROR_MARK)
4851     {
4852       /* Condition reduction without supported REDUC_MAX_EXPR.  Generate
4853          idx = 0;
4854          idx_val = induction_index[0];
4855          val = data_reduc[0];
4856          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4857            if (induction_index[i] > idx_val)
4858              val = data_reduc[i], idx_val = induction_index[i];
4859          return val;  */
4860
4861       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4862       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4863       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4864       unsigned HOST_WIDE_INT v_size
4865         = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4866       tree idx_val = NULL_TREE, val = NULL_TREE;
4867       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4868         {
4869           tree old_idx_val = idx_val;
4870           tree old_val = val;
4871           idx_val = make_ssa_name (idx_eltype);
4872           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4873                                              build3 (BIT_FIELD_REF, idx_eltype,
4874                                                      induction_index,
4875                                                      bitsize_int (el_size),
4876                                                      bitsize_int (off)));
4877           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4878           val = make_ssa_name (data_eltype);
4879           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4880                                              build3 (BIT_FIELD_REF,
4881                                                      data_eltype,
4882                                                      new_phi_result,
4883                                                      bitsize_int (el_size),
4884                                                      bitsize_int (off)));
4885           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4886           if (off != 0)
4887             {
4888               tree new_idx_val = idx_val;
4889               tree new_val = val;
4890               if (off != v_size - el_size)
4891                 {
4892                   new_idx_val = make_ssa_name (idx_eltype);
4893                   epilog_stmt = gimple_build_assign (new_idx_val,
4894                                                      MAX_EXPR, idx_val,
4895                                                      old_idx_val);
4896                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4897                 }
4898               new_val = make_ssa_name (data_eltype);
4899               epilog_stmt = gimple_build_assign (new_val,
4900                                                  COND_EXPR,
4901                                                  build2 (GT_EXPR,
4902                                                          boolean_type_node,
4903                                                          idx_val,
4904                                                          old_idx_val),
4905                                                  val, old_val);
4906               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4907               idx_val = new_idx_val;
4908               val = new_val;
4909             }
4910         }
4911       /* Convert the reduced value back to the result type and set as the
4912          result.  */
4913       gimple_seq stmts = NULL;
4914       val = gimple_convert (&stmts, scalar_type, val);
4915       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4916       scalar_results.safe_push (val);
4917     }
4918
4919   /* 2.3 Create the reduction code, using one of the three schemes described
4920          above. In SLP we simply need to extract all the elements from the
4921          vector (without reducing them), so we use scalar shifts.  */
4922   else if (reduc_code != ERROR_MARK && !slp_reduc)
4923     {
4924       tree tmp;
4925       tree vec_elem_type;
4926
4927       /* Case 1:  Create:
4928          v_out2 = reduc_expr <v_out1>  */
4929
4930       if (dump_enabled_p ())
4931         dump_printf_loc (MSG_NOTE, vect_location,
4932                          "Reduce using direct vector reduction.\n");
4933
4934       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4935       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4936         {
4937           tree tmp_dest =
4938               vect_create_destination_var (scalar_dest, vec_elem_type);
4939           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4940           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4941           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4942           gimple_assign_set_lhs (epilog_stmt, new_temp);
4943           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4944
4945           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4946         }
4947       else
4948         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4949
4950       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4951       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4952       gimple_assign_set_lhs (epilog_stmt, new_temp);
4953       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4954
4955       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4956           == INTEGER_INDUC_COND_REDUCTION)
4957         {
4958           /* Earlier we set the initial value to be zero.  Check the result
4959              and if it is zero then replace with the original initial
4960              value.  */
4961           tree zero = build_zero_cst (scalar_type);
4962           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4963
4964           tmp = make_ssa_name (new_scalar_dest);
4965           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4966                                              initial_def, new_temp);
4967           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4968           new_temp = tmp;
4969         }
4970
4971       scalar_results.safe_push (new_temp);
4972     }
4973   else
4974     {
4975       bool reduce_with_shift = have_whole_vector_shift (mode);
4976       int element_bitsize = tree_to_uhwi (bitsize);
4977       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4978       tree vec_temp;
4979
4980       /* COND reductions all do the final reduction with MAX_EXPR.  */
4981       if (code == COND_EXPR)
4982         code = MAX_EXPR;
4983
4984       /* Regardless of whether we have a whole vector shift, if we're
4985          emulating the operation via tree-vect-generic, we don't want
4986          to use it.  Only the first round of the reduction is likely
4987          to still be profitable via emulation.  */
4988       /* ??? It might be better to emit a reduction tree code here, so that
4989          tree-vect-generic can expand the first round via bit tricks.  */
4990       if (!VECTOR_MODE_P (mode))
4991         reduce_with_shift = false;
4992       else
4993         {
4994           optab optab = optab_for_tree_code (code, vectype, optab_default);
4995           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4996             reduce_with_shift = false;
4997         }
4998
4999       if (reduce_with_shift && !slp_reduc)
5000         {
5001           int nelements = vec_size_in_bits / element_bitsize;
5002           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
5003
5004           int elt_offset;
5005
5006           tree zero_vec = build_zero_cst (vectype);
5007           /* Case 2: Create:
5008              for (offset = nelements/2; offset >= 1; offset/=2)
5009                 {
5010                   Create:  va' = vec_shift <va, offset>
5011                   Create:  va = vop <va, va'>
5012                 }  */
5013
5014           tree rhs;
5015
5016           if (dump_enabled_p ())
5017             dump_printf_loc (MSG_NOTE, vect_location,
5018                              "Reduce using vector shifts\n");
5019
5020           vec_dest = vect_create_destination_var (scalar_dest, vectype);
5021           new_temp = new_phi_result;
5022           for (elt_offset = nelements / 2;
5023                elt_offset >= 1;
5024                elt_offset /= 2)
5025             {
5026               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
5027               tree mask = vect_gen_perm_mask_any (vectype, sel);
5028               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5029                                                  new_temp, zero_vec, mask);
5030               new_name = make_ssa_name (vec_dest, epilog_stmt);
5031               gimple_assign_set_lhs (epilog_stmt, new_name);
5032               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5033
5034               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5035                                                  new_temp);
5036               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5037               gimple_assign_set_lhs (epilog_stmt, new_temp);
5038               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5039             }
5040
5041           /* 2.4  Extract the final scalar result.  Create:
5042              s_out3 = extract_field <v_out2, bitpos>  */
5043
5044           if (dump_enabled_p ())
5045             dump_printf_loc (MSG_NOTE, vect_location,
5046                              "extract scalar result\n");
5047
5048           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5049                         bitsize, bitsize_zero_node);
5050           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5051           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5052           gimple_assign_set_lhs (epilog_stmt, new_temp);
5053           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5054           scalar_results.safe_push (new_temp);
5055         }
5056       else
5057         {
5058           /* Case 3: Create:
5059              s = extract_field <v_out2, 0>
5060              for (offset = element_size;
5061                   offset < vector_size;
5062                   offset += element_size;)
5063                {
5064                  Create:  s' = extract_field <v_out2, offset>
5065                  Create:  s = op <s, s'>  // For non SLP cases
5066                }  */
5067
5068           if (dump_enabled_p ())
5069             dump_printf_loc (MSG_NOTE, vect_location,
5070                              "Reduce using scalar code.\n");
5071
5072           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5073           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5074             {
5075               int bit_offset;
5076               if (gimple_code (new_phi) == GIMPLE_PHI)
5077                 vec_temp = PHI_RESULT (new_phi);
5078               else
5079                 vec_temp = gimple_assign_lhs (new_phi);
5080               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5081                             bitsize_zero_node);
5082               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5083               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5084               gimple_assign_set_lhs (epilog_stmt, new_temp);
5085               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5086
5087               /* In SLP we don't need to apply reduction operation, so we just
5088                  collect s' values in SCALAR_RESULTS.  */
5089               if (slp_reduc)
5090                 scalar_results.safe_push (new_temp);
5091
5092               for (bit_offset = element_bitsize;
5093                    bit_offset < vec_size_in_bits;
5094                    bit_offset += element_bitsize)
5095                 {
5096                   tree bitpos = bitsize_int (bit_offset);
5097                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5098                                      bitsize, bitpos);
5099
5100                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5101                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5102                   gimple_assign_set_lhs (epilog_stmt, new_name);
5103                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5104
5105                   if (slp_reduc)
5106                     {
5107                       /* In SLP we don't need to apply reduction operation, so
5108                          we just collect s' values in SCALAR_RESULTS.  */
5109                       new_temp = new_name;
5110                       scalar_results.safe_push (new_name);
5111                     }
5112                   else
5113                     {
5114                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5115                                                          new_name, new_temp);
5116                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5117                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5118                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5119                     }
5120                 }
5121             }
5122
5123           /* The only case where we need to reduce scalar results in SLP, is
5124              unrolling.  If the size of SCALAR_RESULTS is greater than
5125              GROUP_SIZE, we reduce them combining elements modulo
5126              GROUP_SIZE.  */
5127           if (slp_reduc)
5128             {
5129               tree res, first_res, new_res;
5130               gimple *new_stmt;
5131
5132               /* Reduce multiple scalar results in case of SLP unrolling.  */
5133               for (j = group_size; scalar_results.iterate (j, &res);
5134                    j++)
5135                 {
5136                   first_res = scalar_results[j % group_size];
5137                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5138                                                   first_res, res);
5139                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5140                   gimple_assign_set_lhs (new_stmt, new_res);
5141                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5142                   scalar_results[j % group_size] = new_res;
5143                 }
5144             }
5145           else
5146             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5147             scalar_results.safe_push (new_temp);
5148         }
5149
5150       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5151           == INTEGER_INDUC_COND_REDUCTION)
5152         {
5153           /* Earlier we set the initial value to be zero.  Check the result
5154              and if it is zero then replace with the original initial
5155              value.  */
5156           tree zero = build_zero_cst (scalar_type);
5157           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
5158
5159           tree tmp = make_ssa_name (new_scalar_dest);
5160           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5161                                              initial_def, new_temp);
5162           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5163           scalar_results[0] = tmp;
5164         }
5165     }
5166
5167 vect_finalize_reduction:
5168
5169   if (double_reduc)
5170     loop = loop->inner;
5171
5172   /* 2.5 Adjust the final result by the initial value of the reduction
5173          variable. (When such adjustment is not needed, then
5174          'adjustment_def' is zero).  For example, if code is PLUS we create:
5175          new_temp = loop_exit_def + adjustment_def  */
5176
5177   if (adjustment_def)
5178     {
5179       gcc_assert (!slp_reduc);
5180       if (nested_in_vect_loop)
5181         {
5182           new_phi = new_phis[0];
5183           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5184           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5185           new_dest = vect_create_destination_var (scalar_dest, vectype);
5186         }
5187       else
5188         {
5189           new_temp = scalar_results[0];
5190           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5191           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5192           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5193         }
5194
5195       epilog_stmt = gimple_build_assign (new_dest, expr);
5196       new_temp = make_ssa_name (new_dest, epilog_stmt);
5197       gimple_assign_set_lhs (epilog_stmt, new_temp);
5198       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5199       if (nested_in_vect_loop)
5200         {
5201           set_vinfo_for_stmt (epilog_stmt,
5202                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5203           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5204                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5205
5206           if (!double_reduc)
5207             scalar_results.quick_push (new_temp);
5208           else
5209             scalar_results[0] = new_temp;
5210         }
5211       else
5212         scalar_results[0] = new_temp;
5213
5214       new_phis[0] = epilog_stmt;
5215     }
5216
5217   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5218           phis with new adjusted scalar results, i.e., replace use <s_out0>
5219           with use <s_out4>.
5220
5221      Transform:
5222         loop_exit:
5223           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5224           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5225           v_out2 = reduce <v_out1>
5226           s_out3 = extract_field <v_out2, 0>
5227           s_out4 = adjust_result <s_out3>
5228           use <s_out0>
5229           use <s_out0>
5230
5231      into:
5232
5233         loop_exit:
5234           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5235           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5236           v_out2 = reduce <v_out1>
5237           s_out3 = extract_field <v_out2, 0>
5238           s_out4 = adjust_result <s_out3>
5239           use <s_out4>
5240           use <s_out4> */
5241
5242
5243   /* In SLP reduction chain we reduce vector results into one vector if
5244      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5245      the last stmt in the reduction chain, since we are looking for the loop
5246      exit phi node.  */
5247   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5248     {
5249       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5250       /* Handle reduction patterns.  */
5251       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5252         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5253
5254       scalar_dest = gimple_assign_lhs (dest_stmt);
5255       group_size = 1;
5256     }
5257
5258   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5259      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5260      need to match SCALAR_RESULTS with corresponding statements.  The first
5261      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5262      the first vector stmt, etc.
5263      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5264   if (group_size > new_phis.length ())
5265     {
5266       ratio = group_size / new_phis.length ();
5267       gcc_assert (!(group_size % new_phis.length ()));
5268     }
5269   else
5270     ratio = 1;
5271
5272   for (k = 0; k < group_size; k++)
5273     {
5274       if (k % ratio == 0)
5275         {
5276           epilog_stmt = new_phis[k / ratio];
5277           reduction_phi = reduction_phis[k / ratio];
5278           if (double_reduc)
5279             inner_phi = inner_phis[k / ratio];
5280         }
5281
5282       if (slp_reduc)
5283         {
5284           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5285
5286           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5287           /* SLP statements can't participate in patterns.  */
5288           gcc_assert (!orig_stmt);
5289           scalar_dest = gimple_assign_lhs (current_stmt);
5290         }
5291
5292       phis.create (3);
5293       /* Find the loop-closed-use at the loop exit of the original scalar
5294          result.  (The reduction result is expected to have two immediate uses -
5295          one at the latch block, and one at the loop exit).  */
5296       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5297         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5298             && !is_gimple_debug (USE_STMT (use_p)))
5299           phis.safe_push (USE_STMT (use_p));
5300
5301       /* While we expect to have found an exit_phi because of loop-closed-ssa
5302          form we can end up without one if the scalar cycle is dead.  */
5303
5304       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5305         {
5306           if (outer_loop)
5307             {
5308               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5309               gphi *vect_phi;
5310
5311               /* FORNOW. Currently not supporting the case that an inner-loop
5312                  reduction is not used in the outer-loop (but only outside the
5313                  outer-loop), unless it is double reduction.  */
5314               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5315                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5316                           || double_reduc);
5317
5318               if (double_reduc)
5319                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5320               else
5321                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5322               if (!double_reduc
5323                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5324                       != vect_double_reduction_def)
5325                 continue;
5326
5327               /* Handle double reduction:
5328
5329                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5330                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5331                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5332                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5333
5334                  At that point the regular reduction (stmt2 and stmt3) is
5335                  already vectorized, as well as the exit phi node, stmt4.
5336                  Here we vectorize the phi node of double reduction, stmt1, and
5337                  update all relevant statements.  */
5338
5339               /* Go through all the uses of s2 to find double reduction phi
5340                  node, i.e., stmt1 above.  */
5341               orig_name = PHI_RESULT (exit_phi);
5342               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5343                 {
5344                   stmt_vec_info use_stmt_vinfo;
5345                   stmt_vec_info new_phi_vinfo;
5346                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5347                   basic_block bb = gimple_bb (use_stmt);
5348                   gimple *use;
5349
5350                   /* Check that USE_STMT is really double reduction phi
5351                      node.  */
5352                   if (gimple_code (use_stmt) != GIMPLE_PHI
5353                       || gimple_phi_num_args (use_stmt) != 2
5354                       || bb->loop_father != outer_loop)
5355                     continue;
5356                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5357                   if (!use_stmt_vinfo
5358                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5359                           != vect_double_reduction_def)
5360                     continue;
5361
5362                   /* Create vector phi node for double reduction:
5363                      vs1 = phi <vs0, vs2>
5364                      vs1 was created previously in this function by a call to
5365                        vect_get_vec_def_for_operand and is stored in
5366                        vec_initial_def;
5367                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5368                      vs0 is created here.  */
5369
5370                   /* Create vector phi node.  */
5371                   vect_phi = create_phi_node (vec_initial_def, bb);
5372                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5373                                     loop_vec_info_for_loop (outer_loop));
5374                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5375
5376                   /* Create vs0 - initial def of the double reduction phi.  */
5377                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5378                                              loop_preheader_edge (outer_loop));
5379                   init_def = get_initial_def_for_reduction (stmt,
5380                                                           preheader_arg, NULL);
5381                   vect_phi_init = vect_init_vector (use_stmt, init_def,
5382                                                     vectype, NULL);
5383
5384                   /* Update phi node arguments with vs0 and vs2.  */
5385                   add_phi_arg (vect_phi, vect_phi_init,
5386                                loop_preheader_edge (outer_loop),
5387                                UNKNOWN_LOCATION);
5388                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5389                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5390                   if (dump_enabled_p ())
5391                     {
5392                       dump_printf_loc (MSG_NOTE, vect_location,
5393                                        "created double reduction phi node: ");
5394                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5395                     }
5396
5397                   vect_phi_res = PHI_RESULT (vect_phi);
5398
5399                   /* Replace the use, i.e., set the correct vs1 in the regular
5400                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5401                      loop is redundant.  */
5402                   use = reduction_phi;
5403                   for (j = 0; j < ncopies; j++)
5404                     {
5405                       edge pr_edge = loop_preheader_edge (loop);
5406                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5407                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5408                     }
5409                 }
5410             }
5411         }
5412
5413       phis.release ();
5414       if (nested_in_vect_loop)
5415         {
5416           if (double_reduc)
5417             loop = outer_loop;
5418           else
5419             continue;
5420         }
5421
5422       phis.create (3);
5423       /* Find the loop-closed-use at the loop exit of the original scalar
5424          result.  (The reduction result is expected to have two immediate uses,
5425          one at the latch block, and one at the loop exit).  For double
5426          reductions we are looking for exit phis of the outer loop.  */
5427       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5428         {
5429           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5430             {
5431               if (!is_gimple_debug (USE_STMT (use_p)))
5432                 phis.safe_push (USE_STMT (use_p));
5433             }
5434           else
5435             {
5436               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5437                 {
5438                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5439
5440                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5441                     {
5442                       if (!flow_bb_inside_loop_p (loop,
5443                                              gimple_bb (USE_STMT (phi_use_p)))
5444                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5445                         phis.safe_push (USE_STMT (phi_use_p));
5446                     }
5447                 }
5448             }
5449         }
5450
5451       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5452         {
5453           /* Replace the uses:  */
5454           orig_name = PHI_RESULT (exit_phi);
5455           scalar_result = scalar_results[k];
5456           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5457             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5458               SET_USE (use_p, scalar_result);
5459         }
5460
5461       phis.release ();
5462     }
5463 }
5464
5465
5466 /* Function is_nonwrapping_integer_induction.
5467    Return true if STMT, an induction PHI in loop LOOP, has an INTEGER_CST
5468    base and a non-negative INTEGER_CST step and its value is not expected to
5469    wrap in the type of the PHI result; return false otherwise.  */
5470
5471 static bool
5472 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5473 {
5474   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5475   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5476   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5477   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5478   widest_int ni, max_loop_value;
5479   bool overflow = false;
5480
5481   /* Make sure the induction has a constant integer base and step.  */
5482   if (TREE_CODE (base) != INTEGER_CST
5483       || TREE_CODE (step) != INTEGER_CST)
5484     return false;
5485
5486   /* Check that the induction does not decrement (STEP is not negative).  */
5487   if (tree_int_cst_sgn (step) == -1)
5488     return false;
5489
5490   /* Check that the max size of the loop will not wrap.  */
5491   /* If overflow is undefined for LHS_TYPE, no bound is computed at all.  */
5492   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5493     return true;
5494   /* Give up when no upper bound on the executions of LOOP is available.  */
5495   if (! max_stmt_executions (loop, &ni))
5496     return false;
5497   /* STEP * NI, computed as widest_int with the signedness of LHS_TYPE.  */
5498   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5499                             &overflow);
5500   if (overflow)
5501     return false;
5502   /* BASE + STEP * NI, the largest value considered for the induction.  */
5503   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5504                             TYPE_SIGN (lhs_type), &overflow);
5505   if (overflow)
5506     return false;
5507   /* That bound must be representable in the precision of LHS_TYPE.  */
5508   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5509           <= TYPE_PRECISION (lhs_type));
5510 }
5511
5512 /* Function vectorizable_reduction.
5513
5514    Check if STMT performs a reduction operation that can be vectorized.
5515    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5516    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5517    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5518
5519    This function also handles reduction idioms (patterns) that have been
5520    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5521    of this form:
5522      X = pattern_expr (arg0, arg1, ..., X)
5523    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5524    sequence that had been detected and replaced by the pattern-stmt (STMT).
5525
5526    This function also handles reduction of condition expressions, for example:
5527      for (int i = 0; i < N; i++)
5528        if (a[i] < value)
5529          last = a[i];
5530    This is handled by vectorising the loop and creating an additional vector
5531    containing the loop indexes for which "a[i] < value" was true.  In the
5532    function epilogue this is reduced to a single max value and then used to
5533    index into the vector of results.
5534
5535    In some cases of reduction patterns, the type of the reduction variable X is
5536    different than the type of the other arguments of STMT.
5537    In such cases, the vectype that is used when transforming STMT into a vector
5538    stmt is different than the vectype that is used to determine the
5539    vectorization factor, because it consists of a different number of elements
5540    than the actual number of elements that are being operated upon in parallel.
5541
5542    For example, consider an accumulation of shorts into an int accumulator.
5543    On some targets it's possible to vectorize this pattern operating on 8
5544    shorts at a time (hence, the vectype for purposes of determining the
5545    vectorization factor should be V8HI); on the other hand, the vectype that
5546    is used to create the vector form is actually V4SI (the type of the result).
5547
5548    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5549    indicates what is the actual level of parallelism (V8HI in the example), so
5550    that the right vectorization factor would be derived.  This vectype
5551    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5552    be used to create the vectorized stmt.  The right vectype for the vectorized
5553    stmt is obtained from the type of the result X:
5554         get_vectype_for_scalar_type (TREE_TYPE (X))
5555
5556    This means that, contrary to "regular" reductions (or "regular" stmts in
5557    general), the following equation:
5558       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5559    does *NOT* necessarily hold for reduction patterns.  */
5560
5561 bool
5562 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5563                         gimple **vec_stmt, slp_tree slp_node)
5564 {
5565   tree vec_dest;
5566   tree scalar_dest;
5567   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5568   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5569   tree vectype_in = NULL_TREE;
5570   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5571   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5572   enum tree_code code, orig_code, epilog_reduc_code;
5573   machine_mode vec_mode;
5574   int op_type;
5575   optab optab, reduc_optab;
5576   tree new_temp = NULL_TREE;
5577   gimple *def_stmt;
5578   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5579   gphi *new_phi = NULL;
5580   tree scalar_type;
5581   bool is_simple_use;
5582   gimple *orig_stmt;
5583   stmt_vec_info orig_stmt_info;
5584   int i;
5585   int ncopies;
5586   int epilog_copies;
5587   stmt_vec_info prev_stmt_info, prev_phi_info;
5588   bool single_defuse_cycle = false;
5589   tree reduc_def = NULL_TREE;
5590   gimple *new_stmt = NULL;
5591   int j;
5592   tree ops[3];
5593   enum vect_def_type dts[3];
5594   bool nested_cycle = false, found_nested_cycle_def = false;
5595   gimple *reduc_def_stmt = NULL;
5596   bool double_reduc = false;
5597   basic_block def_bb;
5598   struct loop * def_stmt_loop, *outer_loop = NULL;
5599   tree def_arg;
5600   gimple *def_arg_stmt;
5601   auto_vec<tree> vec_oprnds0;
5602   auto_vec<tree> vec_oprnds1;
5603   auto_vec<tree> vect_defs;
5604   auto_vec<gimple *> phis;
5605   int vec_num;
5606   tree def0, tem;
5607   bool first_p = true;
5608   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5609   tree cond_reduc_val = NULL_TREE;
5610
5611   /* Make sure it was already recognized as a reduction computation.  */
5612   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5613       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5614     return false;
5615
5616   if (nested_in_vect_loop_p (loop, stmt))
5617     {
5618       outer_loop = loop;
5619       loop = loop->inner;
5620       nested_cycle = true;
5621     }
5622
5623   /* In case of reduction chain we switch to the first stmt in the chain, but
5624      we don't update STMT_INFO, since only the last stmt is marked as reduction
5625      and has reduction properties.  */
5626   if (GROUP_FIRST_ELEMENT (stmt_info)
5627       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5628     {
5629       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5630       first_p = false;
5631     }
5632
5633   if (gimple_code (stmt) == GIMPLE_PHI)
5634     {
5635       /* Analysis is fully done on the reduction stmt invocation.  */
5636       if (! vec_stmt)
5637         {
5638           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5639           return true;
5640         }
5641
5642       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5643       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5644         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5645       if (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt)) <= vect_used_only_live)
5646         single_defuse_cycle = true;
5647
5648       gcc_assert (is_gimple_assign (reduc_stmt));
5649       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5650         {
5651           tree op = gimple_op (reduc_stmt, k);
5652           if (op == gimple_phi_result (stmt))
5653             continue;
5654           if (k == 1
5655               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5656             continue;
5657           vectype_in = get_vectype_for_scalar_type (TREE_TYPE (op));
5658           break;
5659         }
5660       gcc_assert (vectype_in);
5661
5662       if (slp_node)
5663         ncopies = 1;
5664       else
5665         ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5666                    / TYPE_VECTOR_SUBPARTS (vectype_in));
5667
5668       /* Create the destination vector  */
5669       scalar_dest = gimple_assign_lhs (reduc_stmt);
5670       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5671
5672       if (slp_node)
5673         /* The size vect_schedule_slp_instance computes is off for us.  */
5674         vec_num = ((LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5675                     * SLP_TREE_SCALAR_STMTS (slp_node).length ())
5676                    / TYPE_VECTOR_SUBPARTS (vectype_in));
5677       else
5678         vec_num = 1;
5679
5680       /* Generate the reduction PHIs upfront.  */
5681       prev_phi_info = NULL;
5682       for (j = 0; j < ncopies; j++)
5683         {
5684           if (j == 0 || !single_defuse_cycle)
5685             {
5686               for (i = 0; i < vec_num; i++)
5687                 {
5688                   /* Create the reduction-phi that defines the reduction
5689                      operand.  */
5690                   new_phi = create_phi_node (vec_dest, loop->header);
5691                   set_vinfo_for_stmt (new_phi,
5692                                       new_stmt_vec_info (new_phi, loop_vinfo));
5693
5694                   if (slp_node)
5695                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5696                   else
5697                     {
5698                       if (j == 0)
5699                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5700                       else
5701                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5702                       prev_phi_info = vinfo_for_stmt (new_phi);
5703                     }
5704                 }
5705             }
5706         }
5707
5708       return true;
5709     }
5710
5711   /* 1. Is vectorizable reduction?  */
5712   /* Not supportable if the reduction variable is used in the loop, unless
5713      it's a reduction chain.  */
5714   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5715       && !GROUP_FIRST_ELEMENT (stmt_info))
5716     return false;
5717
5718   /* Reductions that are not used even in an enclosing outer-loop,
5719      are expected to be "live" (used out of the loop).  */
5720   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5721       && !STMT_VINFO_LIVE_P (stmt_info))
5722     return false;
5723
5724   /* 2. Has this been recognized as a reduction pattern?
5725
5726      Check if STMT represents a pattern that has been recognized
5727      in earlier analysis stages.  For stmts that represent a pattern,
5728      the STMT_VINFO_RELATED_STMT field records the last stmt in
5729      the original sequence that constitutes the pattern.  */
5730
5731   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5732   if (orig_stmt)
5733     {
5734       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5735       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5736       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5737     }
5738
5739   /* 3. Check the operands of the operation.  The first operands are defined
5740         inside the loop body. The last operand is the reduction variable,
5741         which is defined by the loop-header-phi.  */
5742
5743   gcc_assert (is_gimple_assign (stmt));
5744
5745   /* Flatten RHS.  */
5746   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5747     {
5748     case GIMPLE_BINARY_RHS:
5749       code = gimple_assign_rhs_code (stmt);
5750       op_type = TREE_CODE_LENGTH (code);
5751       gcc_assert (op_type == binary_op);
5752       ops[0] = gimple_assign_rhs1 (stmt);
5753       ops[1] = gimple_assign_rhs2 (stmt);
5754       break;
5755
5756     case GIMPLE_TERNARY_RHS:
5757       code = gimple_assign_rhs_code (stmt);
5758       op_type = TREE_CODE_LENGTH (code);
5759       gcc_assert (op_type == ternary_op);
5760       ops[0] = gimple_assign_rhs1 (stmt);
5761       ops[1] = gimple_assign_rhs2 (stmt);
5762       ops[2] = gimple_assign_rhs3 (stmt);
5763       break;
5764
5765     case GIMPLE_UNARY_RHS:
5766       return false;
5767
5768     default:
5769       gcc_unreachable ();
5770     }
5771   /* The default is that the reduction variable is the last in statement.  */
5772   int reduc_index = op_type - 1;
5773   if (code == MINUS_EXPR)
5774     reduc_index = 0;
5775
5776   if (code == COND_EXPR && slp_node)
5777     return false;
5778
5779   scalar_dest = gimple_assign_lhs (stmt);
5780   scalar_type = TREE_TYPE (scalar_dest);
5781   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5782       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5783     return false;
5784
5785   /* Do not try to vectorize bit-precision reductions.  */
5786   if ((TYPE_PRECISION (scalar_type)
5787        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5788     return false;
5789
5790   /* All uses but the last are expected to be defined in the loop.
5791      The last use is the reduction variable.  In case of nested cycle this
5792      assumption is not true: we use reduc_index to record the index of the
5793      reduction variable.  */
5794   for (i = 0; i < op_type; i++)
5795     {
5796       if (i == reduc_index)
5797         continue;
5798
5799       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5800       if (i == 0 && code == COND_EXPR)
5801         continue;
5802
5803       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5804                                           &def_stmt, &dts[i], &tem);
5805       if (!vectype_in)
5806         vectype_in = tem;
5807       gcc_assert (is_simple_use);
5808
5809       dt = dts[i];
5810       if (dt != vect_internal_def
5811           && dt != vect_external_def
5812           && dt != vect_constant_def
5813           && dt != vect_induction_def
5814           && !(dt == vect_nested_cycle && nested_cycle))
5815         return false;
5816
5817       if (dt == vect_nested_cycle)
5818         {
5819           found_nested_cycle_def = true;
5820           reduc_def_stmt = def_stmt;
5821           reduc_index = i;
5822         }
5823
5824       if (i == 1 && code == COND_EXPR)
5825         {
5826           /* Record how value of COND_EXPR is defined.  */
5827           if (dt == vect_constant_def)
5828             {
5829               cond_reduc_dt = dt;
5830               cond_reduc_val = ops[i];
5831             }
5832           if (dt == vect_induction_def && def_stmt != NULL
5833               && is_nonwrapping_integer_induction (def_stmt, loop))
5834             cond_reduc_dt = dt;
5835         }
5836     }
5837
5838   is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5839                                       &def_stmt, &dts[reduc_index], &tem);
5840   if (!vectype_in)
5841     vectype_in = tem;
5842   gcc_assert (is_simple_use);
5843   if (!found_nested_cycle_def)
5844     reduc_def_stmt = def_stmt;
5845
5846   if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5847     return false;
5848
5849   dt = dts[reduc_index];
5850   if (!(dt == vect_reduction_def
5851         || dt == vect_nested_cycle
5852         || ((dt == vect_internal_def || dt == vect_external_def
5853              || dt == vect_constant_def || dt == vect_induction_def)
5854             && nested_cycle && found_nested_cycle_def)))
5855     {
5856       /* For pattern recognized stmts, orig_stmt might be a reduction,
5857          but some helper statements for the pattern might not, or
5858          might be COND_EXPRs with reduction uses in the condition.  */
5859       gcc_assert (orig_stmt);
5860       return false;
5861     }
5862
5863   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5864   enum vect_reduction_type v_reduc_type
5865     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5866   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5867
5868   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5869   /* If we have a condition reduction, see if we can simplify it further.  */
5870   if (v_reduc_type == COND_REDUCTION)
5871     {
5872       if (cond_reduc_dt == vect_induction_def)
5873         {
5874           if (dump_enabled_p ())
5875             dump_printf_loc (MSG_NOTE, vect_location,
5876                              "condition expression based on "
5877                              "integer induction.\n");
5878           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5879             = INTEGER_INDUC_COND_REDUCTION;
5880         }
5881
5882       /* Loop peeling modifies initial value of reduction PHI, which
5883          makes the reduction stmt to be transformed different to the
5884          original stmt analyzed.  We need to record reduction code for
5885          CONST_COND_REDUCTION type reduction at analyzing stage, thus
5886          it can be used directly at transform stage.  */
5887       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
5888           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
5889         {
5890           /* Also set the reduction type to CONST_COND_REDUCTION.  */
5891           gcc_assert (cond_reduc_dt == vect_constant_def);
5892           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
5893         }
5894       else if (cond_reduc_dt == vect_constant_def)
5895         {
5896           enum vect_def_type cond_initial_dt;
5897           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5898           tree cond_initial_val
5899             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
5900
5901           gcc_assert (cond_reduc_val != NULL_TREE);
5902           vect_is_simple_use (cond_initial_val, loop_vinfo,
5903                               &def_stmt, &cond_initial_dt);
5904           if (cond_initial_dt == vect_constant_def
5905               && types_compatible_p (TREE_TYPE (cond_initial_val),
5906                                      TREE_TYPE (cond_reduc_val)))
5907             {
5908               tree e = fold_binary (LE_EXPR, boolean_type_node,
5909                                     cond_initial_val, cond_reduc_val);
5910               if (e && (integer_onep (e) || integer_zerop (e)))
5911                 {
5912                   if (dump_enabled_p ())
5913                     dump_printf_loc (MSG_NOTE, vect_location,
5914                                      "condition expression based on "
5915                                      "compile time constant.\n");
5916                   /* Record reduction code at analysis stage.  */
5917                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
5918                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
5919                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5920                     = CONST_COND_REDUCTION;
5921                 }
5922             }
5923         }
5924     }
5925
5926   if (orig_stmt)
5927     gcc_assert (tmp == orig_stmt
5928                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5929   else
5930     /* We changed STMT to be the first stmt in reduction chain, hence we
5931        check that in this case the first element in the chain is STMT.  */
5932     gcc_assert (stmt == tmp
5933                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5934
5935   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5936     return false;
5937
5938   if (slp_node)
5939     ncopies = 1;
5940   else
5941     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5942                / TYPE_VECTOR_SUBPARTS (vectype_in));
5943
5944   gcc_assert (ncopies >= 1);
5945
5946   vec_mode = TYPE_MODE (vectype_in);
5947
5948   if (code == COND_EXPR)
5949     {
5950       /* Only call during the analysis stage, otherwise we'll lose
5951          STMT_VINFO_TYPE.  */
5952       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5953                                                 ops[reduc_index], 0, NULL))
5954         {
5955           if (dump_enabled_p ())
5956             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5957                              "unsupported condition in reduction\n");
5958           return false;
5959         }
5960     }
5961   else
5962     {
5963       /* 4. Supportable by target?  */
5964
5965       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5966           || code == LROTATE_EXPR || code == RROTATE_EXPR)
5967         {
5968           /* Shifts and rotates are only supported by vectorizable_shifts,
5969              not vectorizable_reduction.  */
5970           if (dump_enabled_p ())
5971             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5972                              "unsupported shift or rotation.\n");
5973           return false;
5974         }
5975
5976       /* 4.1. check support for the operation in the loop  */
5977       optab = optab_for_tree_code (code, vectype_in, optab_default);
5978       if (!optab)
5979         {
5980           if (dump_enabled_p ())
5981             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5982                              "no optab.\n");
5983
5984           return false;
5985         }
5986
5987       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5988         {
5989           if (dump_enabled_p ())
5990             dump_printf (MSG_NOTE, "op not supported by target.\n");
5991
5992           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5993               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5994                   < vect_min_worthwhile_factor (code))
5995             return false;
5996
5997           if (dump_enabled_p ())
5998             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5999         }
6000
6001       /* Worthwhile without SIMD support?  */
6002       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6003           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6004              < vect_min_worthwhile_factor (code))
6005         {
6006           if (dump_enabled_p ())
6007             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6008                              "not worthwhile without SIMD support.\n");
6009
6010           return false;
6011         }
6012     }
6013
6014   /* 4.2. Check support for the epilog operation.
6015
6016           If STMT represents a reduction pattern, then the type of the
6017           reduction variable may be different than the type of the rest
6018           of the arguments.  For example, consider the case of accumulation
6019           of shorts into an int accumulator; The original code:
6020                         S1: int_a = (int) short_a;
6021           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6022
6023           was replaced with:
6024                         STMT: int_acc = widen_sum <short_a, int_acc>
6025
6026           This means that:
6027           1. The tree-code that is used to create the vector operation in the
6028              epilog code (that reduces the partial results) is not the
6029              tree-code of STMT, but is rather the tree-code of the original
6030              stmt from the pattern that STMT is replacing.  I.e, in the example
6031              above we want to use 'widen_sum' in the loop, but 'plus' in the
6032              epilog.
6033           2. The type (mode) we use to check available target support
6034              for the vector operation to be created in the *epilog*, is
6035              determined by the type of the reduction variable (in the example
6036              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6037              However the type (mode) we use to check available target support
6038              for the vector operation to be created *inside the loop*, is
6039              determined by the type of the other arguments to STMT (in the
6040              example we'd check this: optab_handler (widen_sum_optab,
6041              vect_short_mode)).
6042
6043           This is contrary to "regular" reductions, in which the types of all
6044           the arguments are the same as the type of the reduction variable.
6045           For "regular" reductions we can therefore use the same vector type
6046           (and also the same tree-code) when generating the epilog code and
6047           when generating the code inside the loop.  */
6048
6049   if (orig_stmt)
6050     {
6051       /* This is a reduction pattern: get the vectype from the type of the
6052          reduction variable, and get the tree-code from orig_stmt.  */
6053       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6054                   == TREE_CODE_REDUCTION);
6055       orig_code = gimple_assign_rhs_code (orig_stmt);
6056       gcc_assert (vectype_out);
6057       vec_mode = TYPE_MODE (vectype_out);
6058     }
6059   else
6060     {
6061       /* Regular reduction: the same vectype and tree-code as used for
6062          the vector code inside the loop can be used for the epilog code.  */
6063       orig_code = code;
6064
6065       if (code == MINUS_EXPR)
6066         orig_code = PLUS_EXPR;
6067
6068       /* For simple condition reductions, replace with the actual expression
6069          we want to base our reduction around.  */
6070       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6071         {
6072           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6073           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6074         }
6075       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6076                  == INTEGER_INDUC_COND_REDUCTION)
6077         orig_code = MAX_EXPR;
6078     }
6079
6080   if (nested_cycle)
6081     {
6082       def_bb = gimple_bb (reduc_def_stmt);
6083       def_stmt_loop = def_bb->loop_father;
6084       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6085                                        loop_preheader_edge (def_stmt_loop));
6086       if (TREE_CODE (def_arg) == SSA_NAME
6087           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6088           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6089           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6090           && vinfo_for_stmt (def_arg_stmt)
6091           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6092               == vect_double_reduction_def)
6093         double_reduc = true;
6094     }
6095
6096   epilog_reduc_code = ERROR_MARK;
6097
6098   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6099     {
6100       if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
6101         {
6102           reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
6103                                          optab_default);
6104           if (!reduc_optab)
6105             {
6106               if (dump_enabled_p ())
6107                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6108                                  "no optab for reduction.\n");
6109
6110               epilog_reduc_code = ERROR_MARK;
6111             }
6112           else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
6113             {
6114               if (dump_enabled_p ())
6115                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6116                                  "reduc op not supported by target.\n");
6117
6118               epilog_reduc_code = ERROR_MARK;
6119             }
6120         }
6121       else
6122         {
6123           if (!nested_cycle || double_reduc)
6124             {
6125               if (dump_enabled_p ())
6126                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6127                                  "no reduc code for scalar code.\n");
6128
6129               return false;
6130             }
6131         }
6132     }
6133   else
6134     {
6135       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
6136       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6137       cr_index_vector_type = build_vector_type
6138         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
6139
6140       optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
6141                                    optab_default);
6142       if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
6143           != CODE_FOR_nothing)
6144         epilog_reduc_code = REDUC_MAX_EXPR;
6145     }
6146
6147   if ((double_reduc
6148        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6149       && ncopies > 1)
6150     {
6151       if (dump_enabled_p ())
6152         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6153                          "multiple types in double reduction or condition "
6154                          "reduction.\n");
6155       return false;
6156     }
6157
6158   /* In case of widening multiplication by a constant, we update the type
6159      of the constant to be the type of the other operand.  We check that the
6160      constant fits the type in the pattern recognition pass.  */
6161   if (code == DOT_PROD_EXPR
6162       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6163     {
6164       if (TREE_CODE (ops[0]) == INTEGER_CST)
6165         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6166       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6167         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6168       else
6169         {
6170           if (dump_enabled_p ())
6171             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6172                              "invalid types in dot-prod\n");
6173
6174           return false;
6175         }
6176     }
6177
6178   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6179     {
6180       widest_int ni;
6181
6182       if (! max_loop_iterations (loop, &ni))
6183         {
6184           if (dump_enabled_p ())
6185             dump_printf_loc (MSG_NOTE, vect_location,
6186                              "loop count not known, cannot create cond "
6187                              "reduction.\n");
6188           return false;
6189         }
6190       /* Convert backedges to iterations.  */
6191       ni += 1;
6192
6193       /* The additional index will be the same type as the condition.  Check
6194          that the loop can fit into this less one (because we'll use up the
6195          zero slot for when there are no matches).  */
6196       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6197       if (wi::geu_p (ni, wi::to_widest (max_index)))
6198         {
6199           if (dump_enabled_p ())
6200             dump_printf_loc (MSG_NOTE, vect_location,
6201                              "loop size is greater than data size.\n");
6202           return false;
6203         }
6204     }
6205
6206   if (!vec_stmt) /* transformation not required.  */
6207     {
6208       if (first_p)
6209         vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
6210       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6211       return true;
6212     }
6213
6214   /* Transform.  */
6215
6216   if (dump_enabled_p ())
6217     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6218
6219   /* FORNOW: Multiple types are not supported for condition.  */
6220   if (code == COND_EXPR)
6221     gcc_assert (ncopies == 1);
6222
6223   /* Create the destination vector  */
6224   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6225
6226   /* In case the vectorization factor (VF) is bigger than the number
6227      of elements that we can fit in a vectype (nunits), we have to generate
6228      more than one vector stmt - i.e - we need to "unroll" the
6229      vector stmt by a factor VF/nunits.  For more details see documentation
6230      in vectorizable_operation.  */
6231
6232   /* If the reduction is used in an outer loop we need to generate
6233      VF intermediate results, like so (e.g. for ncopies=2):
6234         r0 = phi (init, r0)
6235         r1 = phi (init, r1)
6236         r0 = x0 + r0;
6237         r1 = x1 + r1;
6238     (i.e. we generate VF results in 2 registers).
6239     In this case we have a separate def-use cycle for each copy, and therefore
6240     for each copy we get the vector def for the reduction variable from the
6241     respective phi node created for this copy.
6242
6243     Otherwise (the reduction is unused in the loop nest), we can combine
6244     together intermediate results, like so (e.g. for ncopies=2):
6245         r = phi (init, r)
6246         r = x0 + r;
6247         r = x1 + r;
6248    (i.e. we generate VF/2 results in a single register).
6249    In this case for each copy we get the vector def for the reduction variable
6250    from the vectorized reduction operation generated in the previous iteration.
6251   */
6252
6253   if (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6254     {
6255       single_defuse_cycle = true;
6256       epilog_copies = 1;
6257     }
6258   else
6259     epilog_copies = ncopies;
6260
6261   prev_stmt_info = NULL;
6262   prev_phi_info = NULL;
6263   if (slp_node)
6264     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6265   else
6266     {
6267       vec_num = 1;
6268       vec_oprnds0.create (1);
6269       if (op_type == ternary_op)
6270         vec_oprnds1.create (1);
6271     }
6272
6273   phis.create (vec_num);
6274   vect_defs.create (vec_num);
6275   if (!slp_node)
6276     vect_defs.quick_push (NULL_TREE);
6277
6278   auto_vec<tree> vec_oprnds;
6279   for (j = 0; j < ncopies; j++)
6280     {
6281       if (j == 0 || !single_defuse_cycle)
6282         {
6283           for (i = 0; i < vec_num; i++)
6284             {
6285               /* Get the created reduction-phi that defines the reduction
6286                  operand.  */
6287               tree reduc_def = gimple_phi_result (reduc_def_stmt);
6288               if (j == 0)
6289                 vect_get_vec_defs (reduc_def, NULL, stmt, &vec_oprnds, NULL,
6290                                    slp_node);
6291               else
6292                 {
6293                   dt = vect_reduction_def;
6294                   vect_get_vec_defs_for_stmt_copy (&dt,
6295                                                    &vec_oprnds, NULL);
6296                 }
6297               new_phi = as_a <gphi *> (SSA_NAME_DEF_STMT (vec_oprnds[i]));
6298               if (j == 0 || slp_node)
6299                 phis.quick_push (new_phi);
6300             }
6301         }
6302
6303       if (code == COND_EXPR)
6304         {
6305           gcc_assert (!slp_node);
6306           vectorizable_condition (stmt, gsi, vec_stmt,
6307                                   PHI_RESULT (phis[0]),
6308                                   reduc_index, NULL);
6309           /* Multiple types are not supported for condition.  */
6310           break;
6311         }
6312
6313       /* Handle uses.  */
6314       if (j == 0)
6315         {
6316           if (slp_node)
6317             {
6318               /* Get vec defs for all the operands except the reduction index,
6319                  ensuring the ordering of the ops in the vector is kept.  */
6320               auto_vec<tree, 3> slp_ops;
6321               auto_vec<vec<tree>, 3> vec_defs;
6322
6323               slp_ops.quick_push (reduc_index == 0 ? NULL : ops[0]);
6324               slp_ops.quick_push (reduc_index == 1 ? NULL : ops[1]);
6325               if (op_type == ternary_op)
6326                 slp_ops.quick_push (reduc_index == 2 ? NULL : ops[2]);
6327
6328               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6329
6330               vec_oprnds0.safe_splice (vec_defs[reduc_index == 0 ? 1 : 0]);
6331               vec_defs[reduc_index == 0 ? 1 : 0].release ();
6332               if (op_type == ternary_op)
6333                 {
6334                   vec_oprnds1.safe_splice (vec_defs[reduc_index == 2 ? 1 : 2]);
6335                   vec_defs[reduc_index == 2 ? 1 : 2].release ();
6336                 }
6337             }
6338           else
6339             {
6340               vec_oprnds0.quick_push
6341                 (vect_get_vec_def_for_operand (ops[!reduc_index], stmt));
6342               if (op_type == ternary_op)
6343                 vec_oprnds1.quick_push
6344                   (vect_get_vec_def_for_operand (reduc_index == 0
6345                                                  ? ops[2] : ops[1], stmt));
6346             }
6347         }
6348       else
6349         {
6350           if (!slp_node)
6351             {
6352               vec_oprnds0[0]
6353                 = vect_get_vec_def_for_stmt_copy (dts[!reduc_index],
6354                                                   vec_oprnds0[0]);
6355               if (op_type == ternary_op)
6356                 vec_oprnds1[0]
6357                   = vect_get_vec_def_for_stmt_copy (dts[reduc_index == 0
6358                                                         ? 2 : 1],
6359                                                     vec_oprnds1[0]);
6360             }
6361
6362           if (single_defuse_cycle)
6363             reduc_def = gimple_assign_lhs (new_stmt);
6364         }
6365
6366       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6367         {
6368           if (slp_node)
6369             reduc_def = PHI_RESULT (phis[i]);
6370           else
6371             {
6372               if (!single_defuse_cycle || j == 0)
6373                 reduc_def = PHI_RESULT (new_phi);
6374             }
6375
6376           tree vop[3] = { def0, NULL_TREE, NULL_TREE };
6377           if (op_type == ternary_op)
6378             vop[1] = vec_oprnds1[i];
6379           for (int k = 2; k > reduc_index; --k)
6380             vop[k] = vop[k - 1];
6381           vop[reduc_index] = reduc_def;
6382
6383           new_temp = make_ssa_name (vec_dest, new_stmt);
6384           new_stmt = gimple_build_assign (new_temp, code,
6385                                           vop[0], vop[1], vop[2]);
6386           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6387
6388           if (slp_node)
6389             {
6390               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6391               vect_defs.quick_push (new_temp);
6392             }
6393           else
6394             vect_defs[0] = new_temp;
6395         }
6396
6397       if (slp_node)
6398         continue;
6399
6400       if (j == 0)
6401         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6402       else
6403         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6404
6405       prev_stmt_info = vinfo_for_stmt (new_stmt);
6406     }
6407
6408   /* Finalize the reduction-phi (set its arguments) and create the
6409      epilog reduction code.  */
6410   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6411     vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6412
6413   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6414                                     epilog_reduc_code, phis, reduc_index,
6415                                     double_reduc, slp_node);
6416
6417   return true;
6418 }
6419
6420 /* Function vect_min_worthwhile_factor.
6421
6422    For a loop where we could vectorize the operation indicated by CODE,
6423    return the minimum vectorization factor that makes it worthwhile
6424    to use generic vectors.  */
6425 int
6426 vect_min_worthwhile_factor (enum tree_code code)
6427 {
6428   switch (code)
6429     {
6430     case PLUS_EXPR:
6431     case MINUS_EXPR:
6432     case NEGATE_EXPR:
6433       return 4;
6434
6435     case BIT_AND_EXPR:
6436     case BIT_IOR_EXPR:
6437     case BIT_XOR_EXPR:
6438     case BIT_NOT_EXPR:
6439       return 2;
6440
6441     default:
6442       return INT_MAX;
6443     }
6444 }
6445
6446
6447 /* Function vectorizable_induction
6448
6449    Check if PHI performs an induction computation that can be vectorized.
6450    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6451    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6452    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6453
6454 bool
6455 vectorizable_induction (gimple *phi,
6456                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6457                         gimple **vec_stmt, slp_tree slp_node)
6458 {
6459   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6460   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6461   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6462   unsigned ncopies;
6463   bool nested_in_vect_loop = false;
6464   struct loop *iv_loop;
6465   tree vec_def;
6466   edge pe = loop_preheader_edge (loop);
6467   basic_block new_bb;
6468   tree new_vec, vec_init, vec_step, t;
6469   tree new_name;
6470   gimple *new_stmt;
6471   gphi *induction_phi;
6472   tree induc_def, vec_dest;
6473   tree init_expr, step_expr;
6474   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6475   unsigned i;
6476   tree expr;
6477   gimple_seq stmts;
6478   imm_use_iterator imm_iter;
6479   use_operand_p use_p;
6480   gimple *exit_phi;
6481   edge latch_e;
6482   tree loop_arg;
6483   gimple_stmt_iterator si;
6484   basic_block bb = gimple_bb (phi);
6485
6486   if (gimple_code (phi) != GIMPLE_PHI)
6487     return false;
6488
6489   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6490     return false;
6491
6492   /* Make sure it was recognized as induction computation.  */
6493   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6494     return false;
6495
6496   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6497   unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6498
6499   if (slp_node)
6500     ncopies = 1;
6501   else
6502     ncopies = vf / nunits;
6503   gcc_assert (ncopies >= 1);
6504
6505   /* FORNOW. These restrictions should be relaxed.  */
6506   if (nested_in_vect_loop_p (loop, phi))
6507     {
6508       imm_use_iterator imm_iter;
6509       use_operand_p use_p;
6510       gimple *exit_phi;
6511       edge latch_e;
6512       tree loop_arg;
6513
6514       if (ncopies > 1)
6515         {
6516           if (dump_enabled_p ())
6517             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6518                              "multiple types in nested loop.\n");
6519           return false;
6520         }
6521
6522       /* FORNOW: outer loop induction with SLP not supported.  */
6523       if (STMT_SLP_TYPE (stmt_info))
6524         return false;
6525
6526       exit_phi = NULL;
6527       latch_e = loop_latch_edge (loop->inner);
6528       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6529       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6530         {
6531           gimple *use_stmt = USE_STMT (use_p);
6532           if (is_gimple_debug (use_stmt))
6533             continue;
6534
6535           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6536             {
6537               exit_phi = use_stmt;
6538               break;
6539             }
6540         }
6541       if (exit_phi)
6542         {
6543           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6544           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6545                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6546             {
6547               if (dump_enabled_p ())
6548                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6549                                  "inner-loop induction only used outside "
6550                                  "of the outer vectorized loop.\n");
6551               return false;
6552             }
6553         }
6554
6555       nested_in_vect_loop = true;
6556       iv_loop = loop->inner;
6557     }
6558   else
6559     iv_loop = loop;
6560   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6561
6562   if (!vec_stmt) /* transformation not required.  */
6563     {
6564       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6565       if (dump_enabled_p ())
6566         dump_printf_loc (MSG_NOTE, vect_location,
6567                          "=== vectorizable_induction ===\n");
6568       vect_model_induction_cost (stmt_info, ncopies);
6569       return true;
6570     }
6571
6572   /* Transform.  */
6573
6574   /* Compute a vector variable, initialized with the first VF values of
6575      the induction variable.  E.g., for an iv with IV_PHI='X' and
6576      evolution S, for a vector of 4 units, we want to compute:
6577      [X, X + S, X + 2*S, X + 3*S].  */
6578
6579   if (dump_enabled_p ())
6580     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6581
6582   latch_e = loop_latch_edge (iv_loop);
6583   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6584
6585   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6586   gcc_assert (step_expr != NULL_TREE);
6587
6588   pe = loop_preheader_edge (iv_loop);
6589   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6590                                      loop_preheader_edge (iv_loop));
6591
6592   /* Convert the step to the desired type.  */
6593   stmts = NULL;
6594   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6595   if (stmts)
6596     {
6597       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6598       gcc_assert (!new_bb);
6599     }
6600
6601   /* Find the first insertion point in the BB.  */
6602   si = gsi_after_labels (bb);
6603
6604   /* For SLP induction we have to generate several IVs as for example
6605      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6606      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
6607      [VF*S, VF*S, VF*S, VF*S] for all.  */
6608   if (slp_node)
6609     {
6610       /* Convert the init to the desired type.  */
6611       stmts = NULL;
6612       init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6613       if (stmts)
6614         {
6615           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6616           gcc_assert (!new_bb);
6617         }
6618
6619       /* Generate [VF*S, VF*S, ... ].  */
6620       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6621         {
6622           expr = build_int_cst (integer_type_node, vf);
6623           expr = fold_convert (TREE_TYPE (step_expr), expr);
6624         }
6625       else
6626         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6627       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6628                               expr, step_expr);
6629       if (! CONSTANT_CLASS_P (new_name))
6630         new_name = vect_init_vector (phi, new_name,
6631                                      TREE_TYPE (step_expr), NULL);
6632       new_vec = build_vector_from_val (vectype, new_name);
6633       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6634
6635       /* Now generate the IVs.  */
6636       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6637       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6638       unsigned elts = nunits * nvects;
6639       unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6640       gcc_assert (elts % group_size == 0);
6641       tree elt = init_expr;
6642       unsigned ivn;
6643       for (ivn = 0; ivn < nivs; ++ivn)
6644         {
6645           tree *elts = XALLOCAVEC (tree, nunits);
6646           bool constant_p = true;
6647           for (unsigned eltn = 0; eltn < nunits; ++eltn)
6648             {
6649               if (ivn*nunits + eltn >= group_size
6650                   && (ivn*nunits + eltn) % group_size == 0)
6651                 {
6652                   stmts = NULL;
6653                   elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6654                                       elt, step_expr);
6655                   if (stmts)
6656                     {
6657                       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6658                       gcc_assert (!new_bb);
6659                     }
6660                 }
6661               if (! CONSTANT_CLASS_P (elt))
6662                 constant_p = false;
6663               elts[eltn] = elt;
6664             }
6665           if (constant_p)
6666             new_vec = build_vector (vectype, elts);
6667           else
6668             {
6669               vec<constructor_elt, va_gc> *v;
6670               vec_alloc (v, nunits);
6671               for (i = 0; i < nunits; ++i)
6672                 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
6673               new_vec = build_constructor (vectype, v);
6674             }
6675           vec_init = vect_init_vector (phi, new_vec, vectype, NULL);
6676
6677           /* Create the induction-phi that defines the induction-operand.  */
6678           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6679           induction_phi = create_phi_node (vec_dest, iv_loop->header);
6680           set_vinfo_for_stmt (induction_phi,
6681                               new_stmt_vec_info (induction_phi, loop_vinfo));
6682           induc_def = PHI_RESULT (induction_phi);
6683
6684           /* Create the iv update inside the loop  */
6685           vec_def = make_ssa_name (vec_dest);
6686           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6687           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6688           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6689
6690           /* Set the arguments of the phi node:  */
6691           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6692           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6693                        UNKNOWN_LOCATION);
6694
6695           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6696         }
6697
6698       /* Re-use IVs when we can.  */
6699       if (ivn < nvects)
6700         {
6701           unsigned vfp
6702             = least_common_multiple (group_size, nunits) / group_size;
6703           /* Generate [VF'*S, VF'*S, ... ].  */
6704           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6705             {
6706               expr = build_int_cst (integer_type_node, vfp);
6707               expr = fold_convert (TREE_TYPE (step_expr), expr);
6708             }
6709           else
6710             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6711           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6712                                   expr, step_expr);
6713           if (! CONSTANT_CLASS_P (new_name))
6714             new_name = vect_init_vector (phi, new_name,
6715                                          TREE_TYPE (step_expr), NULL);
6716           new_vec = build_vector_from_val (vectype, new_name);
6717           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6718           for (; ivn < nvects; ++ivn)
6719             {
6720               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6721               tree def;
6722               if (gimple_code (iv) == GIMPLE_PHI)
6723                 def = gimple_phi_result (iv);
6724               else
6725                 def = gimple_assign_lhs (iv);
6726               new_stmt = gimple_build_assign (make_ssa_name (vectype),
6727                                               PLUS_EXPR,
6728                                               def, vec_step);
6729               if (gimple_code (iv) == GIMPLE_PHI)
6730                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6731               else
6732                 {
6733                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6734                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6735                 }
6736               set_vinfo_for_stmt (new_stmt,
6737                                   new_stmt_vec_info (new_stmt, loop_vinfo));
6738               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6739             }
6740         }
6741
6742       return true;
6743     }
6744
6745   /* Create the vector that holds the initial_value of the induction.  */
6746   if (nested_in_vect_loop)
6747     {
6748       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
6749          been created during vectorization of previous stmts.  We obtain it
6750          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
6751       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6752       /* If the initial value is not of proper type, convert it.  */
6753       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6754         {
6755           new_stmt
6756             = gimple_build_assign (vect_get_new_ssa_name (vectype,
6757                                                           vect_simple_var,
6758                                                           "vec_iv_"),
6759                                    VIEW_CONVERT_EXPR,
6760                                    build1 (VIEW_CONVERT_EXPR, vectype,
6761                                            vec_init));
6762           vec_init = gimple_assign_lhs (new_stmt);
6763           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6764                                                  new_stmt);
6765           gcc_assert (!new_bb);
6766           set_vinfo_for_stmt (new_stmt,
6767                               new_stmt_vec_info (new_stmt, loop_vinfo));
6768         }
6769     }
6770   else
6771     {
6772       vec<constructor_elt, va_gc> *v;
6773
6774       /* iv_loop is the loop to be vectorized. Create:
6775          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
6776       stmts = NULL;
6777       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6778
6779       vec_alloc (v, nunits);
6780       bool constant_p = is_gimple_min_invariant (new_name);
6781       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
6782       for (i = 1; i < nunits; i++)
6783         {
6784           /* Create: new_name_i = new_name + step_expr  */
6785           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6786                                    new_name, step_expr);
6787           if (!is_gimple_min_invariant (new_name))
6788             constant_p = false;
6789           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
6790         }
6791       if (stmts)
6792         {
6793           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6794           gcc_assert (!new_bb);
6795         }
6796
6797       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
6798       if (constant_p)
6799         new_vec = build_vector_from_ctor (vectype, v);
6800       else
6801         new_vec = build_constructor (vectype, v);
6802       vec_init = vect_init_vector (phi, new_vec, vectype, NULL);
6803     }
6804
6805
6806   /* Create the vector that holds the step of the induction.  */
6807   if (nested_in_vect_loop)
6808     /* iv_loop is nested in the loop to be vectorized. Generate:
6809        vec_step = [S, S, S, S]  */
6810     new_name = step_expr;
6811   else
6812     {
6813       /* iv_loop is the loop to be vectorized. Generate:
6814           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
6815       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6816         {
6817           expr = build_int_cst (integer_type_node, vf);
6818           expr = fold_convert (TREE_TYPE (step_expr), expr);
6819         }
6820       else
6821         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6822       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6823                               expr, step_expr);
6824       if (TREE_CODE (step_expr) == SSA_NAME)
6825         new_name = vect_init_vector (phi, new_name,
6826                                      TREE_TYPE (step_expr), NULL);
6827     }
6828
6829   t = unshare_expr (new_name);
6830   gcc_assert (CONSTANT_CLASS_P (new_name)
6831               || TREE_CODE (new_name) == SSA_NAME);
6832   new_vec = build_vector_from_val (vectype, t);
6833   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6834
6835
6836   /* Create the following def-use cycle:
6837      loop prolog:
6838          vec_init = ...
6839          vec_step = ...
6840      loop:
6841          vec_iv = PHI <vec_init, vec_loop>
6842          ...
6843          STMT
6844          ...
6845          vec_loop = vec_iv + vec_step;  */
6846
6847   /* Create the induction-phi that defines the induction-operand.  */
6848   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6849   induction_phi = create_phi_node (vec_dest, iv_loop->header);
6850   set_vinfo_for_stmt (induction_phi,
6851                       new_stmt_vec_info (induction_phi, loop_vinfo));
6852   induc_def = PHI_RESULT (induction_phi);
6853
6854   /* Create the iv update inside the loop  */
6855   vec_def = make_ssa_name (vec_dest);
6856   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6857   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6858   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6859
6860   /* Set the arguments of the phi node:  */
6861   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6862   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6863                UNKNOWN_LOCATION);
6864
6865   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
6866
6867   /* In case that vectorization factor (VF) is bigger than the number
6868      of elements that we can fit in a vectype (nunits), we have to generate
6869      more than one vector stmt - i.e - we need to "unroll" the
6870      vector stmt by a factor VF/nunits.  For more details see documentation
6871      in vectorizable_operation.  */
6872
6873   if (ncopies > 1)
6874     {
6875       stmt_vec_info prev_stmt_vinfo;
6876       /* FORNOW. This restriction should be relaxed.  */
6877       gcc_assert (!nested_in_vect_loop);
6878
6879       /* Create the vector that holds the step of the induction.  */
6880       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6881         {
6882           expr = build_int_cst (integer_type_node, nunits);
6883           expr = fold_convert (TREE_TYPE (step_expr), expr);
6884         }
6885       else
6886         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
6887       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6888                               expr, step_expr);
6889       if (TREE_CODE (step_expr) == SSA_NAME)
6890         new_name = vect_init_vector (phi, new_name,
6891                                      TREE_TYPE (step_expr), NULL);
6892       t = unshare_expr (new_name);
6893       gcc_assert (CONSTANT_CLASS_P (new_name)
6894                   || TREE_CODE (new_name) == SSA_NAME);
6895       new_vec = build_vector_from_val (vectype, t);
6896       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6897
6898       vec_def = induc_def;
6899       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
6900       for (i = 1; i < ncopies; i++)
6901         {
6902           /* vec_i = vec_prev + vec_step  */
6903           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
6904                                           vec_def, vec_step);
6905           vec_def = make_ssa_name (vec_dest, new_stmt);
6906           gimple_assign_set_lhs (new_stmt, vec_def);
6907
6908           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6909           set_vinfo_for_stmt (new_stmt,
6910                               new_stmt_vec_info (new_stmt, loop_vinfo));
6911           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
6912           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
6913         }
6914     }
6915
6916   if (nested_in_vect_loop)
6917     {
6918       /* Find the loop-closed exit-phi of the induction, and record
6919          the final vector of induction results:  */
6920       exit_phi = NULL;
6921       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6922         {
6923           gimple *use_stmt = USE_STMT (use_p);
6924           if (is_gimple_debug (use_stmt))
6925             continue;
6926
6927           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
6928             {
6929               exit_phi = use_stmt;
6930               break;
6931             }
6932         }
6933       if (exit_phi)
6934         {
6935           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
6936           /* FORNOW. Currently not supporting the case that an inner-loop induction
6937              is not used in the outer-loop (i.e. only outside the outer-loop).  */
6938           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
6939                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
6940
6941           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
6942           if (dump_enabled_p ())
6943             {
6944               dump_printf_loc (MSG_NOTE, vect_location,
6945                                "vector of inductions after inner-loop:");
6946               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
6947             }
6948         }
6949     }
6950
6951
6952   if (dump_enabled_p ())
6953     {
6954       dump_printf_loc (MSG_NOTE, vect_location,
6955                        "transform induction: created def-use cycle: ");
6956       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
6957       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6958                         SSA_NAME_DEF_STMT (vec_def), 0);
6959     }
6960
6961   return true;
6962 }
6963
6964 /* Function vectorizable_live_operation.
6965
6966    STMT computes a value that is used outside the loop.  Check if
6967    it can be supported.

        Returns false when STMT is a reduction definition or when
        nested_in_vect_loop_p holds for it, and true on every other path.
        A null VEC_STMT requests analysis only.  With a non-null VEC_STMT
        (which is only tested, never written here) the live lane is read
        out of the vectorized definition with a BIT_FIELD_REF, any stmts
        needed for that are inserted on the single exit edge of the loop,
        and non-debug uses of the scalar result outside the loop are
        redirected to the extracted value.

        SLP_NODE, when non-null, selects the SLP path, in which SLP_INDEX
        (asserted to be >= 0) is used to locate the vector stmt and lane.
        GSI is unused.  */
6968
6969 bool
6970 vectorizable_live_operation (gimple *stmt,
6971                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6972                              slp_tree slp_node, int slp_index,
6973                              gimple **vec_stmt)
6974 {
6975   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6976   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6977   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6978   imm_use_iterator imm_iter;
6979   tree lhs, lhs_type, bitsize, vec_bitsize;
6980   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6981   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
       /* Number of vector copies: the vectorization factor divided by the
          number of elements in one vector.  */
6982   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6983   gimple *use_stmt;
6984   auto_vec<tree> vec_oprnds;
6985
       /* Callers must only pass stmts already marked live.  */
6986   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6987
       /* Reduction definitions are rejected here.  */
6988   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6989     return false;
6990
6991   /* FORNOW.  CHECKME.  */
6992   if (nested_in_vect_loop_p (loop, stmt))
6993     return false;
6994
6995   /* If STMT is not relevant and it is a simple assignment and its inputs are
6996      invariant then it can remain in place, unvectorized.  The original last
6997      scalar value that it computes will be used.  A live stmt that is not
6998      relevant is asserted to have this form.  */
6998   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6999     {
7000       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7001       if (dump_enabled_p ())
7002         dump_printf_loc (MSG_NOTE, vect_location,
7003                          "statement is simple and uses invariant.  Leaving in "
7004                          "place.\n");
7005       return true;
7006     }
7007
7008   if (!vec_stmt)
7009     /* No transformation required.  */
7010     return true;
7011
7012   /* If stmt has a related stmt, then use that for getting the lhs.  */
7013   if (is_pattern_stmt_p (stmt_info))
7014     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7015
       /* The scalar result is the PHI result for a PHI and the stmt's lhs
          otherwise.  */
7016   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7017         : gimple_get_lhs (stmt);
7018   lhs_type = TREE_TYPE (lhs);
7019
       /* Size of one vector element and size of the whole vector; the lane
          offset computed below is expressed in the same units.  */
7020   bitsize = TYPE_SIZE (TREE_TYPE (vectype));
7021   vec_bitsize = TYPE_SIZE (vectype);
7022
7023   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7024   tree vec_lhs, bitstart;
7025   if (slp_node)
7026     {
7027       gcc_assert (slp_index >= 0);
7028
7029       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7030       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7031
7032       /* Get the last occurrence of the scalar index from the concatenation of
7033          all the slp vectors. Calculate which slp vector it is and the index
7034          within.  POS is a lane number in that concatenation: the final
7035          NUM_SCALAR lanes are taken as the last group of scalar stmts and
7036          SLP_INDEX selects the lane within that group.  */
7035       int pos = (num_vec * nunits) - num_scalar + slp_index;
7036       int vec_entry = pos / nunits;
7037       int vec_index = pos % nunits;
7038
7039       /* Get the correct slp vectorized stmt.  */
7040       vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7041
7042       /* Get entry to use: the lane index scaled by the element size.  */
7043       bitstart = build_int_cst (unsigned_type_node, vec_index);
7044       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7045     }
7046   else
7047     {
7048       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7049       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7050
7051       /* For multiple copies, get the last copy.  */
7052       for (int i = 1; i < ncopies; ++i)
7053         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7054                                                   vec_lhs);
7055
7056       /* Get the last lane in the vector: it starts one element size
7057          before the end of the vector.  */
7057       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7058     }
7059
7060   /* Create a new vectorized stmt for the uses of STMT and insert outside the
7061      loop.  The lane is read with a BIT_FIELD_REF and converted to the type
7062      of the scalar lhs.  */
7062   gimple_seq stmts = NULL;
7063   tree bftype = TREE_TYPE (vectype);
       /* For boolean vectors the lane is read as an integer type that is
          exactly BITSIZE bits wide instead of as the element type.  */
7064   if (VECTOR_BOOLEAN_TYPE_P (vectype))
7065     bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7066   tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7067   new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7068                                    true, NULL_TREE);
       /* Any stmts produced while gimplifying go on the loop's single exit
          edge.  */
7069   if (stmts)
7070     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7071
7072   /* Replace use of lhs with newly computed result.  If the use stmt is a
7073      single arg PHI, just replace all uses of PHI result.  It's necessary
7074      because lcssa PHI defining lhs may be before newly inserted stmt.
7075      Only non-debug uses located outside LOOP are rewritten, and
7076      update_stmt is called on each use stmt that was handled.  */
7075   use_operand_p use_p;
7076   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7077     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7078         && !is_gimple_debug (use_stmt))
7079     {
7080       if (gimple_code (use_stmt) == GIMPLE_PHI
7081           && gimple_phi_num_args (use_stmt) == 1)
7082         {
7083           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7084         }
7085       else
7086         {
7087           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7088             SET_USE (use_p, new_tree);
7089         }
7090       update_stmt (use_stmt);
7091     }
7092
7093   return true;
7094 }
7095
7096 /* Kill any debug uses outside LOOP of SSA names defined in STMT.

        For every definition of STMT (a PHI or an ordinary stmt), each
        immediate use that is a debug stmt located in a block outside LOOP
        has its bound value reset.  Non-debug uses and debug uses inside
        LOOP are left untouched.  */
7097
7098 static void
7099 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7100 {
7101   ssa_op_iter op_iter;
7102   imm_use_iterator imm_iter;
7103   def_operand_p def_p;
7104   gimple *ustmt;
7105
7106   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7107     {
7108       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7109         {
7110           basic_block bb;
7111
               /* Only debug stmts are of interest here.  */
7112           if (!is_gimple_debug (ustmt))
7113             continue;
7114
7115           bb = gimple_bb (ustmt);
7116
7117           if (!flow_bb_inside_loop_p (loop, bb))
7118             {
                   /* A debug bind outside the loop has its value reset; any
                      other kind of debug stmt is not expected here and
                      aborts via gcc_unreachable.  */
7119               if (gimple_debug_bind_p (ustmt))
7120                 {
7121                   if (dump_enabled_p ())
7122                     dump_printf_loc (MSG_NOTE, vect_location,
7123                                      "killing debug use\n");
7124
7125                   gimple_debug_bind_reset_value (ustmt);
7126                   update_stmt (ustmt);
7127                 }
7128               else
7129                 gcc_unreachable ();
7130             }
7131         }
7132     }
7133 }
7134
7135 /* Given loop represented by LOOP_VINFO, return true if computation of
7136    LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7137    otherwise.

        Two checks are tried in order: a direct comparison of the two
        constants when the iteration count is known, then a comparison of
        the loop's maximum iteration count against the largest value of the
        type of LOOP_VINFO_NITERS.  False is returned when neither check
        succeeds, so false means "not proven", not "known to overflow".  */
7138
7139 static bool
7140 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7141 {
7142   /* Constant case.  */
7143   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7144     {
7145       tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7146       tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7147
7148       gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7149       gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
           /* If NITERSM1 is strictly smaller than NITERS the "+ 1" did not
              wrap.  When the comparison fails control falls through to the
              upper-bound check below rather than returning false here.  */
7150       if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7151         return true;
7152     }
7153
7154   widest_int max;
7155   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7156   /* Check the upper bound of loop niters.  A bound strictly below the
7157      maximum value of the niters type is accepted as proof.  */
7157   if (get_max_loop_iterations (loop, &max))
7158     {
7159       tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7160       signop sgn = TYPE_SIGN (type);
7161       widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7162       if (max < type_max)
7163         return true;
7164     }
7165   return false;
7166 }
7167
7168 /* Scale profiling counters by estimation for LOOP which is vectorized
7169    by factor VF.

        The new iteration estimate is taken from niter_for_unrolled_loop
        (LOOP, VF).  The loop's frequencies are scaled by the probability of
        (preheader count * (estimate + 1)) in the header count, and the
        count and probability of the single exit edge and of the edge
        entering the latch are then recomputed from the same estimate.  */
7170
7171 static void
7172 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7173 {
7174   edge preheader = loop_preheader_edge (loop);
7175   /* Reduce loop iterations by the vectorization factor.  */
7176   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
       /* FREQ_H is the header count and FREQ_E the preheader edge count.  */
7177   profile_count freq_h = loop->header->count, freq_e = preheader->count;
7178
7179   /* Use frequency only if counts are zero.  The fallback is taken only
7180      when neither count compares greater than zero.  */
7180   if (!(freq_h > 0) && !(freq_e > 0))
7181     {
7182       freq_h = profile_count::from_gcov_type (loop->header->frequency);
7183       freq_e = profile_count::from_gcov_type (EDGE_FREQUENCY (preheader));
7184     }
7185   if (freq_h > 0)
7186     {
7187       profile_probability p;
7188
7189       /* Avoid dropping loop body profile counter to 0 because of zero count
7190          in loop's preheader.  FREQ_E is clamped to at least 1.  */
7191       if (!(freq_e > profile_count::from_gcov_type (1)))
7192        freq_e = profile_count::from_gcov_type (1);
           /* Scale factor: FREQ_E * (estimate + 1) expressed as a
              probability in FREQ_H.  */
7193       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7194       scale_loop_frequencies (loop, p);
7195     }
7196
       /* EXIT_BB is the single predecessor block of the latch.
          NOTE(review): presumably the block ending in the loop exit test --
          confirm.  */
7197   basic_block exit_bb = single_pred (loop->latch);
       /* The exit edge receives the preheader edge's count and a
          probability of 1 / (estimate + 1).  */
7198   edge exit_e = single_exit (loop);
7199   exit_e->count = loop_preheader_edge (loop)->count;
7200   exit_e->probability = profile_probability::always ()
7201                                  .apply_scale (1, new_est_niter + 1);
7202
       /* EXIT_L is the single edge entering the latch.  Its previous
          probability is saved in PROB (0 when uninitialized) before being
          overwritten with the inverse of the exit probability.  */
7203   edge exit_l = single_pred_edge (loop->latch);
7204   int prob = exit_l->probability.initialized_p ()
7205              ? exit_l->probability.to_reg_br_prob_base () : 0;
7206   exit_l->probability = exit_e->probability.invert ();
7207   exit_l->count = exit_bb->count - exit_e->count;
       /* Rescale the latch block only when the old probability was
          positive.  NOTE(review): the last two arguments are presumably the
          new/old probability used as numerator/denominator -- confirm
          against scale_bbs_frequencies_int.  */
7208   if (prob > 0)
7209     scale_bbs_frequencies_int (&loop->latch, 1,
7210                                exit_l->probability.to_reg_br_prob_base (), prob);
7211 }
7212
7213 /* Function vect_transform_loop.
7214
7215    The analysis phase has determined that the loop is vectorizable.
7216    Vectorize the loop - created vectorized stmts to replace the scalar
7217    stmts in the loop, and update the loop exit condition.
7218    Returns scalar epilogue loop if any.  */
7219
7220 struct loop *
7221 vect_transform_loop (loop_vec_info loop_vinfo)
7222 {
7223   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7224   struct loop *epilogue = NULL;
7225   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7226   int nbbs = loop->num_nodes;
7227   int i;
7228   tree niters_vector = NULL;
7229   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7230   bool grouped_store;
7231   bool slp_scheduled = false;
7232   gimple *stmt, *pattern_stmt;
7233   gimple_seq pattern_def_seq = NULL;
7234   gimple_stmt_iterator pattern_def_si = gsi_none ();
7235   bool transform_pattern_stmt = false;
7236   bool check_profitability = false;
7237   int th;
7238
7239   if (dump_enabled_p ())
7240     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
7241
7242   /* Use the more conservative vectorization threshold.  If the number
7243      of iterations is constant assume the cost check has been performed
7244      by our caller.  If the threshold makes all loops profitable that
7245      run at least the vectorization factor number of times checking
7246      is pointless, too.  */
7247   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7248   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
7249       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7250     {
7251       if (dump_enabled_p ())
7252         dump_printf_loc (MSG_NOTE, vect_location,
7253                          "Profitability threshold is %d loop iterations.\n",
7254                          th);
7255       check_profitability = true;
7256     }
7257
7258   /* Make sure there exists a single-predecessor exit bb.  Do this before
7259      versioning.   */
7260   edge e = single_exit (loop);
7261   if (! single_pred_p (e->dest))
7262     {
7263       split_loop_exit_edge (e);
7264       if (dump_enabled_p ())
7265         dump_printf (MSG_NOTE, "split exit edge\n");
7266     }
7267
7268   /* Version the loop first, if required, so the profitability check
7269      comes first.  */
7270
7271   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7272     {
7273       vect_loop_versioning (loop_vinfo, th, check_profitability);
7274       check_profitability = false;
7275     }
7276
7277   /* Make sure there exists a single-predecessor exit bb also on the
7278      scalar loop copy.  Do this after versioning but before peeling
7279      so CFG structure is fine for both scalar and if-converted loop
7280      to make slpeel_duplicate_current_defs_from_edges face matched
7281      loop closed PHI nodes on the exit.  */
7282   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7283     {
7284       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7285       if (! single_pred_p (e->dest))
7286         {
7287           split_loop_exit_edge (e);
7288           if (dump_enabled_p ())
7289             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7290         }
7291     }
7292
7293   tree niters = vect_build_loop_niters (loop_vinfo);
7294   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7295   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7296   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7297   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
7298                               check_profitability, niters_no_overflow);
7299   if (niters_vector == NULL_TREE)
7300     {
7301       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7302         niters_vector
7303           = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7304                            LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
7305       else
7306         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7307                                      niters_no_overflow);
7308     }
7309
7310   /* 1) Make sure the loop header has exactly two entries
7311      2) Make sure we have a preheader basic block.  */
7312
7313   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7314
7315   split_edge (loop_preheader_edge (loop));
7316
7317   /* FORNOW: the vectorizer supports only loops which body consist
7318      of one basic block (header + empty latch). When the vectorizer will
7319      support more involved loop forms, the order by which the BBs are
7320      traversed need to be reconsidered.  */
7321
7322   for (i = 0; i < nbbs; i++)
7323     {
7324       basic_block bb = bbs[i];
7325       stmt_vec_info stmt_info;
7326
7327       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7328            gsi_next (&si))
7329         {
7330           gphi *phi = si.phi ();
7331           if (dump_enabled_p ())
7332             {
7333               dump_printf_loc (MSG_NOTE, vect_location,
7334                                "------>vectorizing phi: ");
7335               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7336             }
7337           stmt_info = vinfo_for_stmt (phi);
7338           if (!stmt_info)
7339             continue;
7340
7341           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7342             vect_loop_kill_debug_uses (loop, phi);
7343
7344           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7345               && !STMT_VINFO_LIVE_P (stmt_info))
7346             continue;
7347
7348           if (STMT_VINFO_VECTYPE (stmt_info)
7349               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7350                   != (unsigned HOST_WIDE_INT) vf)
7351               && dump_enabled_p ())
7352             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7353
7354           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7355                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7356                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7357               && ! PURE_SLP_STMT (stmt_info))
7358             {
7359               if (dump_enabled_p ())
7360                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7361               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7362             }
7363         }
7364
7365       pattern_stmt = NULL;
7366       for (gimple_stmt_iterator si = gsi_start_bb (bb);
7367            !gsi_end_p (si) || transform_pattern_stmt;)
7368         {
7369           bool is_store;
7370
7371           if (transform_pattern_stmt)
7372             stmt = pattern_stmt;
7373           else
7374             {
7375               stmt = gsi_stmt (si);
7376               /* During vectorization remove existing clobber stmts.  */
7377               if (gimple_clobber_p (stmt))
7378                 {
7379                   unlink_stmt_vdef (stmt);
7380                   gsi_remove (&si, true);
7381                   release_defs (stmt);
7382                   continue;
7383                 }
7384             }
7385
7386           if (dump_enabled_p ())
7387             {
7388               dump_printf_loc (MSG_NOTE, vect_location,
7389                                "------>vectorizing statement: ");
7390               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7391             }
7392
7393           stmt_info = vinfo_for_stmt (stmt);
7394
7395           /* vector stmts created in the outer-loop during vectorization of
7396              stmts in an inner-loop may not have a stmt_info, and do not
7397              need to be vectorized.  */
7398           if (!stmt_info)
7399             {
7400               gsi_next (&si);
7401               continue;
7402             }
7403
7404           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7405             vect_loop_kill_debug_uses (loop, stmt);
7406
7407           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7408               && !STMT_VINFO_LIVE_P (stmt_info))
7409             {
7410               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7411                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7412                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7413                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7414                 {
7415                   stmt = pattern_stmt;
7416                   stmt_info = vinfo_for_stmt (stmt);
7417                 }
7418               else
7419                 {
7420                   gsi_next (&si);
7421                   continue;
7422                 }
7423             }
7424           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7425                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7426                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7427                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7428             transform_pattern_stmt = true;
7429
7430           /* If pattern statement has def stmts, vectorize them too.  */
7431           if (is_pattern_stmt_p (stmt_info))
7432             {
7433               if (pattern_def_seq == NULL)
7434                 {
7435                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7436                   pattern_def_si = gsi_start (pattern_def_seq);
7437                 }
7438               else if (!gsi_end_p (pattern_def_si))
7439                 gsi_next (&pattern_def_si);
7440               if (pattern_def_seq != NULL)
7441                 {
7442                   gimple *pattern_def_stmt = NULL;
7443                   stmt_vec_info pattern_def_stmt_info = NULL;
7444
7445                   while (!gsi_end_p (pattern_def_si))
7446                     {
7447                       pattern_def_stmt = gsi_stmt (pattern_def_si);
7448                       pattern_def_stmt_info
7449                         = vinfo_for_stmt (pattern_def_stmt);
7450                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7451                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7452                         break;
7453                       gsi_next (&pattern_def_si);
7454                     }
7455
7456                   if (!gsi_end_p (pattern_def_si))
7457                     {
7458                       if (dump_enabled_p ())
7459                         {
7460                           dump_printf_loc (MSG_NOTE, vect_location,
7461                                            "==> vectorizing pattern def "
7462                                            "stmt: ");
7463                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7464                                             pattern_def_stmt, 0);
7465                         }
7466
7467                       stmt = pattern_def_stmt;
7468                       stmt_info = pattern_def_stmt_info;
7469                     }
7470                   else
7471                     {
7472                       pattern_def_si = gsi_none ();
7473                       transform_pattern_stmt = false;
7474                     }
7475                 }
7476               else
7477                 transform_pattern_stmt = false;
7478             }
7479
7480           if (STMT_VINFO_VECTYPE (stmt_info))
7481             {
7482               unsigned int nunits
7483                 = (unsigned int)
7484                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7485               if (!STMT_SLP_TYPE (stmt_info)
7486                   && nunits != (unsigned int) vf
7487                   && dump_enabled_p ())
7488                   /* For SLP VF is set according to unrolling factor, and not
7489                      to vector size, hence for SLP this print is not valid.  */
7490                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7491             }
7492
7493           /* SLP. Schedule all the SLP instances when the first SLP stmt is
7494              reached.  */
7495           if (STMT_SLP_TYPE (stmt_info))
7496             {
7497               if (!slp_scheduled)
7498                 {
7499                   slp_scheduled = true;
7500
7501                   if (dump_enabled_p ())
7502                     dump_printf_loc (MSG_NOTE, vect_location,
7503                                      "=== scheduling SLP instances ===\n");
7504
7505                   vect_schedule_slp (loop_vinfo);
7506                 }
7507
7508               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
7509               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7510                 {
7511                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7512                     {
7513                       pattern_def_seq = NULL;
7514                       gsi_next (&si);
7515                     }
7516                   continue;
7517                 }
7518             }
7519
7520           /* -------- vectorize statement ------------ */
7521           if (dump_enabled_p ())
7522             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7523
7524           grouped_store = false;
7525           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7526           if (is_store)
7527             {
7528               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7529                 {
7530                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7531                      interleaving chain was completed - free all the stores in
7532                      the chain.  */
7533                   gsi_next (&si);
7534                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7535                 }
7536               else
7537                 {
7538                   /* Free the attached stmt_vec_info and remove the stmt.  */
7539                   gimple *store = gsi_stmt (si);
7540                   free_stmt_vec_info (store);
7541                   unlink_stmt_vdef (store);
7542                   gsi_remove (&si, true);
7543                   release_defs (store);
7544                 }
7545
7546               /* Stores can only appear at the end of pattern statements.  */
7547               gcc_assert (!transform_pattern_stmt);
7548               pattern_def_seq = NULL;
7549             }
7550           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7551             {
7552               pattern_def_seq = NULL;
7553               gsi_next (&si);
7554             }
7555         }                       /* stmts in BB */
7556     }                           /* BBs in loop */
7557
7558   slpeel_make_loop_iterate_ntimes (loop, niters_vector);
7559
7560   scale_profile_for_vect_loop (loop, vf);
7561
7562   /* The minimum number of iterations performed by the epilogue.  This
7563      is 1 when peeling for gaps because we always need a final scalar
7564      iteration.  */
7565   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7566   /* +1 to convert latch counts to loop iteration counts,
7567      -min_epilogue_iters to remove iterations that cannot be performed
7568        by the vector code.  */
7569   int bias = 1 - min_epilogue_iters;
7570   /* In these calculations the "- 1" converts loop iteration counts
7571      back to latch counts.  */
7572   if (loop->any_upper_bound)
7573     loop->nb_iterations_upper_bound
7574       = wi::udiv_floor (loop->nb_iterations_upper_bound + bias, vf) - 1;
7575   if (loop->any_likely_upper_bound)
7576     loop->nb_iterations_likely_upper_bound
7577       = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, vf) - 1;
7578   if (loop->any_estimate)
7579     loop->nb_iterations_estimate
7580       = wi::udiv_floor (loop->nb_iterations_estimate + bias, vf) - 1;
7581
7582   if (dump_enabled_p ())
7583     {
7584       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7585         {
7586           dump_printf_loc (MSG_NOTE, vect_location,
7587                            "LOOP VECTORIZED\n");
7588           if (loop->inner)
7589             dump_printf_loc (MSG_NOTE, vect_location,
7590                              "OUTER LOOP VECTORIZED\n");
7591           dump_printf (MSG_NOTE, "\n");
7592         }
7593       else
7594         dump_printf_loc (MSG_NOTE, vect_location,
7595                          "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
7596                          current_vector_size);
7597     }
7598
7599   /* Free SLP instances here because otherwise stmt reference counting
7600      won't work.  */
7601   slp_instance instance;
7602   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7603     vect_free_slp_instance (instance);
7604   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7605   /* Clear-up safelen field since its value is invalid after vectorization
7606      since vectorized loop can have loop-carried dependencies.  */
7607   loop->safelen = 0;
7608
7609   /* Don't vectorize epilogue for epilogue.  */
7610   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7611     epilogue = NULL;
7612
7613   if (epilogue)
7614     {
7615         unsigned int vector_sizes
7616           = targetm.vectorize.autovectorize_vector_sizes ();
7617         vector_sizes &= current_vector_size - 1;
7618
7619         if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7620           epilogue = NULL;
7621         else if (!vector_sizes)
7622           epilogue = NULL;
7623         else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7624                  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
7625           {
7626             int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
7627             int ratio = current_vector_size / smallest_vec_size;
7628             int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
7629               - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
7630             eiters = eiters % vf;
7631
7632             epilogue->nb_iterations_upper_bound = eiters - 1;
7633
7634             if (eiters < vf / ratio)
7635               epilogue = NULL;
7636             }
7637     }
7638
7639   if (epilogue)
7640     {
7641       epilogue->force_vectorize = loop->force_vectorize;
7642       epilogue->safelen = loop->safelen;
7643       epilogue->dont_vectorize = false;
7644
7645       /* We may need to if-convert epilogue to vectorize it.  */
7646       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7647         tree_if_conversion (epilogue);
7648     }
7649
7650   return epilogue;
7651 }
7652
7653 /* The code below is trying to perform simple optimization - revert
7654    if-conversion for masked stores, i.e. if the mask of a store is zero
7655    do not perform it and all stored value producers also if possible.
7656    For example,
7657      for (i=0; i<n; i++)
7658        if (c[i])
7659         {
7660           p1[i] += 1;
7661           p2[i] = p3[i] +2;
7662         }
7663    this transformation will produce the following semi-hammock:
7664
7665    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7666      {
7667        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7668        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7669        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7670        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7671        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7672        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7673      }
7674 */
7675
7676 void
7677 optimize_mask_stores (struct loop *loop)
7678 {
7679   basic_block *bbs = get_loop_body (loop);
7680   unsigned nbbs = loop->num_nodes;
7681   unsigned i;
7682   basic_block bb;
7683   struct loop *bb_loop;
7684   gimple_stmt_iterator gsi;
7685   gimple *stmt;
7686   auto_vec<gimple *> worklist;
7687
7688   vect_location = find_loop_location (loop);
7689   /* Pick up all masked stores in loop if any.  */
7690   for (i = 0; i < nbbs; i++)
7691     {
7692       bb = bbs[i];
7693       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7694            gsi_next (&gsi))
7695         {
7696           stmt = gsi_stmt (gsi);
7697           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7698             worklist.safe_push (stmt);
7699         }
7700     }
7701
7702   free (bbs);
7703   if (worklist.is_empty ())
7704     return;
7705
7706   /* Loop has masked stores.  */
7707   while (!worklist.is_empty ())
7708     {
7709       gimple *last, *last_store;
7710       edge e, efalse;
7711       tree mask;
7712       basic_block store_bb, join_bb;
7713       gimple_stmt_iterator gsi_to;
7714       tree vdef, new_vdef;
7715       gphi *phi;
7716       tree vectype;
7717       tree zero;
7718
7719       last = worklist.pop ();
7720       mask = gimple_call_arg (last, 2);
7721       bb = gimple_bb (last);
7722       /* Create then_bb and if-then structure in CFG, then_bb belongs to
7723          the same loop as if_bb.  It could be different to LOOP when two
7724          level loop-nest is vectorized and mask_store belongs to the inner
7725          one.  */
7726       e = split_block (bb, last);
7727       bb_loop = bb->loop_father;
7728       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7729       join_bb = e->dest;
7730       store_bb = create_empty_bb (bb);
7731       add_bb_to_loop (store_bb, bb_loop);
7732       e->flags = EDGE_TRUE_VALUE;
7733       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7734       /* Put STORE_BB to likely part.  */
7735       efalse->probability = profile_probability::unlikely ();
7736       store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
7737       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7738       if (dom_info_available_p (CDI_DOMINATORS))
7739         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7740       if (dump_enabled_p ())
7741         dump_printf_loc (MSG_NOTE, vect_location,
7742                          "Create new block %d to sink mask stores.",
7743                          store_bb->index);
7744       /* Create vector comparison with boolean result.  */
7745       vectype = TREE_TYPE (mask);
7746       zero = build_zero_cst (vectype);
7747       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7748       gsi = gsi_last_bb (bb);
7749       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7750       /* Create new PHI node for vdef of the last masked store:
7751          .MEM_2 = VDEF <.MEM_1>
7752          will be converted to
7753          .MEM.3 = VDEF <.MEM_1>
7754          and new PHI node will be created in join bb
7755          .MEM_2 = PHI <.MEM_1, .MEM_3>
7756       */
7757       vdef = gimple_vdef (last);
7758       new_vdef = make_ssa_name (gimple_vop (cfun), last);
7759       gimple_set_vdef (last, new_vdef);
7760       phi = create_phi_node (vdef, join_bb);
7761       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7762
7763       /* Put all masked stores with the same mask to STORE_BB if possible.  */
7764       while (true)
7765         {
7766           gimple_stmt_iterator gsi_from;
7767           gimple *stmt1 = NULL;
7768
7769           /* Move masked store to STORE_BB.  */
7770           last_store = last;
7771           gsi = gsi_for_stmt (last);
7772           gsi_from = gsi;
7773           /* Shift GSI to the previous stmt for further traversal.  */
7774           gsi_prev (&gsi);
7775           gsi_to = gsi_start_bb (store_bb);
7776           gsi_move_before (&gsi_from, &gsi_to);
7777           /* Setup GSI_TO to the non-empty block start.  */
7778           gsi_to = gsi_start_bb (store_bb);
7779           if (dump_enabled_p ())
7780             {
7781               dump_printf_loc (MSG_NOTE, vect_location,
7782                                "Move stmt to created bb\n");
7783               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7784             }
7785           /* Move all stored value producers if possible.  */
7786           while (!gsi_end_p (gsi))
7787             {
7788               tree lhs;
7789               imm_use_iterator imm_iter;
7790               use_operand_p use_p;
7791               bool res;
7792
7793               /* Skip debug statements.  */
7794               if (is_gimple_debug (gsi_stmt (gsi)))
7795                 {
7796                   gsi_prev (&gsi);
7797                   continue;
7798                 }
7799               stmt1 = gsi_stmt (gsi);
7800               /* Do not consider statements writing to memory or having
7801                  volatile operand.  */
7802               if (gimple_vdef (stmt1)
7803                   || gimple_has_volatile_ops (stmt1))
7804                 break;
7805               gsi_from = gsi;
7806               gsi_prev (&gsi);
7807               lhs = gimple_get_lhs (stmt1);
7808               if (!lhs)
7809                 break;
7810
7811               /* LHS of vectorized stmt must be SSA_NAME.  */
7812               if (TREE_CODE (lhs) != SSA_NAME)
7813                 break;
7814
7815               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7816                 {
7817                   /* Remove dead scalar statement.  */
7818                   if (has_zero_uses (lhs))
7819                     {
7820                       gsi_remove (&gsi_from, true);
7821                       continue;
7822                     }
7823                 }
7824
7825               /* Check that LHS does not have uses outside of STORE_BB.  */
7826               res = true;
7827               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7828                 {
7829                   gimple *use_stmt;
7830                   use_stmt = USE_STMT (use_p);
7831                   if (is_gimple_debug (use_stmt))
7832                     continue;
7833                   if (gimple_bb (use_stmt) != store_bb)
7834                     {
7835                       res = false;
7836                       break;
7837                     }
7838                 }
7839               if (!res)
7840                 break;
7841
7842               if (gimple_vuse (stmt1)
7843                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
7844                 break;
7845
7846               /* Can move STMT1 to STORE_BB.  */
7847               if (dump_enabled_p ())
7848                 {
7849                   dump_printf_loc (MSG_NOTE, vect_location,
7850                                    "Move stmt to created bb\n");
7851                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7852                 }
7853               gsi_move_before (&gsi_from, &gsi_to);
7854               /* Shift GSI_TO for further insertion.  */
7855               gsi_prev (&gsi_to);
7856             }
7857           /* Put other masked stores with the same mask to STORE_BB.  */
7858           if (worklist.is_empty ()
7859               || gimple_call_arg (worklist.last (), 2) != mask
7860               || worklist.last () != stmt1)
7861             break;
7862           last = worklist.pop ();
7863         }
7864       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
7865     }
7866 }