gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2017 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52 #include "tree-if-conv.h"
  53
  54 /* Loop Vectorization Pass.
  55
  56    This pass tries to vectorize loops.
  57
  58    For example, the vectorizer transforms the following simple loop:
  59
  60         short a[N]; short b[N]; short c[N]; int i;
  61
  62         for (i=0; i<N; i++){
  63           a[i] = b[i] + c[i];
  64         }
  65
  66    as if it was manually vectorized by rewriting the source code into:
  67
  68         typedef int __attribute__((mode(V8HI))) v8hi;
  69         short a[N];  short b[N]; short c[N];   int i;
  70         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  71         v8hi va, vb, vc;
  72
  73         for (i=0; i<N/8; i++){
  74           vb = pb[i];
  75           vc = pc[i];
  76           va = vb + vc;
  77           pa[i] = va;
  78         }
  79
  80         The main entry to this pass is vectorize_loops(), in which
  81    the vectorizer applies a set of analyses on a given set of loops,
  82    followed by the actual vectorization transformation for the loops that
  83    had successfully passed the analysis phase.
  84         Throughout this pass we make a distinction between two types of
  85    data: scalars (which are represented by SSA_NAMES), and memory references
  86    ("data-refs").  These two types of data require different handling both
  87    during analysis and transformation. The types of data-refs that the
   88    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  89    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  90    accesses are required to have a simple (consecutive) access pattern.
  91
  92    Analysis phase:
  93    ===============
  94         The driver for the analysis phase is vect_analyze_loop().
  95    It applies a set of analyses, some of which rely on the scalar evolution
  96    analyzer (scev) developed by Sebastian Pop.
  97
  98         During the analysis phase the vectorizer records some information
  99    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 100    loop, as well as general information about the loop as a whole, which is
 101    recorded in a "loop_vec_info" struct attached to each loop.
 102
 103    Transformation phase:
 104    =====================
 105         The loop transformation phase scans all the stmts in the loop, and
 106    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 107    the loop that needs to be vectorized.  It inserts the vector code sequence
 108    just before the scalar stmt S, and records a pointer to the vector code
 109    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 110    attached to S).  This pointer will be used for the vectorization of following
 111    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 112    otherwise, we rely on dead code elimination for removing it.
 113
 114         For example, say stmt S1 was vectorized into stmt VS1:
 115
 116    VS1: vb = px[i];
 117    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 118    S2:  a = b;
 119
 120    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 121    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 122    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 123    resulting sequence would be:
 124
 125    VS1: vb = px[i];
 126    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 127    VS2: va = vb;
 128    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 129
 130         Operands that are not SSA_NAMEs, are data-refs that appear in
 131    load/store operations (like 'x[i]' in S1), and are handled differently.
 132
 133    Target modeling:
 134    =================
 135         Currently the only target specific information that is used is the
 136    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 137    Targets that can support different sizes of vectors, for now will need
 138    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 139    flexibility will be added in the future.
 140
  141         Since we only vectorize operations whose vector form can be
  142    expressed using existing tree codes, to verify that an operation is
  143    supported, the vectorizer checks the relevant optab at the relevant
  144    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 145    the value found is CODE_FOR_nothing, then there's no target support, and
 146    we can't vectorize the stmt.
 147
 148    For additional information on this project see:
 149    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 150 */
 151
      /* Forward declaration of a file-local (static) function; its definition
         is not in this part of the file.  It takes a loop_vec_info and two
         int pointers and returns nothing.
         NOTE(review): the two int pointers are presumably outputs -- confirm
         at the definition.  */
  152 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 153
  154 /* Function vect_determine_vectorization_factor
  155
  156    Determine the vectorization factor (VF).  VF is the number of data elements
  157    that are operated upon in parallel in a single iteration of the vectorized
  158    loop.  For example, when vectorizing a loop that operates on 4byte elements,
  159    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
  160    elements can fit in a single vector register.
  161
  162    The VF is the largest number of units found among the vector types chosen
  163    for the relevant or live phis and statements of the loop, i.e. it follows
  164    the smallest scalar type operated upon.  The function fails if no vector
  165    type exists for one of those scalar types, or if within one statement the
         statement's vector type and the vector type used for the VF have
         different mode sizes.
  166
  167    VF is also the factor by which the loop iterations are strip-mined, e.g.:
  168    original loop:
  169         for (i=0; i<N; i++){
  170           a[i] = b[i] + c[i];
  171         }
  172
  173    vectorized loop:
  174         for (i=0; i<N; i+=VF){
  175           a[i:VF] = b[i:VF] + c[i:VF];
  176         }

         On success, return true after recording the VF in
         LOOP_VINFO_VECT_FACTOR (LOOP_VINFO).  As a side effect,
         STMT_VINFO_VECTYPE is filled in for phis and statements analyzed here
         that did not have one yet.  Return false if an unsupported statement
         or data type is found, if a mask type cannot be computed for a
         boolean-producing statement, or if the resulting VF is <= 1.
  177 */
  178
  179 static bool
  180 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
  181 {
  182   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
  183   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
  184   unsigned nbbs = loop->num_nodes;
  185   unsigned int vectorization_factor = 0;
  186   tree scalar_type = NULL_TREE;
  187   gphi *phi;
  188   tree vectype;
  189   unsigned int nunits;
  190   stmt_vec_info stmt_info;
  191   unsigned i;
  192   HOST_WIDE_INT dummy;
  193   gimple *stmt, *pattern_stmt = NULL;
  194   gimple_seq pattern_def_seq = NULL;
  195   gimple_stmt_iterator pattern_def_si = gsi_none ();
  196   bool analyze_pattern_stmt = false;
  197   bool bool_result;
        /* Relevant or live statements with a boolean result; their vectype is
           chosen in the last loop of this function.  */
  198   auto_vec<stmt_vec_info> mask_producers;
  199
  200   if (dump_enabled_p ())
  201     dump_printf_loc (MSG_NOTE, vect_location,
  202                      "=== vect_determine_vectorization_factor ===\n");
  203
  204   for (i = 0; i < nbbs; i++)
  205     {
  206       basic_block bb = bbs[i];
  207
            /* Phis: give each relevant or live phi a vectype derived from the
               type of its result and fold that vectype's unit count into the
               VF.  */
  208       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
  209            gsi_next (&si))
  210         {
  211           phi = si.phi ();
  212           stmt_info = vinfo_for_stmt (phi);
  213           if (dump_enabled_p ())
  214             {
  215               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
  216               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
  217             }
  218
  219           gcc_assert (stmt_info);
  220
  221           if (STMT_VINFO_RELEVANT_P (stmt_info)
  222               || STMT_VINFO_LIVE_P (stmt_info))
  223             {
  224               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
  225               scalar_type = TREE_TYPE (PHI_RESULT (phi));
  226
  227               if (dump_enabled_p ())
  228                 {
  229                   dump_printf_loc (MSG_NOTE, vect_location,
  230                                    "get vectype for scalar type:  ");
  231                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  232                   dump_printf (MSG_NOTE, "\n");
  233                 }
  234
  235               vectype = get_vectype_for_scalar_type (scalar_type);
  236               if (!vectype)
  237                 {
  238                   if (dump_enabled_p ())
  239                     {
  240                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  241                                        "not vectorized: unsupported "
  242                                        "data-type ");
  243                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  244                                          scalar_type);
  245                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  246                     }
  247                   return false;
  248                 }
  249               STMT_VINFO_VECTYPE (stmt_info) = vectype;
  250
  251               if (dump_enabled_p ())
  252                 {
  253                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  254                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
  255                   dump_printf (MSG_NOTE, "\n");
  256                 }
  257
  258               nunits = TYPE_VECTOR_SUBPARTS (vectype);
  259               if (dump_enabled_p ())
  260                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
  261                                  nunits);
  262
  263               if (!vectorization_factor
  264                   || (nunits > vectorization_factor))
  265                 vectorization_factor = nunits;
  266             }
  267         }
  268
            /* Statements.  SI has no increment in the loop header; it is
               advanced explicitly: when a statement is skipped, or once no
               pattern statement or pattern def statement remains pending for
               the current statement.  */
  269       for (gimple_stmt_iterator si = gsi_start_bb (bb);
  270            !gsi_end_p (si) || analyze_pattern_stmt;)
  271         {
  272           tree vf_vectype;
  273
  274           if (analyze_pattern_stmt)
  275             stmt = pattern_stmt;
  276           else
  277             stmt = gsi_stmt (si);
  278
  279           stmt_info = vinfo_for_stmt (stmt);
  280
  281           if (dump_enabled_p ())
  282             {
  283               dump_printf_loc (MSG_NOTE, vect_location,
  284                                "==> examining statement: ");
  285               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
  286             }
  287
  288           gcc_assert (stmt_info);
  289
  290           /* Skip stmts which do not need to be vectorized.  */
  291           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
  292                && !STMT_VINFO_LIVE_P (stmt_info))
  293               || gimple_clobber_p (stmt))
  294             {
  295               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
  296                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
  297                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
  298                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
  299                 {
  300                   stmt = pattern_stmt;
  301                   stmt_info = vinfo_for_stmt (pattern_stmt);
  302                   if (dump_enabled_p ())
  303                     {
  304                       dump_printf_loc (MSG_NOTE, vect_location,
  305                                        "==> examining pattern statement: ");
  306                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
  307                     }
  308                 }
  309               else
  310                 {
  311                   if (dump_enabled_p ())
  312                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
  313                   gsi_next (&si);
  314                   continue;
  315                 }
  316             }
                /* The statement itself is analyzed now; if it also has a
                   relevant or live pattern statement, that one is analyzed on
                   the next iteration, before SI moves on.  */
  317           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
  318                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
  319                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
  320                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
  321             analyze_pattern_stmt = true;
  322
  323           /* If a pattern statement has def stmts, analyze them too.  */
  324           if (is_pattern_stmt_p (stmt_info))
  325             {
  326               if (pattern_def_seq == NULL)
  327                 {
  328                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
  329                   pattern_def_si = gsi_start (pattern_def_seq);
  330                 }
  331               else if (!gsi_end_p (pattern_def_si))
  332                 gsi_next (&pattern_def_si);
  333               if (pattern_def_seq != NULL)
  334                 {
  335                   gimple *pattern_def_stmt = NULL;
  336                   stmt_vec_info pattern_def_stmt_info = NULL;
  337
  338                   while (!gsi_end_p (pattern_def_si))
  339                     {
  340                       pattern_def_stmt = gsi_stmt (pattern_def_si);
  341                       pattern_def_stmt_info
  342                         = vinfo_for_stmt (pattern_def_stmt);
  343                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
  344                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
  345                         break;
  346                       gsi_next (&pattern_def_si);
  347                     }
  348
  349                   if (!gsi_end_p (pattern_def_si))
  350                     {
  351                       if (dump_enabled_p ())
  352                         {
  353                           dump_printf_loc (MSG_NOTE, vect_location,
  354                                            "==> examining pattern def stmt: ");
  355                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
  356                                             pattern_def_stmt, 0);
  357                         }
  358
  359                       stmt = pattern_def_stmt;
  360                       stmt_info = pattern_def_stmt_info;
  361                     }
  362                   else
  363                     {
  364                       pattern_def_si = gsi_none ();
  365                       analyze_pattern_stmt = false;
  366                     }
  367                 }
  368               else
  369                 analyze_pattern_stmt = false;
  370             }
  371
  372           if (gimple_get_lhs (stmt) == NULL_TREE
  373               /* MASK_STORE has no lhs, but is ok.  */
  374               && (!is_gimple_call (stmt)
  375                   || !gimple_call_internal_p (stmt)
  376                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
  377             {
  378               if (is_gimple_call (stmt))
  379                 {
  380                   /* Ignore calls with no lhs.  These must be calls to
  381                      #pragma omp simd functions, and what vectorization factor
  382                      it really needs can't be determined until
  383                      vectorizable_simd_clone_call.  */
  384                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
  385                     {
  386                       pattern_def_seq = NULL;
  387                       gsi_next (&si);
  388                     }
  389                   continue;
  390                 }
  391               if (dump_enabled_p ())
  392                 {
  393                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  394                                    "not vectorized: irregular stmt.");
  395                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
  396                                     0);
  397                 }
  398               return false;
  399             }
  400
  401           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
  402             {
  403               if (dump_enabled_p ())
  404                 {
  405                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  406                                    "not vectorized: vector stmt in loop:");
  407                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
  408                 }
  409               return false;
  410             }
  411
  412           bool_result = false;
  413
  414           if (STMT_VINFO_VECTYPE (stmt_info))
  415             {
  416               /* The only case when a vectype had been already set is for stmts
  417                  that contain a dataref, or for "pattern-stmts" (stmts
  418                  generated by the vectorizer to represent/replace a certain
  419                  idiom).  */
  420               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
  421                           || is_pattern_stmt_p (stmt_info)
  422                           || !gsi_end_p (pattern_def_si));
  423               vectype = STMT_VINFO_VECTYPE (stmt_info);
  424             }
  425           else
  426             {
  427               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
  428               if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
  429                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
  430               else
  431                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
  432
  433               /* Bool ops don't participate in vectorization factor
  434                  computation.  For comparison use compared types to
  435                  compute a factor.  */
  436               if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
  437                   && is_gimple_assign (stmt)
  438                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
  439                 {
  440                   if (STMT_VINFO_RELEVANT_P (stmt_info)
  441                       || STMT_VINFO_LIVE_P (stmt_info))
  442                     mask_producers.safe_push (stmt_info);
  443                   bool_result = true;
  444
  445                   if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
  446                       == tcc_comparison
  447                       && !VECT_SCALAR_BOOLEAN_TYPE_P
  448                             (TREE_TYPE (gimple_assign_rhs1 (stmt))))
  449                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
  450                   else
  451                     {
  452                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
  453                         {
  454                           pattern_def_seq = NULL;
  455                           gsi_next (&si);
  456                         }
  457                       continue;
  458                     }
  459                 }
  460
  461               if (dump_enabled_p ())
  462                 {
  463                   dump_printf_loc (MSG_NOTE, vect_location,
  464                                    "get vectype for scalar type:  ");
  465                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  466                   dump_printf (MSG_NOTE, "\n");
  467                 }
  468               vectype = get_vectype_for_scalar_type (scalar_type);
  469               if (!vectype)
  470                 {
  471                   if (dump_enabled_p ())
  472                     {
  473                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  474                                        "not vectorized: unsupported "
  475                                        "data-type ");
  476                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  477                                          scalar_type);
  478                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  479                     }
  480                   return false;
  481                 }
  482
                    /* For a boolean result VECTYPE is the vectype of the
                       compared operands; the statement's own vectype is set
                       later from MASK_PRODUCERS.  */
  483               if (!bool_result)
  484                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
  485
  486               if (dump_enabled_p ())
  487                 {
  488                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  489                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
  490                   dump_printf (MSG_NOTE, "\n");
  491                 }
  492             }
  493
  494           /* Don't try to compute VF from scalar types if the stmt
  495              produces a boolean vector.  Use the result vectype instead.  */
  496           if (VECTOR_BOOLEAN_TYPE_P (vectype))
  497             vf_vectype = vectype;
  498           else
  499             {
  500               /* The vectorization factor is according to the smallest
  501                  scalar type (or the largest vector size, but we only
  502                  support one vector size per loop).  */
  503               if (!bool_result)
  504                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
  505                                                              &dummy);
  506               if (dump_enabled_p ())
  507                 {
  508                   dump_printf_loc (MSG_NOTE, vect_location,
  509                                    "get vectype for scalar type:  ");
  510                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
  511                   dump_printf (MSG_NOTE, "\n");
  512                 }
  513               vf_vectype = get_vectype_for_scalar_type (scalar_type);
  514             }
  515           if (!vf_vectype)
  516             {
  517               if (dump_enabled_p ())
  518                 {
  519                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  520                                    "not vectorized: unsupported data-type ");
  521                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  522                                      scalar_type);
  523                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  524                 }
  525               return false;
  526             }
  527
  528           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
  529                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
  530             {
  531               if (dump_enabled_p ())
  532                 {
  533                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  534                                    "not vectorized: different sized vector "
  535                                    "types in statement, ");
  536                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  537                                      vectype);
  538                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
  539                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  540                                      vf_vectype);
  541                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  542                 }
  543               return false;
  544             }
  545
  546           if (dump_enabled_p ())
  547             {
  548               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
  549               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
  550               dump_printf (MSG_NOTE, "\n");
  551             }
  552
  553           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
  554           if (dump_enabled_p ())
  555             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
  556           if (!vectorization_factor
  557               || (nunits > vectorization_factor))
  558             vectorization_factor = nunits;
  559
  560           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
  561             {
  562               pattern_def_seq = NULL;
  563               gsi_next (&si);
  564             }
  565         }
  566     }
  567
  568   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
        /* NOTE(review): vectorization_factor is unsigned but is printed with
           %d here (as is nunits above).  */
  569   if (dump_enabled_p ())
  570     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
  571                      vectorization_factor);
  572   if (vectorization_factor <= 1)
  573     {
  574       if (dump_enabled_p ())
  575         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  576                          "not vectorized: unsupported data-type\n");
  577       return false;
  578     }
  579   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
  580
        /* Finally choose the vectype of each queued boolean-producing
           statement: for a comparison whose first operand is not boolean, the
           mask type for the type of that operand; otherwise a type derived
           from the vectypes of the statement's SSA operands, which must agree
           with each other.  */
  581   for (i = 0; i < mask_producers.length (); i++)
  582     {
  583       tree mask_type = NULL;
  584
  585       stmt = STMT_VINFO_STMT (mask_producers[i]);
  586
  587       if (is_gimple_assign (stmt)
  588           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
  589           && !VECT_SCALAR_BOOLEAN_TYPE_P
  590                                       (TREE_TYPE (gimple_assign_rhs1 (stmt))))
  591         {
  592           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
  593           mask_type = get_mask_type_for_scalar_type (scalar_type);
  594
  595           if (!mask_type)
  596             {
  597               if (dump_enabled_p ())
  598                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  599                                  "not vectorized: unsupported mask\n");
  600               return false;
  601             }
  602         }
  603       else
  604         {
  605           tree rhs;
  606           ssa_op_iter iter;
  607           gimple *def_stmt;
  608           enum vect_def_type dt;
  609
  610           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
  611             {
  612               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
  613                                        &def_stmt, &dt, &vectype))
  614                 {
  615                   if (dump_enabled_p ())
  616                     {
  617                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  618                                        "not vectorized: can't compute mask type "
  619                                        "for statement, ");
  620                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
  621                                         0);
  622                     }
  623                   return false;
  624                 }
  625
  626               /* No vectype probably means external definition.
  627                  Allow it in case there is another operand which
  628                  allows to determine mask type.  */
  629               if (!vectype)
  630                 continue;
  631
  632               if (!mask_type)
  633                 mask_type = vectype;
  634               else if (TYPE_VECTOR_SUBPARTS (mask_type)
  635                        != TYPE_VECTOR_SUBPARTS (vectype))
  636                 {
  637                   if (dump_enabled_p ())
  638                     {
  639                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  640                                        "not vectorized: different sized masks "
  641                                        "types in statement, ");
  642                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  643                                          mask_type);
  644                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
  645                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  646                                          vectype);
  647                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  648                     }
  649                   return false;
  650                 }
  651               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
  652                        != VECTOR_BOOLEAN_TYPE_P (vectype))
  653                 {
  654                   if (dump_enabled_p ())
  655                     {
  656                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  657                                        "not vectorized: mixed mask and "
  658                                        "nonmask vector types in statement, ");
  659                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  660                                          mask_type);
  661                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
  662                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
  663                                          vectype);
  664                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
  665                     }
  666                   return false;
  667                 }
  668             }
  669
  670           /* We may compare boolean value loaded as vector of integers.
  671              Fix mask_type in such case.  */
  672           if (mask_type
  673               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
  674               && gimple_code (stmt) == GIMPLE_ASSIGN
  675               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
  676             mask_type = build_same_sized_truth_vector_type (mask_type);
  677         }
  678
  679       /* No mask_type should mean loop invariant predicate.
  680          This is probably a subject for optimization in
  681          if-conversion.  */
  682       if (!mask_type)
  683         {
  684           if (dump_enabled_p ())
  685             {
  686               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
  687                                "not vectorized: can't compute mask type "
  688                                "for statement, ");
  689               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
  690                                 0);
  691             }
  692           return false;
  693         }
  694
  695       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
  696     }
  697
  698   return true;
  699 }
 700
 701
 702 /* Function vect_is_simple_iv_evolution.
 703
 704    FORNOW: A simple evolution of an induction variables in the loop is
 705    considered a polynomial evolution.  */
 706
 707 static bool
 708 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 709                              tree * step)
 710 {
 711   tree init_expr;
 712   tree step_expr;
 713   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 714   basic_block bb;
 715
 716   /* When there is no evolution in this loop, the evolution function
 717      is not "simple".  */
 718   if (evolution_part == NULL_TREE)
 719     return false;
 720
 721   /* When the evolution is a polynomial of degree >= 2
 722      the evolution function is not "simple".  */
 723   if (tree_is_chrec (evolution_part))
 724     return false;
 725
 726   step_expr = evolution_part;
 727   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 728
 729   if (dump_enabled_p ())
 730     {
 731       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 732       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 733       dump_printf (MSG_NOTE, ",  init: ");
 734       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 735       dump_printf (MSG_NOTE, "\n");
 736     }
 737
 738   *init = init_expr;
 739   *step = step_expr;
 740
 741   if (TREE_CODE (step_expr) != INTEGER_CST
 742       && (TREE_CODE (step_expr) != SSA_NAME
 743           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 744               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 745           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 746               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 747                   || !flag_associative_math)))
 748       && (TREE_CODE (step_expr) != REAL_CST
 749           || !flag_associative_math))
 750     {
 751       if (dump_enabled_p ())
 752         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 753                          "step unknown.\n");
 754       return false;
 755     }
 756
 757   return true;
 758 }
 759
 760 /* Function vect_analyze_scalar_cycles_1.
 761
 762    Examine the cross iteration def-use cycles of scalar variables
 763    in LOOP.  LOOP_VINFO represents the loop that is now being
 764    considered for vectorization (can be LOOP, or an outer-loop
 765    enclosing LOOP).  */
 766
 767 static void
 768 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 769 {
 770   basic_block bb = loop->header;
 771   tree init, step;
 772   auto_vec<gimple *, 64> worklist;
 773   gphi_iterator gsi;
 774   bool double_reduc;
 775
 776   if (dump_enabled_p ())
 777     dump_printf_loc (MSG_NOTE, vect_location,
 778                      "=== vect_analyze_scalar_cycles ===\n");
 779
 780   /* First - identify all inductions.  Reduction detection assumes that all the
 781      inductions have been identified, therefore, this order must not be
 782      changed.  */
 783   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 784     {
 785       gphi *phi = gsi.phi ();
 786       tree access_fn = NULL;
 787       tree def = PHI_RESULT (phi);
 788       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 789
 790       if (dump_enabled_p ())
 791         {
 792           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 793           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 794         }
 795
 796       /* Skip virtual phi's.  The data dependences that are associated with
 797          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 798       if (virtual_operand_p (def))
 799         continue;
 800
 801       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 802
 803       /* Analyze the evolution function.  */
 804       access_fn = analyze_scalar_evolution (loop, def);
 805       if (access_fn)
 806         {
 807           STRIP_NOPS (access_fn);
 808           if (dump_enabled_p ())
 809             {
 810               dump_printf_loc (MSG_NOTE, vect_location,
 811                                "Access function of PHI: ");
 812               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 813               dump_printf (MSG_NOTE, "\n");
 814             }
 815           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 816             = initial_condition_in_loop_num (access_fn, loop->num);
 817           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 818             = evolution_part_in_loop_num (access_fn, loop->num);
 819         }
 820
 821       if (!access_fn
 822           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 823           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 824               && TREE_CODE (step) != INTEGER_CST))
 825         {
 826           worklist.safe_push (phi);
 827           continue;
 828         }
 829
 830       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 831                   != NULL_TREE);
 832       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 833
 834       if (dump_enabled_p ())
 835         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 836       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 837     }
 838
 839
 840   /* Second - identify all reductions and nested cycles.  */
 841   while (worklist.length () > 0)
 842     {
 843       gimple *phi = worklist.pop ();
 844       tree def = PHI_RESULT (phi);
 845       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 846       gimple *reduc_stmt;
 847
 848       if (dump_enabled_p ())
 849         {
 850           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 851           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 852         }
 853
 854       gcc_assert (!virtual_operand_p (def)
 855                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 856
 857       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
 858                                                 &double_reduc, false);
 859       if (reduc_stmt)
 860         {
 861           if (double_reduc)
 862             {
 863               if (dump_enabled_p ())
 864                 dump_printf_loc (MSG_NOTE, vect_location,
 865                                  "Detected double reduction.\n");
 866
 867               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 868               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 869                                                     vect_double_reduction_def;
 870             }
 871           else
 872             {
 873               if (loop != LOOP_VINFO_LOOP (loop_vinfo))
 874                 {
 875                   if (dump_enabled_p ())
 876                     dump_printf_loc (MSG_NOTE, vect_location,
 877                                      "Detected vectorizable nested cycle.\n");
 878
 879                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 880                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 881                                                              vect_nested_cycle;
 882                 }
 883               else
 884                 {
 885                   if (dump_enabled_p ())
 886                     dump_printf_loc (MSG_NOTE, vect_location,
 887                                      "Detected reduction.\n");
 888
 889                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 890                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 891                                                            vect_reduction_def;
 892                   /* Store the reduction cycles for possible vectorization in
 893                      loop-aware SLP if it was not detected as reduction
 894                      chain.  */
 895                   if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
 896                     LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 897                 }
 898             }
 899         }
 900       else
 901         if (dump_enabled_p ())
 902           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 903                            "Unknown def-use cycle pattern.\n");
 904     }
 905 }
 906
 907
 908 /* Function vect_analyze_scalar_cycles.
 909
 910    Examine the cross iteration def-use cycles of scalar variables, by
 911    analyzing the loop-header PHIs of scalar variables.  Classify each
 912    cycle as one of the following: invariant, induction, reduction, unknown.
 913    We do that for the loop represented by LOOP_VINFO, and also to its
 914    inner-loop, if exists.
 915    Examples for scalar cycles:
 916
 917    Example1: reduction:
 918
 919               loop1:
 920               for (i=0; i<N; i++)
 921                  sum += a[i];
 922
 923    Example2: induction:
 924
 925               loop2:
 926               for (i=0; i<N; i++)
 927                  a[i] = i;  */
 928
 929 static void
 930 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 931 {
 932   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 933
 934   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 935
 936   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 937      Reductions in such inner-loop therefore have different properties than
 938      the reductions in the nest that gets vectorized:
 939      1. When vectorized, they are executed in the same order as in the original
 940         scalar loop, so we can't change the order of computation when
 941         vectorizing them.
 942      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 943         current checks are too strict.  */
 944
 945   if (loop->inner)
 946     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 947 }
 948
 949 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 950
 951 static void
 952 vect_fixup_reduc_chain (gimple *stmt)
 953 {
 954   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 955   gimple *stmtp;
 956   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 957               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 958   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 959   do
 960     {
 961       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 962       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 963       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 964       if (stmt)
 965         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 966           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 967     }
 968   while (stmt);
 969   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 970 }
 971
 972 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 973
 974 static void
 975 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 976 {
 977   gimple *first;
 978   unsigned i;
 979
 980   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 981     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 982       {
 983         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 984         while (next)
 985           {
 986             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 987               break;
 988             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 989           }
 990         /* If not all stmt in the chain are patterns try to handle
 991            the chain without patterns.  */
 992         if (! next)
 993           {
 994             vect_fixup_reduc_chain (first);
 995             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 996               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 997           }
 998       }
 999 }
1000
1001 /* Function vect_get_loop_niters.
1002
1003    Determine how many iterations the loop is executed and place it
1004    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1005    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1006    niter information holds in ASSUMPTIONS.
1007
1008    Return the loop exit condition.  */
1009
1010
1011 static gcond *
1012 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1013                       tree *number_of_iterations, tree *number_of_iterationsm1)
1014 {
1015   edge exit = single_exit (loop);
1016   struct tree_niter_desc niter_desc;
1017   tree niter_assumptions, niter, may_be_zero;
1018   gcond *cond = get_loop_exit_condition (loop);
1019
1020   *assumptions = boolean_true_node;
1021   *number_of_iterationsm1 = chrec_dont_know;
1022   *number_of_iterations = chrec_dont_know;
1023   if (dump_enabled_p ())
1024     dump_printf_loc (MSG_NOTE, vect_location,
1025                      "=== get_loop_niters ===\n");
1026
1027   if (!exit)
1028     return cond;
1029
1030   niter = chrec_dont_know;
1031   may_be_zero = NULL_TREE;
1032   niter_assumptions = boolean_true_node;
1033   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1034       || chrec_contains_undetermined (niter_desc.niter))
1035     return cond;
1036
1037   niter_assumptions = niter_desc.assumptions;
1038   may_be_zero = niter_desc.may_be_zero;
1039   niter = niter_desc.niter;
1040
1041   if (may_be_zero && integer_zerop (may_be_zero))
1042     may_be_zero = NULL_TREE;
1043
1044   if (may_be_zero)
1045     {
1046       if (COMPARISON_CLASS_P (may_be_zero))
1047         {
1048           /* Try to combine may_be_zero with assumptions, this can simplify
1049              computation of niter expression.  */
1050           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1051             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1052                                              niter_assumptions,
1053                                              fold_build1 (TRUTH_NOT_EXPR,
1054                                                           boolean_type_node,
1055                                                           may_be_zero));
1056           else
1057             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1058                                  build_int_cst (TREE_TYPE (niter), 0), niter);
1059
1060           may_be_zero = NULL_TREE;
1061         }
1062       else if (integer_nonzerop (may_be_zero))
1063         {
1064           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1065           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1066           return cond;
1067         }
1068       else
1069         return cond;
1070     }
1071
1072   *assumptions = niter_assumptions;
1073   *number_of_iterationsm1 = niter;
1074
1075   /* We want the number of loop header executions which is the number
1076      of latch executions plus one.
1077      ???  For UINT_MAX latch executions this number overflows to zero
1078      for loops like do { n++; } while (n != 0);  */
1079   if (niter && !chrec_contains_undetermined (niter))
1080     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1081                           build_int_cst (TREE_TYPE (niter), 1));
1082   *number_of_iterations = niter;
1083
1084   return cond;
1085 }
1086
1087 /* Function bb_in_loop_p
1088
1089    Used as predicate for dfs order traversal of the loop bbs.  */
1090
1091 static bool
1092 bb_in_loop_p (const_basic_block bb, const void *data)
1093 {
1094   const struct loop *const loop = (const struct loop *)data;
1095   if (flow_bb_inside_loop_p (loop, bb))
1096     return true;
1097   return false;
1098 }
1099
1100
1101 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1102    stmt_vec_info structs for all the stmts in LOOP_IN.  */
1103
1104 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1105   : vec_info (vec_info::loop, init_cost (loop_in)),
1106     loop (loop_in),
1107     bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1108     num_itersm1 (NULL_TREE),
1109     num_iters (NULL_TREE),
1110     num_iters_unchanged (NULL_TREE),
1111     num_iters_assumptions (NULL_TREE),
1112     th (0),
1113     vectorization_factor (0),
1114     max_vectorization_factor (0),
1115     unaligned_dr (NULL),
1116     peeling_for_alignment (0),
1117     ptr_mask (0),
1118     slp_unrolling_factor (1),
1119     single_scalar_iteration_cost (0),
1120     vectorizable (false),
1121     peeling_for_gaps (false),
1122     peeling_for_niter (false),
1123     operands_swapped (false),
1124     no_data_dependencies (false),
1125     has_mask_store (false),
1126     scalar_loop (NULL),
1127     orig_loop_info (NULL)
1128 {
1129   /* Create/Update stmt_info for all stmts in the loop.  */
1130   basic_block *body = get_loop_body (loop);
1131   for (unsigned int i = 0; i < loop->num_nodes; i++)
1132     {
1133       basic_block bb = body[i];
1134       gimple_stmt_iterator si;
1135
1136       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1137         {
1138           gimple *phi = gsi_stmt (si);
1139           gimple_set_uid (phi, 0);
1140           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1141         }
1142
1143       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1144         {
1145           gimple *stmt = gsi_stmt (si);
1146           gimple_set_uid (stmt, 0);
1147           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1148         }
1149     }
1150   free (body);
1151
1152   /* CHECKME: We want to visit all BBs before their successors (except for
1153      latch blocks, for which this assertion wouldn't hold).  In the simple
1154      case of the loop forms we allow, a dfs order of the BBs would the same
1155      as reversed postorder traversal, so we are safe.  */
1156
1157   unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1158                                           bbs, loop->num_nodes, loop);
1159   gcc_assert (nbbs == loop->num_nodes);
1160 }
1161
1162
1163 /* Free all memory used by the _loop_vec_info, as well as all the
1164    stmt_vec_info structs of all the stmts in the loop.  */
1165
1166 _loop_vec_info::~_loop_vec_info ()
1167 {
1168   int nbbs;
1169   gimple_stmt_iterator si;
1170   int j;
1171
1172   nbbs = loop->num_nodes;
1173   for (j = 0; j < nbbs; j++)
1174     {
1175       basic_block bb = bbs[j];
1176       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1177         free_stmt_vec_info (gsi_stmt (si));
1178
1179       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1180         {
1181           gimple *stmt = gsi_stmt (si);
1182
1183           /* We may have broken canonical form by moving a constant
1184              into RHS1 of a commutative op.  Fix such occurrences.  */
1185           if (operands_swapped && is_gimple_assign (stmt))
1186             {
1187               enum tree_code code = gimple_assign_rhs_code (stmt);
1188
1189               if ((code == PLUS_EXPR
1190                    || code == POINTER_PLUS_EXPR
1191                    || code == MULT_EXPR)
1192                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1193                 swap_ssa_operands (stmt,
1194                                    gimple_assign_rhs1_ptr (stmt),
1195                                    gimple_assign_rhs2_ptr (stmt));
1196               else if (code == COND_EXPR
1197                        && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1198                 {
1199                   tree cond_expr = gimple_assign_rhs1 (stmt);
1200                   enum tree_code cond_code = TREE_CODE (cond_expr);
1201
1202                   if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1203                     {
1204                       bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1205                                                                   0));
1206                       cond_code = invert_tree_comparison (cond_code,
1207                                                           honor_nans);
1208                       if (cond_code != ERROR_MARK)
1209                         {
1210                           TREE_SET_CODE (cond_expr, cond_code);
1211                           swap_ssa_operands (stmt,
1212                                              gimple_assign_rhs2_ptr (stmt),
1213                                              gimple_assign_rhs3_ptr (stmt));
1214                         }
1215                     }
1216                 }
1217             }
1218
1219           /* Free stmt_vec_info.  */
1220           free_stmt_vec_info (stmt);
1221           gsi_next (&si);
1222         }
1223     }
1224
1225   free (bbs);
1226
1227   loop->aux = NULL;
1228 }
1229
1230
1231 /* Calculate the cost of one scalar iteration of the loop.  */
1232 static void
1233 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1234 {
1235   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1236   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1237   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1238   int innerloop_iters, i;
1239
1240   /* Count statements in scalar loop.  Using this as scalar cost for a single
1241      iteration for now.
1242
1243      TODO: Add outer loop support.
1244
1245      TODO: Consider assigning different costs to different scalar
1246      statements.  */
1247
1248   /* FORNOW.  */
1249   innerloop_iters = 1;
1250   if (loop->inner)
1251     innerloop_iters = 50; /* FIXME */
1252
1253   for (i = 0; i < nbbs; i++)
1254     {
1255       gimple_stmt_iterator si;
1256       basic_block bb = bbs[i];
1257
1258       if (bb->loop_father == loop->inner)
1259         factor = innerloop_iters;
1260       else
1261         factor = 1;
1262
1263       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1264         {
1265           gimple *stmt = gsi_stmt (si);
1266           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1267
1268           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1269             continue;
1270
1271           /* Skip stmts that are not vectorized inside the loop.  */
1272           if (stmt_info
1273               && !STMT_VINFO_RELEVANT_P (stmt_info)
1274               && (!STMT_VINFO_LIVE_P (stmt_info)
1275                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1276               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1277             continue;
1278
1279           vect_cost_for_stmt kind;
1280           if (STMT_VINFO_DATA_REF (stmt_info))
1281             {
1282               if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1283                kind = scalar_load;
1284              else
1285                kind = scalar_store;
1286             }
1287           else
1288             kind = scalar_stmt;
1289
1290           scalar_single_iter_cost
1291             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1292                                  factor, kind, stmt_info, 0, vect_prologue);
1293         }
1294     }
1295   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1296     = scalar_single_iter_cost;
1297 }
1298
1299
1300 /* Function vect_analyze_loop_form_1.
1301
1302    Verify that certain CFG restrictions hold, including:
1303    - the loop has a pre-header
1304    - the loop has a single entry and exit
1305    - the loop exit condition is simple enough
1306    - the number of iterations can be analyzed, i.e, a countable loop.  The
1307      niter could be analyzed under some assumptions.  */
1308
1309 bool
1310 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1311                           tree *assumptions, tree *number_of_iterationsm1,
1312                           tree *number_of_iterations, gcond **inner_loop_cond)
1313 {
1314   if (dump_enabled_p ())
1315     dump_printf_loc (MSG_NOTE, vect_location,
1316                      "=== vect_analyze_loop_form ===\n");
1317
1318   /* Different restrictions apply when we are considering an inner-most loop,
1319      vs. an outer (nested) loop.
1320      (FORNOW. May want to relax some of these restrictions in the future).  */
1321
1322   if (!loop->inner)
1323     {
1324       /* Inner-most loop.  We currently require that the number of BBs is
1325          exactly 2 (the header and latch).  Vectorizable inner-most loops
1326          look like this:
1327
1328                         (pre-header)
1329                            |
1330                           header <--------+
1331                            | |            |
1332                            | +--> latch --+
1333                            |
1334                         (exit-bb)  */
1335
1336       if (loop->num_nodes != 2)
1337         {
1338           if (dump_enabled_p ())
1339             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1340                              "not vectorized: control flow in loop.\n");
1341           return false;
1342         }
1343
1344       if (empty_block_p (loop->header))
1345         {
1346           if (dump_enabled_p ())
1347             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1348                              "not vectorized: empty loop.\n");
1349           return false;
1350         }
1351     }
1352   else
1353     {
1354       struct loop *innerloop = loop->inner;
1355       edge entryedge;
1356
1357       /* Nested loop. We currently require that the loop is doubly-nested,
1358          contains a single inner loop, and the number of BBs is exactly 5.
1359          Vectorizable outer-loops look like this:
1360
1361                         (pre-header)
1362                            |
1363                           header <---+
1364                            |         |
1365                           inner-loop |
1366                            |         |
1367                           tail ------+
1368                            |
1369                         (exit-bb)
1370
1371          The inner-loop has the properties expected of inner-most loops
1372          as described above.  */
1373
1374       if ((loop->inner)->inner || (loop->inner)->next)
1375         {
1376           if (dump_enabled_p ())
1377             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1378                              "not vectorized: multiple nested loops.\n");
1379           return false;
1380         }
1381
1382       if (loop->num_nodes != 5)
1383         {
1384           if (dump_enabled_p ())
1385             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1386                              "not vectorized: control flow in loop.\n");
1387           return false;
1388         }
1389
1390       entryedge = loop_preheader_edge (innerloop);
1391       if (entryedge->src != loop->header
1392           || !single_exit (innerloop)
1393           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1394         {
1395           if (dump_enabled_p ())
1396             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1397                              "not vectorized: unsupported outerloop form.\n");
1398           return false;
1399         }
1400
1401       /* Analyze the inner-loop.  */
1402       tree inner_niterm1, inner_niter, inner_assumptions;
1403       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1404                                       &inner_assumptions, &inner_niterm1,
1405                                       &inner_niter, NULL)
1406           /* Don't support analyzing niter under assumptions for inner
1407              loop.  */
1408           || !integer_onep (inner_assumptions))
1409         {
1410           if (dump_enabled_p ())
1411             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1412                              "not vectorized: Bad inner loop.\n");
1413           return false;
1414         }
1415
1416       if (!expr_invariant_in_loop_p (loop, inner_niter))
1417         {
1418           if (dump_enabled_p ())
1419             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1420                              "not vectorized: inner-loop count not"
1421                              " invariant.\n");
1422           return false;
1423         }
1424
1425       if (dump_enabled_p ())
1426         dump_printf_loc (MSG_NOTE, vect_location,
1427                          "Considering outer-loop vectorization.\n");
1428     }
1429
1430   if (!single_exit (loop)
1431       || EDGE_COUNT (loop->header->preds) != 2)
1432     {
1433       if (dump_enabled_p ())
1434         {
1435           if (!single_exit (loop))
1436             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1437                              "not vectorized: multiple exits.\n");
1438           else if (EDGE_COUNT (loop->header->preds) != 2)
1439             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1440                              "not vectorized: too many incoming edges.\n");
1441         }
1442       return false;
1443     }
1444
1445   /* We assume that the loop exit condition is at the end of the loop. i.e,
1446      that the loop is represented as a do-while (with a proper if-guard
1447      before the loop if needed), where the loop header contains all the
1448      executable statements, and the latch is empty.  */
1449   if (!empty_block_p (loop->latch)
1450       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1451     {
1452       if (dump_enabled_p ())
1453         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1454                          "not vectorized: latch block not empty.\n");
1455       return false;
1456     }
1457
1458   /* Make sure the exit is not abnormal.  */
1459   edge e = single_exit (loop);
1460   if (e->flags & EDGE_ABNORMAL)
1461     {
1462       if (dump_enabled_p ())
1463         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464                          "not vectorized: abnormal loop exit edge.\n");
1465       return false;
1466     }
1467
1468   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1469                                      number_of_iterationsm1);
1470   if (!*loop_cond)
1471     {
1472       if (dump_enabled_p ())
1473         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1474                          "not vectorized: complicated exit condition.\n");
1475       return false;
1476     }
1477
1478   if (integer_zerop (*assumptions)
1479       || !*number_of_iterations
1480       || chrec_contains_undetermined (*number_of_iterations))
1481     {
1482       if (dump_enabled_p ())
1483         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1484                          "not vectorized: number of iterations cannot be "
1485                          "computed.\n");
1486       return false;
1487     }
1488
1489   if (integer_zerop (*number_of_iterations))
1490     {
1491       if (dump_enabled_p ())
1492         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1493                          "not vectorized: number of iterations = 0.\n");
1494       return false;
1495     }
1496
1497   return true;
1498 }
1499
1500 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1501
1502 loop_vec_info
1503 vect_analyze_loop_form (struct loop *loop)
1504 {
1505   tree assumptions, number_of_iterations, number_of_iterationsm1;
1506   gcond *loop_cond, *inner_loop_cond = NULL;
1507
1508   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1509                                   &assumptions, &number_of_iterationsm1,
1510                                   &number_of_iterations, &inner_loop_cond))
1511     return NULL;
1512
1513   loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1514   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1515   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1516   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1517   if (!integer_onep (assumptions))
1518     {
1519       /* We consider to vectorize this loop by versioning it under
1520          some assumptions.  In order to do this, we need to clear
1521          existing information computed by scev and niter analyzer.  */
1522       scev_reset_htab ();
1523       free_numbers_of_iterations_estimates (loop);
1524       /* Also set flag for this loop so that following scev and niter
1525          analysis are done under the assumptions.  */
1526       loop_constraint_set (loop, LOOP_C_FINITE);
1527       /* Also record the assumptions for versioning.  */
1528       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1529     }
1530
1531   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1532     {
1533       if (dump_enabled_p ())
1534         {
1535           dump_printf_loc (MSG_NOTE, vect_location,
1536                            "Symbolic number of iterations is ");
1537           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1538           dump_printf (MSG_NOTE, "\n");
1539         }
1540     }
1541
1542   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1543   if (inner_loop_cond)
1544     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1545       = loop_exit_ctrl_vec_info_type;
1546
1547   gcc_assert (!loop->aux);
1548   loop->aux = loop_vinfo;
1549   return loop_vinfo;
1550 }
1551
1552
1553
1554 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1555    statements update the vectorization factor.  */
1556
1557 static void
1558 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1559 {
1560   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1561   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1562   int nbbs = loop->num_nodes;
1563   unsigned int vectorization_factor;
1564   int i;
1565
1566   if (dump_enabled_p ())
1567     dump_printf_loc (MSG_NOTE, vect_location,
1568                      "=== vect_update_vf_for_slp ===\n");
1569
1570   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1571   gcc_assert (vectorization_factor != 0);
1572
1573   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1574      vectorization factor of the loop is the unrolling factor required by
1575      the SLP instances.  If that unrolling factor is 1, we say, that we
1576      perform pure SLP on loop - cross iteration parallelism is not
1577      exploited.  */
1578   bool only_slp_in_loop = true;
1579   for (i = 0; i < nbbs; i++)
1580     {
1581       basic_block bb = bbs[i];
1582       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1583            gsi_next (&si))
1584         {
1585           gimple *stmt = gsi_stmt (si);
1586           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1587           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1588               && STMT_VINFO_RELATED_STMT (stmt_info))
1589             {
1590               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1591               stmt_info = vinfo_for_stmt (stmt);
1592             }
1593           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1594                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1595               && !PURE_SLP_STMT (stmt_info))
1596             /* STMT needs both SLP and loop-based vectorization.  */
1597             only_slp_in_loop = false;
1598         }
1599     }
1600
1601   if (only_slp_in_loop)
1602     {
1603       dump_printf_loc (MSG_NOTE, vect_location,
1604                        "Loop contains only SLP stmts\n");
1605       vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1606     }
1607   else
1608     {
1609       dump_printf_loc (MSG_NOTE, vect_location,
1610                        "Loop contains SLP and non-SLP stmts\n");
1611       vectorization_factor
1612         = least_common_multiple (vectorization_factor,
1613                                  LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1614     }
1615
1616   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1617   if (dump_enabled_p ())
1618     dump_printf_loc (MSG_NOTE, vect_location,
1619                      "Updating vectorization factor to %d\n",
1620                      vectorization_factor);
1621 }
1622
1623 /* Function vect_analyze_loop_operations.
1624
1625    Scan the loop stmts and make sure they are all vectorizable.  */
1626
1627 static bool
1628 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1629 {
1630   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1631   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1632   int nbbs = loop->num_nodes;
1633   int i;
1634   stmt_vec_info stmt_info;
1635   bool need_to_vectorize = false;
1636   bool ok;
1637
1638   if (dump_enabled_p ())
1639     dump_printf_loc (MSG_NOTE, vect_location,
1640                      "=== vect_analyze_loop_operations ===\n");
1641
1642   for (i = 0; i < nbbs; i++)
1643     {
1644       basic_block bb = bbs[i];
1645
1646       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1647            gsi_next (&si))
1648         {
1649           gphi *phi = si.phi ();
1650           ok = true;
1651
1652           stmt_info = vinfo_for_stmt (phi);
1653           if (dump_enabled_p ())
1654             {
1655               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1656               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1657             }
1658           if (virtual_operand_p (gimple_phi_result (phi)))
1659             continue;
1660
1661           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1662              (i.e., a phi in the tail of the outer-loop).  */
1663           if (! is_loop_header_bb_p (bb))
1664             {
1665               /* FORNOW: we currently don't support the case that these phis
1666                  are not used in the outerloop (unless it is double reduction,
1667                  i.e., this phi is vect_reduction_def), cause this case
1668                  requires to actually do something here.  */
1669               if (STMT_VINFO_LIVE_P (stmt_info)
1670                   && STMT_VINFO_DEF_TYPE (stmt_info)
1671                      != vect_double_reduction_def)
1672                 {
1673                   if (dump_enabled_p ())
1674                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1675                                      "Unsupported loop-closed phi in "
1676                                      "outer-loop.\n");
1677                   return false;
1678                 }
1679
1680               /* If PHI is used in the outer loop, we check that its operand
1681                  is defined in the inner loop.  */
1682               if (STMT_VINFO_RELEVANT_P (stmt_info))
1683                 {
1684                   tree phi_op;
1685                   gimple *op_def_stmt;
1686
1687                   if (gimple_phi_num_args (phi) != 1)
1688                     return false;
1689
1690                   phi_op = PHI_ARG_DEF (phi, 0);
1691                   if (TREE_CODE (phi_op) != SSA_NAME)
1692                     return false;
1693
1694                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1695                   if (gimple_nop_p (op_def_stmt)
1696                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1697                       || !vinfo_for_stmt (op_def_stmt))
1698                     return false;
1699
1700                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1701                         != vect_used_in_outer
1702                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1703                            != vect_used_in_outer_by_reduction)
1704                     return false;
1705                 }
1706
1707               continue;
1708             }
1709
1710           gcc_assert (stmt_info);
1711
1712           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1713                || STMT_VINFO_LIVE_P (stmt_info))
1714               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1715             {
1716               /* A scalar-dependence cycle that we don't support.  */
1717               if (dump_enabled_p ())
1718                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719                                  "not vectorized: scalar dependence cycle.\n");
1720               return false;
1721             }
1722
1723           if (STMT_VINFO_RELEVANT_P (stmt_info))
1724             {
1725               need_to_vectorize = true;
1726               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1727                   && ! PURE_SLP_STMT (stmt_info))
1728                 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1729               else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1730                         || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1731                        && ! PURE_SLP_STMT (stmt_info))
1732                 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1733             }
1734
1735           if (ok && STMT_VINFO_LIVE_P (stmt_info))
1736             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1737
1738           if (!ok)
1739             {
1740               if (dump_enabled_p ())
1741                 {
1742                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1743                                    "not vectorized: relevant phi not "
1744                                    "supported: ");
1745                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1746                 }
1747               return false;
1748             }
1749         }
1750
1751       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1752            gsi_next (&si))
1753         {
1754           gimple *stmt = gsi_stmt (si);
1755           if (!gimple_clobber_p (stmt)
1756               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1757             return false;
1758         }
1759     } /* bbs */
1760
1761   /* All operations in the loop are either irrelevant (deal with loop
1762      control, or dead), or only used outside the loop and can be moved
1763      out of the loop (e.g. invariants, inductions).  The loop can be
1764      optimized away by scalar optimizations.  We're better off not
1765      touching this loop.  */
1766   if (!need_to_vectorize)
1767     {
1768       if (dump_enabled_p ())
1769         dump_printf_loc (MSG_NOTE, vect_location,
1770                          "All the computation can be taken out of the loop.\n");
1771       if (dump_enabled_p ())
1772         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1773                          "not vectorized: redundant loop. no profit to "
1774                          "vectorize.\n");
1775       return false;
1776     }
1777
1778   return true;
1779 }
1780
1781
/* Function vect_analyze_loop_2.

   Run the vectorization analyses on the loop described by LOOP_VINFO,
   recording their results in LOOP_VINFO.  Return true if the loop can be
   vectorized and false otherwise.

   FATAL is set to true on entry and cleared once all the checks that are
   independent of the vector size have passed; a false return with FATAL
   still true therefore comes from one of those size-independent checks.

   If SLP was chosen and certain later checks fail, the SLP-related state
   is rolled back and the analysis is repeated once with SLP disabled (see
   the `again' and `start_over' labels).  */
static bool
vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
{
  bool ok;
  int max_vf = MAX_VECTORIZATION_FACTOR;
  int min_vf = 2;
  /* Number of non-debug statements in the loop; handed to
     vect_analyze_slp below.  */
  unsigned int n_stmts = 0;

  /* The first group of checks is independent of the vector size.  */
  fatal = true;

  /* Find all data references in the loop (which correspond to vdefs/vuses)
     and analyze their evolution in the loop.  */

  basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);

  loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
  if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: loop nest containing two "
                         "or more consecutive inner loops cannot be "
                         "vectorized\n");
      return false;
    }

  for (unsigned i = 0; i < loop->num_nodes; i++)
    for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
         !gsi_end_p (gsi); gsi_next (&gsi))
      {
        gimple *stmt = gsi_stmt (gsi);
        if (is_gimple_debug (stmt))
          continue;
        ++n_stmts;
        if (!find_data_references_in_stmt (loop, stmt,
                                           &LOOP_VINFO_DATAREFS (loop_vinfo)))
          {
            /* The statement could not be analyzed.  The one tolerated
               case is a call, in a loop with a safelen, to a function
               that has simd clones and whose arguments and lhs involve
               no memory references.  */
            if (is_gimple_call (stmt) && loop->safelen)
              {
                tree fndecl = gimple_call_fndecl (stmt), op;
                if (fndecl != NULL_TREE)
                  {
                    cgraph_node *node = cgraph_node::get (fndecl);
                    if (node != NULL && node->simd_clones != NULL)
                      {
                        unsigned int j, n = gimple_call_num_args (stmt);
                        /* Stop at the first argument that is a decl or a
                           reference with a base address; J == N afterwards
                           means no such argument exists.  */
                        for (j = 0; j < n; j++)
                          {
                            op = gimple_call_arg (stmt, j);
                            if (DECL_P (op)
                                || (REFERENCE_CLASS_P (op)
                                    && get_base_address (op)))
                              break;
                          }
                        op = gimple_call_lhs (stmt);
                        /* Ignore #pragma omp declare simd functions
                           if they don't have data references in the
                           call stmt itself.  */
                        if (j == n
                            && !(op
                                 && (DECL_P (op)
                                     || (REFERENCE_CLASS_P (op)
                                         && get_base_address (op)))))
                          continue;
                      }
                  }
              }
            if (dump_enabled_p ())
              dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                               "not vectorized: loop contains function "
                               "calls or data references that cannot "
                               "be analyzed\n");
            return false;
          }
      }

  /* Analyze the data references and also adjust the minimal
     vectorization factor according to the loads and stores.  */

  ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data references.\n");
      return false;
    }

  /* Classify all cross-iteration scalar data-flow cycles.
     Cross-iteration cycles caused by virtual phis are analyzed separately.  */
  vect_analyze_scalar_cycles (loop_vinfo);

  vect_pattern_recog (loop_vinfo);

  vect_fixup_scalar_cycles_with_patterns (loop_vinfo);

  /* Analyze the access patterns of the data-refs in the loop (consecutive,
     complex, etc.). FORNOW: Only handle consecutive access pattern.  */

  ok = vect_analyze_data_ref_accesses (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data access.\n");
      return false;
    }

  /* Data-flow analysis to detect stmts that do not need to be vectorized.  */

  ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "unexpected pattern.\n");
      return false;
    }

  /* The remaining analyses depend on the vector size in some way, so a
     failure from here on is no longer reported as fatal.
     NOTE(review): presumably the caller uses FATAL to decide whether
     retrying with a different vector size is worthwhile -- confirm in
     vect_analyze_loop.  */
  fatal = false;

  /* Analyze data dependences between the data-refs in the loop
     and adjust the maximum vectorization factor according to
     the dependences.
     FORNOW: fail at the first data dependence that we encounter.  */

  ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
  if (!ok
      || max_vf < min_vf)
    {
      if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "bad data dependence.\n");
      return false;
    }
  LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;

  ok = vect_determine_vectorization_factor (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "can't determine vectorization factor.\n");
      return false;
    }
  /* The chosen factor must not exceed what the dependences allow.  */
  if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data dependence.\n");
      return false;
    }

  /* Compute the scalar iteration cost.  */
  vect_compute_single_scalar_iteration_cost (loop_vinfo);

  /* Remember the factor computed without SLP so that the retry path at
     `again' can restore it.  */
  int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  HOST_WIDE_INT estimated_niter;
  unsigned th;
  int min_scalar_loop_bound;

  /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
  ok = vect_analyze_slp (loop_vinfo, n_stmts);
  if (!ok)
    return false;

  /* If there are any SLP instances mark them as pure_slp.  */
  bool slp = vect_make_slp_decision (loop_vinfo);
  if (slp)
    {
      /* Find stmts that need to be both vectorized and SLPed.  */
      vect_detect_hybrid_slp (loop_vinfo);

      /* Update the vectorization factor based on the SLP decision.  */
      vect_update_vf_for_slp (loop_vinfo);
    }

  /* This is the point where we can re-start analysis with SLP forced off.  */
start_over:

  /* Now the vectorization factor is final.  */
  unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
  gcc_assert (vectorization_factor != 0);

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "vectorization_factor = %d, niters = "
                     HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
                     LOOP_VINFO_INT_NITERS (loop_vinfo));

  /* Give up if the known iteration count, or the likely upper bound on it
     (MAX_NITER, where -1 is treated as "no bound available"), is smaller
     than the vectorization factor.  */
  HOST_WIDE_INT max_niter
    = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
  if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
       && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
      || (max_niter != -1
          && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: iteration count smaller than "
                         "vectorization factor.\n");
      return false;
    }

  /* Analyze the alignment of the data-refs in the loop.
     Fail if a data reference is found that cannot be vectorized.  */

  ok = vect_analyze_data_refs_alignment (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad data alignment.\n");
      return false;
    }

  /* Prune the list of ddrs to be tested at run-time by versioning for alias.
     It is important to call pruning after vect_analyze_data_ref_accesses,
     since we use grouping information gathered by interleaving analysis.  */
  ok = vect_prune_runtime_alias_test_list (loop_vinfo);
  if (!ok)
    return false;

  /* Do not invoke vect_enhance_data_refs_alignment for epilogue
     vectorization.  */
  if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
    {
    /* This pass will decide on using loop versioning and/or loop peeling in
       order to enhance the alignment of data references in the loop.  */
    ok = vect_enhance_data_refs_alignment (loop_vinfo);
    if (!ok)
      {
        if (dump_enabled_p ())
          dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                           "bad data alignment.\n");
        return false;
      }
    }

  if (slp)
    {
      /* Analyze operations in the SLP instances.  Note this may
         remove unsupported SLP instances which makes the above
         SLP kind detection invalid.  */
      unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
      vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
                                   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
      /* If any instance was dropped, retry the analysis without SLP.  */
      if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
        goto again;
    }

  /* Scan all the remaining operations in the loop that are not subject
     to SLP and make sure they are vectorizable.  */
  ok = vect_analyze_loop_operations (loop_vinfo);
  if (!ok)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "bad operation or unsupported loop bound.\n");
      return false;
    }

  /* If epilog loop is required because of data accesses with gaps,
     one additional iteration needs to be peeled.  Check if there is
     enough iterations for vectorization.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
      && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
    {
      int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      /* Note this compares NITERSM1, not NITERS, against VF.  */
      tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);

      if (wi::to_widest (scalar_niters) < vf)
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_NOTE, vect_location,
                             "loop has no enough iterations to support"
                             " peeling for gaps.\n");
          return false;
        }
    }

  /* Analyze cost.  Decide if worth while to vectorize.  */
  int min_profitable_estimate, min_profitable_iters;
  vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
                                      &min_profitable_estimate);

  /* A negative result is reported below as "never profitable".  */
  if (min_profitable_iters < 0)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vector version will never be "
                         "profitable.\n");
      goto again;
    }

  min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
                           * vectorization_factor);

  /* Use the cost model only if it is more conservative than user specified
     threshold.  */
  th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);

  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: vectorization not profitable.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "not vectorized: iteration count smaller than user "
                         "specified loop bound parameter or minimum profitable "
                         "iterations (whichever is more conservative).\n");
      goto again;
    }

  /* Fall back to the likely maximum when no estimate is available
     (the estimate is -1).  */
  estimated_niter
    = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
  if (estimated_niter == -1)
    estimated_niter = max_niter;
  if (estimated_niter != -1
      && ((unsigned HOST_WIDE_INT) estimated_niter
          < MAX (th, (unsigned) min_profitable_estimate)))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                         "not vectorized: estimated iteration count too "
                         "small.\n");
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location,
                         "not vectorized: estimated iteration count smaller "
                         "than specified loop bound parameter or minimum "
                         "profitable iterations (whichever is more "
                         "conservative).\n");
      goto again;
    }

  /* Decide whether we need to create an epilogue loop to handle
     remaining scalar iterations.  TH becomes the cost-model threshold
     rounded down to a multiple of the vectorization factor.  */
  th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo)
         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
        * LOOP_VINFO_VECT_FACTOR (loop_vinfo));

  if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
      && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
    {
      /* NOTE(review): the trailing-zero test appears to check that the
         iterations left after the alignment prologue are not a multiple
         of the vectorization factor, assuming the factor is a power of
         two -- confirm.  */
      if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
                   - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
          < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
        LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
    }
  else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
           || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
               < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
               /* In case of versioning, check if the maximum number of
                  iterations is greater than th.  If they are identical,
                  the epilogue is unnecessary.  */
               && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
                   || (unsigned HOST_WIDE_INT) max_niter > th)))
    LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;

  /* If an epilogue loop is required make sure we can create one.  */
  if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
      || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
    {
      if (dump_enabled_p ())
        dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
      if (!vect_can_advance_ivs_p (loop_vinfo)
          || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
                                           single_exit (LOOP_VINFO_LOOP
                                                         (loop_vinfo))))
        {
          if (dump_enabled_p ())
            dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
                             "not vectorized: can't create required "
                             "epilog loop\n");
          goto again;
        }
    }

  /* During peeling, we need to check if number of loop iterations is
     enough for both peeled prolog loop and vector loop.  This check
     can be merged along with threshold check of loop versioning, so
     increase threshold for this case if necessary.  */
  if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
      && (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
          || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
    {
      unsigned niters_th;

      /* Niters for peeled prolog loop.  A negative peeling amount is
         handled by assuming the worst case of one vector minus one
         element.  */
      if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
        {
          struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
          tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));

          niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
        }
      else
        niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);

      /* Niters for at least one iteration of vectorized loop.  */
      niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
      /* One additional iteration because of peeling for gap.  */
      if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
        niters_th++;
      if (LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) < niters_th)
        LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = niters_th;
    }

  /* The factor read at `start_over' must still be the one recorded in
     LOOP_VINFO.  */
  gcc_assert (vectorization_factor
              == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));

  /* Ok to vectorize!  */
  return true;

again:
  /* Try again with SLP forced off but if we didn't do any SLP there is
     no point in re-trying.  Since SLP is cleared below before jumping
     back, this path retries at most once.  */
  if (!slp)
    return false;

  /* If there are reduction chains re-trying will fail anyway.  */
  if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
    return false;

  /* Likewise if the grouped loads or stores in the SLP cannot be handled
     via interleaving or lane instructions.  */
  slp_instance instance;
  slp_tree node;
  unsigned i, j;
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
    {
      stmt_vec_info vinfo;
      vinfo = vinfo_for_stmt
          (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
      /* Only instances rooted at a grouped access need checking.  */
      if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
        continue;
      vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
      unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
      tree vectype = STMT_VINFO_VECTYPE (vinfo);
      if (! vect_store_lanes_supported (vectype, size)
          && ! vect_grouped_store_supported (vectype, size))
        return false;
      FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
        {
          vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
          vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
          bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
          size = STMT_VINFO_GROUP_SIZE (vinfo);
          vectype = STMT_VINFO_VECTYPE (vinfo);
          if (! vect_load_lanes_supported (vectype, size)
              && ! vect_grouped_load_supported (vectype, single_element_p,
                                                size))
            return false;
        }
    }

  if (dump_enabled_p ())
    dump_printf_loc (MSG_NOTE, vect_location,
                     "re-trying with SLP disabled\n");

  /* Roll back state appropriately.  No SLP this time.  */
  slp = false;
  /* Restore vectorization factor as it were without SLP.  */
  LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
  /* Free the SLP instances.  */
  FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
    vect_free_slp_instance (instance);
  LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
  /* Reset SLP type to loop_vect on all stmts.  */
  for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
    {
      basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
      for (gimple_stmt_iterator si = gsi_start_phis (bb);
           !gsi_end_p (si); gsi_next (&si))
        {
          stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
          STMT_SLP_TYPE (stmt_info) = loop_vect;
        }
      for (gimple_stmt_iterator si = gsi_start_bb (bb);
           !gsi_end_p (si); gsi_next (&si))
        {
          stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
          STMT_SLP_TYPE (stmt_info) = loop_vect;
          /* Also reset the related pattern statement and every statement
             in its pattern definition sequence.  */
          if (STMT_VINFO_IN_PATTERN_P (stmt_info))
            {
              stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
              STMT_SLP_TYPE (stmt_info) = loop_vect;
              for (gimple_stmt_iterator pi
                     = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
                   !gsi_end_p (pi); gsi_next (&pi))
                {
                  gimple *pstmt = gsi_stmt (pi);
                  STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
                }
            }
        }
    }
  /* Free optimized alias test DDRS.  */
  LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
  LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
  /* Reset target cost data.  */
  destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
  LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
    = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
  /* Reset assorted flags.  */
  LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
  LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
  LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;

  goto start_over;
}
2307
2308 /* Function vect_analyze_loop.
2309
2310    Run the vectorization analyses on LOOP and return the loop_vec_info
2311    they filled in, or NULL if no attempt succeeds.  When ORIG_LOOP_VINFO
2312    is not NULL (the epilogue must be vectorized) it is recorded in the
2313    new loop_vec_info through LOOP_VINFO_ORIG_LOOP_INFO.  */
2314 loop_vec_info
2315 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2316 {
2317   /* Size 0 requests autodetection of the first vector size we try.  */
2318   current_vector_size = 0;
2319   unsigned int remaining_sizes
2320     = targetm.vectorize.autovectorize_vector_sizes ();
2321
2322   if (dump_enabled_p ())
2323     dump_printf_loc (MSG_NOTE, vect_location,
2324                      "===== analyze_loop_nest =====\n");
2325
2326   /* Skip LOOP when its outer loop is already marked vectorizable.  */
2327   struct loop *outer = loop_outer (loop);
2328   if (outer)
2329     {
2330       loop_vec_info outer_vinfo = loop_vec_info_for_loop (outer);
2331       if (outer_vinfo && LOOP_VINFO_VECTORIZABLE_P (outer_vinfo))
2332         {
2333           if (dump_enabled_p ())
2334             dump_printf_loc (MSG_NOTE, vect_location,
2335                              "outer-loop already vectorized.\n");
2336           return NULL;
2337         }
2338     }
2339
2340   for (;;)
2341     {
2342       /* Check the CFG shape of the loop (nesting, entry/exit).  */
2343       loop_vec_info candidate = vect_analyze_loop_form (loop);
2344       if (candidate == NULL)
2345         {
2346           if (dump_enabled_p ())
2347             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2348                              "bad loop form.\n");
2349           return NULL;
2350         }
2351
2352       if (orig_loop_vinfo)
2353         LOOP_VINFO_ORIG_LOOP_INFO (candidate) = orig_loop_vinfo;
2354
2355       bool fatal = false;
2356       if (vect_analyze_loop_2 (candidate, fatal))
2357         {
2358           LOOP_VINFO_VECTORIZABLE_P (candidate) = 1;
2359           return candidate;
2360         }
2361
2362       /* The attempt failed: discard it and retire the size just tried.  */
2363       delete candidate;
2364       remaining_sizes &= ~current_vector_size;
2365       /* Stop when FATAL was set, when no size is left, or when
2366          current_vector_size is still 0.  */
2367       if (fatal || remaining_sizes == 0 || current_vector_size == 0)
2368         return NULL;
2369
2370       /* Retry with the largest size remaining in the mask.  */
2371       current_vector_size = 1 << floor_log2 (remaining_sizes);
2372       if (dump_enabled_p ())
2373         dump_printf_loc (MSG_NOTE, vect_location,
2374                          "***** Re-trying analysis with "
2375                          "vector size %d\n", current_vector_size);
2376     }
2377 }
2378
2379
2380 /* Function reduction_code_for_scalar_code
2381
2382    Map the scalar reduction operation CODE to the tree code used to
2383    reduce a vector of partial results into a single scalar result.
2384    On success store that code in *REDUC_CODE and return TRUE; ERROR_MARK
2385    is stored when CODE is a supported reduction operation that has no
2386    such tree code.  Return FALSE, leaving *REDUC_CODE untouched, if CODE
2387    currently cannot be vectorized as a reduction.  */
2388
2389 static bool
2390 reduction_code_for_scalar_code (enum tree_code code,
2391                                 enum tree_code *reduc_code)
2392 {
2393   enum tree_code vec_reduc;
2394
2395   switch (code)
2396     {
2397     case MAX_EXPR:
2398       vec_reduc = REDUC_MAX_EXPR;
2399       break;
2400     case MIN_EXPR:
2401       vec_reduc = REDUC_MIN_EXPR;
2402       break;
2403     case PLUS_EXPR:
2404       vec_reduc = REDUC_PLUS_EXPR;
2405       break;
2406
2407     /* Supported, but with no dedicated reduction tree code.  */
2408     case MULT_EXPR:
2409     case MINUS_EXPR:
2410     case BIT_IOR_EXPR:
2411     case BIT_XOR_EXPR:
2412     case BIT_AND_EXPR:
2413       vec_reduc = ERROR_MARK;
2414       break;
2415
2416     default:
2417       return false;
2418     }
2419
2420   *reduc_code = vec_reduc;
2421   return true;
2422 }
2423
2424
2425 /* Reporting helper for vect_is_simple_reduction below.  Print MSG at
2426    vect_location, then GIMPLE statement STMT (TDF_SLIM), both as MSG_TYPE.  */
2427
2428 static void
2429 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2430 {
2431   dump_printf_loc (msg_type, vect_location, "%s", msg);
2432   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2433 }
2434
2435
2436 /* Detect SLP reduction of the form:
2437
2438    #a1 = phi <a5, a0>
2439    a2 = operation (a1)
2440    a3 = operation (a2)
2441    a4 = operation (a3)
2442    a5 = operation (a4)
2443
2444    #a = phi <a5>
2445
2446    PHI is the reduction phi node (#a1 = phi <a5, a0> above).  FIRST_STMT is
2447    a reduction stmt of the chain; only its rhs code is read here, and every
2448    chain stmt must use that code.  Chain stmts are linked via
2449    GROUP_FIRST_ELEMENT and GROUP_NEXT_ELEMENT (links made before a failing
2450    return stay set) and may get their operands swapped.
2451    Return TRUE if a chain of two or more stmts was detected and recorded.  */
2452 static bool
2453 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2454                        gimple *first_stmt)
2455 {
2456   struct loop *loop = (gimple_bb (phi))->loop_father;
2457   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2458   enum tree_code code;
2459   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2460   stmt_vec_info use_stmt_info, current_stmt_info;
2461   tree lhs;
2462   imm_use_iterator imm_iter;
2463   use_operand_p use_p;
2464   int nloop_uses, size = 0, n_out_of_loop_uses;
2465   bool found = false;
2466
2467   if (loop != vect_loop)
2468     return false;  /* PHI is not directly in the loop being vectorized.  */
2469
2470   lhs = PHI_RESULT (phi);
2471   code = gimple_assign_rhs_code (first_stmt);
2472   while (1)  /* Follow in-loop uses of LHS until they lead back to PHI.  */
2473     {
2474       nloop_uses = 0;
2475       n_out_of_loop_uses = 0;
2476       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2477         {
2478           gimple *use_stmt = USE_STMT (use_p);
2479           if (is_gimple_debug (use_stmt))
2480             continue;
2481
2482           /* Check if we got back to the reduction phi.  */
2483           if (use_stmt == phi)
2484             {
2485               loop_use_stmt = use_stmt;
2486               found = true;
2487               break;
2488             }
2489
2490           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2491             {
2492               loop_use_stmt = use_stmt;
2493               nloop_uses++;
2494             }
2495            else
2496              n_out_of_loop_uses++;
2497
2498            /* There can be either a single use in the loop or two uses in
2499               phi nodes.  */
2500            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2501              return false;
2502         }
2503
2504       if (found)
2505         break;
2506
2507       /* We reached a statement with no loop uses.  */
2508       if (nloop_uses == 0)
2509         return false;
2510
2511       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2512       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2513         return false;
2514
2515       if (!is_gimple_assign (loop_use_stmt)
2516           || code != gimple_assign_rhs_code (loop_use_stmt)
2517           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2518         return false;
2519
2520       /* Append LOOP_USE_STMT to the reduction chain.  */
2521       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2522       if (current_stmt)
2523         {
2524           current_stmt_info = vinfo_for_stmt (current_stmt);
2525           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2526           GROUP_FIRST_ELEMENT (use_stmt_info)
2527             = GROUP_FIRST_ELEMENT (current_stmt_info);
2528         }
2529       else
2530         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2531
2532       lhs = gimple_assign_lhs (loop_use_stmt);
2533       current_stmt = loop_use_stmt;
2534       size++;
2535    }
2536
2537   if (!found || loop_use_stmt != phi || size < 2)
2538     return false;  /* Not a closed chain of two or more stmts.  */
2539
2540   /* Swap the operands, if needed, to make the reduction operand be the second
2541      operand.  */
2542   lhs = PHI_RESULT (phi);
2543   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2544   while (next_stmt)
2545     {
2546       if (gimple_assign_rhs2 (next_stmt) == lhs)
2547         {
2548           tree op = gimple_assign_rhs1 (next_stmt);
2549           gimple *def_stmt = NULL;
2550
2551           if (TREE_CODE (op) == SSA_NAME)
2552             def_stmt = SSA_NAME_DEF_STMT (op);
2553
2554           /* Check that the other def is either defined in the loop
2555              ("vect_internal_def"), or it's an induction (defined by a
2556              loop-header phi-node).  */
2557           if (def_stmt
2558               && gimple_bb (def_stmt)
2559               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2560               && (is_gimple_assign (def_stmt)
2561                   || is_gimple_call (def_stmt)
2562                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2563                            == vect_induction_def
2564                   || (gimple_code (def_stmt) == GIMPLE_PHI
2565                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2566                                   == vect_internal_def
2567                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2568             {
2569               lhs = gimple_assign_lhs (next_stmt);
2570               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2571               continue;
2572             }
2573
2574           return false;
2575         }
2576       else  /* rhs2 is not LHS; presumably rhs1 is the reduction operand.  */
2577         {
2578           tree op = gimple_assign_rhs2 (next_stmt);
2579           gimple *def_stmt = NULL;
2580
2581           if (TREE_CODE (op) == SSA_NAME)
2582             def_stmt = SSA_NAME_DEF_STMT (op);
2583
2584           /* Check that the other def is either defined in the loop
2585             ("vect_internal_def"), or it's an induction (defined by a
2586             loop-header phi-node).  */
2587           if (def_stmt
2588               && gimple_bb (def_stmt)
2589               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2590               && (is_gimple_assign (def_stmt)
2591                   || is_gimple_call (def_stmt)
2592                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2593                               == vect_induction_def
2594                   || (gimple_code (def_stmt) == GIMPLE_PHI
2595                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2596                                   == vect_internal_def
2597                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2598             {
2599               if (dump_enabled_p ())
2600                 {
2601                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2602                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2603                 }
2604
2605               swap_ssa_operands (next_stmt,
2606                                  gimple_assign_rhs1_ptr (next_stmt),
2607                                  gimple_assign_rhs2_ptr (next_stmt));
2608               update_stmt (next_stmt);
2609               /* Flag the swap when it moved a constant into rhs1.  */
2610               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2611                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2612             }
2613           else
2614             return false;
2615         }
2616       /* Step on; LHS is what the next chain stmt is checked against.  */
2617       lhs = gimple_assign_lhs (next_stmt);
2618       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2619     }
2620
2621   /* Save the chain for further analysis in SLP detection.  */
2622   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2623   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2624   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2625
2626   return true;
2627 }
2628
2629
2630 /* Function vect_is_simple_reduction
2631
2632    (1) Detect a cross-iteration def-use cycle that represents a simple
2633    reduction computation.  We look for the following pattern:
2634
2635    loop_header:
2636      a1 = phi < a0, a2 >
2637      a3 = ...
2638      a2 = operation (a3, a1)
2639
2640    or
2641
2642    a3 = ...
2643    loop_header:
2644      a1 = phi < a0, a2 >
2645      a2 = operation (a3, a1)
2646
2647    such that:
2648    1. operation is commutative and associative and it is safe to
2649       change the order of the computation
2650    2. no uses for a2 in the loop (a2 is used out of the loop)
2651    3. no uses of a1 in the loop besides the reduction operation
2652    4. no uses of a1 outside the loop.
2653
2654    Conditions 1,4 are tested here.
2655    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2656
2657    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2658    nested cycles.
2659
2660    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2661    reductions:
2662
2663      a1 = phi < a0, a2 >
2664      inner loop (def of a3)
2665      a2 = phi < a3 >
2666
2667    (4) Detect condition expressions, i.e.:
2668      for (int i = 0; i < N; i++)
2669        if (a[i] < val)
2670         ret_val = a[i];
2671
2672 */
2673
2674 static gimple *
2675 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2676                           bool *double_reduc,
2677                           bool need_wrapping_integral_overflow,
2678                           enum vect_reduction_type *v_reduc_type)
2679 {
2680   struct loop *loop = (gimple_bb (phi))->loop_father;
2681   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2682   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2683   enum tree_code orig_code, code;
2684   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2685   tree type;
2686   int nloop_uses;
2687   tree name;
2688   imm_use_iterator imm_iter;
2689   use_operand_p use_p;
2690   bool phi_def;
2691
2692   *double_reduc = false;
2693   *v_reduc_type = TREE_CODE_REDUCTION;
2694
2695   tree phi_name = PHI_RESULT (phi);
2696   /* ???  If there are no uses of the PHI result the inner loop reduction
2697      won't be detected as possibly double-reduction by vectorizable_reduction
2698      because that tries to walk the PHI arg from the preheader edge which
2699      can be constant.  See PR60382.  */
2700   if (has_zero_uses (phi_name))
2701     return NULL;
2702   nloop_uses = 0;
2703   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2704     {
2705       gimple *use_stmt = USE_STMT (use_p);
2706       if (is_gimple_debug (use_stmt))
2707         continue;
2708
2709       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2710         {
2711           if (dump_enabled_p ())
2712             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2713                              "intermediate value used outside loop.\n");
2714
2715           return NULL;
2716         }
2717
2718       nloop_uses++;
2719       if (nloop_uses > 1)
2720         {
2721           if (dump_enabled_p ())
2722             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2723                              "reduction value used in loop.\n");
2724           return NULL;
2725         }
2726
2727       phi_use_stmt = use_stmt;
2728     }
2729
2730   edge latch_e = loop_latch_edge (loop);
2731   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2732   if (TREE_CODE (loop_arg) != SSA_NAME)
2733     {
2734       if (dump_enabled_p ())
2735         {
2736           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2737                            "reduction: not ssa_name: ");
2738           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2739           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2740         }
2741       return NULL;
2742     }
2743
2744   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2745   if (is_gimple_assign (def_stmt))
2746     {
2747       name = gimple_assign_lhs (def_stmt);
2748       phi_def = false;
2749     }
2750   else if (gimple_code (def_stmt) == GIMPLE_PHI)
2751     {
2752       name = PHI_RESULT (def_stmt);
2753       phi_def = true;
2754     }
2755   else
2756     {
2757       if (dump_enabled_p ())
2758         {
2759           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760                            "reduction: unhandled reduction operation: ");
2761           dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2762         }
2763       return NULL;
2764     }
2765
2766   if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2767     return NULL;
2768
2769   nloop_uses = 0;
2770   auto_vec<gphi *, 3> lcphis;
2771   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2772     {
2773       gimple *use_stmt = USE_STMT (use_p);
2774       if (is_gimple_debug (use_stmt))
2775         continue;
2776       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2777         nloop_uses++;
2778       else
2779         /* We can have more than one loop-closed PHI.  */
2780         lcphis.safe_push (as_a <gphi *> (use_stmt));
2781       if (nloop_uses > 1)
2782         {
2783           if (dump_enabled_p ())
2784             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2785                              "reduction used in loop.\n");
2786           return NULL;
2787         }
2788     }
2789
2790   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2791      defined in the inner loop.  */
2792   if (phi_def)
2793     {
2794       op1 = PHI_ARG_DEF (def_stmt, 0);
2795
2796       if (gimple_phi_num_args (def_stmt) != 1
2797           || TREE_CODE (op1) != SSA_NAME)
2798         {
2799           if (dump_enabled_p ())
2800             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2801                              "unsupported phi node definition.\n");
2802
2803           return NULL;
2804         }
2805
2806       def1 = SSA_NAME_DEF_STMT (op1);
2807       if (gimple_bb (def1)
2808           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2809           && loop->inner
2810           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2811           && is_gimple_assign (def1)
2812           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2813         {
2814           if (dump_enabled_p ())
2815             report_vect_op (MSG_NOTE, def_stmt,
2816                             "detected double reduction: ");
2817
2818           *double_reduc = true;
2819           return def_stmt;
2820         }
2821
2822       return NULL;
2823     }
2824
2825   /* If we are vectorizing an inner reduction we are executing that
2826      in the original order only in case we are not dealing with a
2827      double reduction.  */
2828   bool check_reduction = true;
2829   if (flow_loop_nested_p (vect_loop, loop))
2830     {
2831       gphi *lcphi;
2832       unsigned i;
2833       check_reduction = false;
2834       FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2835         FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2836           {
2837             gimple *use_stmt = USE_STMT (use_p);
2838             if (is_gimple_debug (use_stmt))
2839               continue;
2840             if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2841               check_reduction = true;
2842           }
2843     }
2844
2845   bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2846   code = orig_code = gimple_assign_rhs_code (def_stmt);
2847
2848   /* We can handle "res -= x[i]", which is non-associative by
2849      simply rewriting this into "res += -x[i]".  Avoid changing
2850      gimple instruction for the first simple tests and only do this
2851      if we're allowed to change code at all.  */
2852   if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2853     code = PLUS_EXPR;
2854
2855   if (code == COND_EXPR)
2856     {
2857       if (! nested_in_vect_loop)
2858         *v_reduc_type = COND_REDUCTION;
2859
2860       op3 = gimple_assign_rhs1 (def_stmt);
2861       if (COMPARISON_CLASS_P (op3))
2862         {
2863           op4 = TREE_OPERAND (op3, 1);
2864           op3 = TREE_OPERAND (op3, 0);
2865         }
2866       if (op3 == phi_name || op4 == phi_name)
2867         {
2868           if (dump_enabled_p ())
2869             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2870                             "reduction: condition depends on previous"
2871                             " iteration: ");
2872           return NULL;
2873         }
2874
2875       op1 = gimple_assign_rhs2 (def_stmt);
2876       op2 = gimple_assign_rhs3 (def_stmt);
2877     }
2878   else if (!commutative_tree_code (code) || !associative_tree_code (code))
2879     {
2880       if (dump_enabled_p ())
2881         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2882                         "reduction: not commutative/associative: ");
2883       return NULL;
2884     }
2885   else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2886     {
2887       op1 = gimple_assign_rhs1 (def_stmt);
2888       op2 = gimple_assign_rhs2 (def_stmt);
2889     }
2890   else
2891     {
2892       if (dump_enabled_p ())
2893         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2894                         "reduction: not handled operation: ");
2895       return NULL;
2896     }
2897
2898   if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2899     {
2900       if (dump_enabled_p ())
2901         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2902                         "reduction: both uses not ssa_names: ");
2903
2904       return NULL;
2905     }
2906
2907   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2908   if ((TREE_CODE (op1) == SSA_NAME
2909        && !types_compatible_p (type,TREE_TYPE (op1)))
2910       || (TREE_CODE (op2) == SSA_NAME
2911           && !types_compatible_p (type, TREE_TYPE (op2)))
2912       || (op3 && TREE_CODE (op3) == SSA_NAME
2913           && !types_compatible_p (type, TREE_TYPE (op3)))
2914       || (op4 && TREE_CODE (op4) == SSA_NAME
2915           && !types_compatible_p (type, TREE_TYPE (op4))))
2916     {
2917       if (dump_enabled_p ())
2918         {
2919           dump_printf_loc (MSG_NOTE, vect_location,
2920                            "reduction: multiple types: operation type: ");
2921           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2922           dump_printf (MSG_NOTE, ", operands types: ");
2923           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2924                              TREE_TYPE (op1));
2925           dump_printf (MSG_NOTE, ",");
2926           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2927                              TREE_TYPE (op2));
2928           if (op3)
2929             {
2930               dump_printf (MSG_NOTE, ",");
2931               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2932                                  TREE_TYPE (op3));
2933             }
2934
2935           if (op4)
2936             {
2937               dump_printf (MSG_NOTE, ",");
2938               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2939                                  TREE_TYPE (op4));
2940             }
2941           dump_printf (MSG_NOTE, "\n");
2942         }
2943
2944       return NULL;
2945     }
2946
2947   /* Check that it's ok to change the order of the computation.
2948      Generally, when vectorizing a reduction we change the order of the
2949      computation.  This may change the behavior of the program in some
2950      cases, so we need to check that this is ok.  One exception is when
2951      vectorizing an outer-loop: the inner-loop is executed sequentially,
2952      and therefore vectorizing reductions in the inner-loop during
2953      outer-loop vectorization is safe.  */
2954
2955   if (*v_reduc_type != COND_REDUCTION
2956       && check_reduction)
2957     {
2958       /* CHECKME: check for !flag_finite_math_only too?  */
2959       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2960         {
2961           /* Changing the order of operations changes the semantics.  */
2962           if (dump_enabled_p ())
2963             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2964                         "reduction: unsafe fp math optimization: ");
2965           return NULL;
2966         }
2967       else if (INTEGRAL_TYPE_P (type))
2968         {
2969           if (!operation_no_trapping_overflow (type, code))
2970             {
2971               /* Changing the order of operations changes the semantics.  */
2972               if (dump_enabled_p ())
2973                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2974                                 "reduction: unsafe int math optimization"
2975                                 " (overflow traps): ");
2976               return NULL;
2977             }
2978           if (need_wrapping_integral_overflow
2979               && !TYPE_OVERFLOW_WRAPS (type)
2980               && operation_can_overflow (code))
2981             {
2982               /* Changing the order of operations changes the semantics.  */
2983               if (dump_enabled_p ())
2984                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2985                                 "reduction: unsafe int math optimization"
2986                                 " (overflow doesn't wrap): ");
2987               return NULL;
2988             }
2989         }
2990       else if (SAT_FIXED_POINT_TYPE_P (type))
2991         {
2992           /* Changing the order of operations changes the semantics.  */
2993           if (dump_enabled_p ())
2994           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2995                           "reduction: unsafe fixed-point math optimization: ");
2996           return NULL;
2997         }
2998     }
2999
3000   /* Reduction is safe. We're dealing with one of the following:
3001      1) integer arithmetic and no trapv
3002      2) floating point arithmetic, and special flags permit this optimization
3003      3) nested cycle (i.e., outer loop vectorization).  */
3004   if (TREE_CODE (op1) == SSA_NAME)
3005     def1 = SSA_NAME_DEF_STMT (op1);
3006
3007   if (TREE_CODE (op2) == SSA_NAME)
3008     def2 = SSA_NAME_DEF_STMT (op2);
3009
3010   if (code != COND_EXPR
3011       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3012     {
3013       if (dump_enabled_p ())
3014         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3015       return NULL;
3016     }
3017
3018   /* Check that one def is the reduction def, defined by PHI,
3019      the other def is either defined in the loop ("vect_internal_def"),
3020      or it's an induction (defined by a loop-header phi-node).  */
3021
3022   if (def2 && def2 == phi
3023       && (code == COND_EXPR
3024           || !def1 || gimple_nop_p (def1)
3025           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3026           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3027               && (is_gimple_assign (def1)
3028                   || is_gimple_call (def1)
3029                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3030                       == vect_induction_def
3031                   || (gimple_code (def1) == GIMPLE_PHI
3032                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3033                           == vect_internal_def
3034                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
3035     {
3036       if (dump_enabled_p ())
3037         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3038       return def_stmt;
3039     }
3040
3041   if (def1 && def1 == phi
3042       && (code == COND_EXPR
3043           || !def2 || gimple_nop_p (def2)
3044           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3045           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3046               && (is_gimple_assign (def2)
3047                   || is_gimple_call (def2)
3048                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3049                        == vect_induction_def
3050                   || (gimple_code (def2) == GIMPLE_PHI
3051                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3052                            == vect_internal_def
3053                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3054     {
3055       if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3056         {
3057           /* Check if we can swap operands (just for simplicity - so that
3058              the rest of the code can assume that the reduction variable
3059              is always the last (second) argument).  */
3060           if (code == COND_EXPR)
3061             {
3062               /* Swap cond_expr by inverting the condition.  */
3063               tree cond_expr = gimple_assign_rhs1 (def_stmt);
3064               enum tree_code invert_code = ERROR_MARK;
3065               enum tree_code cond_code = TREE_CODE (cond_expr);
3066
3067               if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3068                 {
3069                   bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3070                   invert_code = invert_tree_comparison (cond_code, honor_nans);
3071                 }
3072               if (invert_code != ERROR_MARK)
3073                 {
3074                   TREE_SET_CODE (cond_expr, invert_code);
3075                   swap_ssa_operands (def_stmt,
3076                                      gimple_assign_rhs2_ptr (def_stmt),
3077                                      gimple_assign_rhs3_ptr (def_stmt));
3078                 }
3079               else
3080                 {
3081                   if (dump_enabled_p ())
3082                     report_vect_op (MSG_NOTE, def_stmt,
3083                                     "detected reduction: cannot swap operands "
3084                                     "for cond_expr");
3085                   return NULL;
3086                 }
3087             }
3088           else
3089             swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3090                                gimple_assign_rhs2_ptr (def_stmt));
3091
3092           if (dump_enabled_p ())
3093             report_vect_op (MSG_NOTE, def_stmt,
3094                             "detected reduction: need to swap operands: ");
3095
3096           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3097             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3098         }
3099       else
3100         {
3101           if (dump_enabled_p ())
3102             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3103         }
3104
3105       return def_stmt;
3106     }
3107
3108   /* Try to find SLP reduction chain.  */
3109   if (! nested_in_vect_loop
3110       && code != COND_EXPR
3111       && orig_code != MINUS_EXPR
3112       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3113     {
3114       if (dump_enabled_p ())
3115         report_vect_op (MSG_NOTE, def_stmt,
3116                         "reduction: detected reduction chain: ");
3117
3118       return def_stmt;
3119     }
3120
3121   /* Dissolve a group possibly left half-built by vect_is_slp_reduction.  */
3122   gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3123   while (first)
3124     {
3125       gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3126       GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3127       GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3128       first = next;
3129     }
3130
3131   /* Look for the expression computing loop_arg from loop PHI result.  */
3132   auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3133   auto_bitmap visited;
3134   tree lookfor = PHI_RESULT (phi);
3135   ssa_op_iter curri;
3136   use_operand_p curr = op_iter_init_phiuse (&curri, as_a <gphi *>(phi),
3137                                             SSA_OP_USE);
3138   while (USE_FROM_PTR (curr) != loop_arg)
3139     curr = op_iter_next_use (&curri);
3140   curri.i = curri.numops;
3141   do
3142     {
3143       path.safe_push (std::make_pair (curri, curr));
3144       tree use = USE_FROM_PTR (curr);
3145       if (use == lookfor)
3146         break;
3147       gimple *def = SSA_NAME_DEF_STMT (use);
3148       if (gimple_nop_p (def)
3149           || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3150         {
3151 pop:
3152           do
3153             {
3154               std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3155               curri = x.first;
3156               curr = x.second;
3157               do
3158                 curr = op_iter_next_use (&curri);
3159               /* Skip already visited or non-SSA operands (from iterating
3160                  over PHI args).  */
3161               while (curr != NULL_USE_OPERAND_P
3162                      && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3163                          || ! bitmap_set_bit (visited,
3164                                               SSA_NAME_VERSION
3165                                                 (USE_FROM_PTR (curr)))));
3166             }
3167           while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3168           if (curr == NULL_USE_OPERAND_P)
3169             break;
3170         }
3171       else
3172         {
3173           if (gimple_code (def) == GIMPLE_PHI)
3174             curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3175           else
3176             curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3177           while (curr != NULL_USE_OPERAND_P
3178                  && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3179                      || ! bitmap_set_bit (visited,
3180                                           SSA_NAME_VERSION
3181                                             (USE_FROM_PTR (curr)))))
3182             curr = op_iter_next_use (&curri);
3183           if (curr == NULL_USE_OPERAND_P)
3184             goto pop;
3185         }
3186     }
3187   while (1);
3188   if (dump_file && (dump_flags & TDF_DETAILS))
3189     {
3190       dump_printf_loc (MSG_NOTE, vect_location,
3191                        "reduction path: ");
3192       unsigned i;
3193       std::pair<ssa_op_iter, use_operand_p> *x;
3194       FOR_EACH_VEC_ELT (path, i, x)
3195         {
3196           dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3197           dump_printf (MSG_NOTE, " ");
3198         }
3199       dump_printf (MSG_NOTE, "\n");
3200     }
3201
3202   /* Check whether the reduction path detected is valid.  */
3203   bool fail = path.length () == 0;
3204   bool neg = false;
3205   for (unsigned i = 1; i < path.length (); ++i)
3206     {
3207       gimple *use_stmt = USE_STMT (path[i].second);
3208       tree op = USE_FROM_PTR (path[i].second);
3209       if (! has_single_use (op)
3210           || ! is_gimple_assign (use_stmt))
3211         {
3212           fail = true;
3213           break;
3214         }
3215       if (gimple_assign_rhs_code (use_stmt) != code)
3216         {
3217           if (code == PLUS_EXPR
3218               && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3219             {
3220               /* Track whether we negate the reduction value each iteration.  */
3221               if (gimple_assign_rhs2 (use_stmt) == op)
3222                 neg = ! neg;
3223             }
3224           else
3225             {
3226               fail = true;
3227               break;
3228             }
3229         }
3230     }
3231   if (! fail && ! neg)
3232     return def_stmt;
3233
3234   if (dump_enabled_p ())
3235     {
3236       report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3237                       "reduction: unknown pattern: ");
3238     }
3239
3240   return NULL;
3241 }
3242
3243 /* Wrapper around vect_is_simple_reduction, which will modify code
3244    in-place if it enables detection of more reductions.  Arguments
3245    as there.  */
3246
3247 gimple *
3248 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3249                              bool *double_reduc,
3250                              bool need_wrapping_integral_overflow)
3251 {
3252   enum vect_reduction_type v_reduc_type;
3253   gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3254                                           need_wrapping_integral_overflow,
3255                                           &v_reduc_type);
3256   if (def)
3257     {
3258       stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3259       STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3260       STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3261       reduc_def_info = vinfo_for_stmt (def);
3262       STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3263     }
3264   return def;
3265 }
3266
3267 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3268 int
3269 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3270                              int *peel_iters_epilogue,
3271                              stmt_vector_for_cost *scalar_cost_vec,
3272                              stmt_vector_for_cost *prologue_cost_vec,
3273                              stmt_vector_for_cost *epilogue_cost_vec)
3274 {
3275   int retval = 0;
3276   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3277
3278   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3279     {
3280       *peel_iters_epilogue = vf/2;
3281       if (dump_enabled_p ())
3282         dump_printf_loc (MSG_NOTE, vect_location,
3283                          "cost model: epilogue peel iters set to vf/2 "
3284                          "because loop iterations are unknown .\n");
3285
3286       /* If peeled iterations are known but number of scalar loop
3287          iterations are unknown, count a taken branch per peeled loop.  */
3288       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3289                                  NULL, 0, vect_prologue);
3290       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3291                                  NULL, 0, vect_epilogue);
3292     }
3293   else
3294     {
3295       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3296       peel_iters_prologue = niters < peel_iters_prologue ?
3297                             niters : peel_iters_prologue;
3298       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3299       /* If we need to peel for gaps, but no peeling is required, we have to
3300          peel VF iterations.  */
3301       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3302         *peel_iters_epilogue = vf;
3303     }
3304
3305   stmt_info_for_cost *si;
3306   int j;
3307   if (peel_iters_prologue)
3308     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3309         {
3310           stmt_vec_info stmt_info
3311             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3312           retval += record_stmt_cost (prologue_cost_vec,
3313                                       si->count * peel_iters_prologue,
3314                                       si->kind, stmt_info, si->misalign,
3315                                       vect_prologue);
3316         }
3317   if (*peel_iters_epilogue)
3318     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3319         {
3320           stmt_vec_info stmt_info
3321             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3322           retval += record_stmt_cost (epilogue_cost_vec,
3323                                       si->count * *peel_iters_epilogue,
3324                                       si->kind, stmt_info, si->misalign,
3325                                       vect_epilogue);
3326         }
3327
3328   return retval;
3329 }
3330
3331 /* Function vect_estimate_min_profitable_iters
3332
3333    Return the number of iterations required for the vector version of the
3334    loop to be profitable relative to the cost of the scalar version of the
3335    loop.
3336
3337    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3338    of iterations for vectorization.  -1 value means loop vectorization
3339    is not profitable.  This returned value may be used for dynamic
3340    profitability check.
3341
3342    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3343    for static check against estimated number of iterations.  */
3344
3345 static void
3346 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3347                                     int *ret_min_profitable_niters,
3348                                     int *ret_min_profitable_estimate)
3349 {
3350   int min_profitable_iters;
3351   int min_profitable_estimate;
3352   int peel_iters_prologue;
3353   int peel_iters_epilogue;
3354   unsigned vec_inside_cost = 0;
3355   int vec_outside_cost = 0;
3356   unsigned vec_prologue_cost = 0;
3357   unsigned vec_epilogue_cost = 0;
3358   int scalar_single_iter_cost = 0;
3359   int scalar_outside_cost = 0;
3360   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3361   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3362   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3363
3364   /* Cost model disabled.  */
3365   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3366     {
3367       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3368       *ret_min_profitable_niters = 0;
3369       *ret_min_profitable_estimate = 0;
3370       return;
3371     }
3372
3373   /* Requires loop versioning tests to handle misalignment.  */
3374   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3375     {
3376       /*  FIXME: Make cost depend on complexity of individual check.  */
3377       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3378       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3379                             vect_prologue);
3380       dump_printf (MSG_NOTE,
3381                    "cost model: Adding cost of checks for loop "
3382                    "versioning to treat misalignment.\n");
3383     }
3384
3385   /* Requires loop versioning with alias checks.  */
3386   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3387     {
3388       /*  FIXME: Make cost depend on complexity of individual check.  */
3389       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3390       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3391                             vect_prologue);
3392       len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3393       if (len)
3394         /* Count LEN - 1 ANDs and LEN comparisons.  */
3395         (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3396                               NULL, 0, vect_prologue);
3397       dump_printf (MSG_NOTE,
3398                    "cost model: Adding cost of checks for loop "
3399                    "versioning aliasing.\n");
3400     }
3401
3402   /* Requires loop versioning with niter checks.  */
3403   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3404     {
3405       /*  FIXME: Make cost depend on complexity of individual check.  */
3406       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3407                             vect_prologue);
3408       dump_printf (MSG_NOTE,
3409                    "cost model: Adding cost of checks for loop "
3410                    "versioning niters.\n");
3411     }
3412
3413   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3414     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3415                           vect_prologue);
3416
3417   /* Count statements in scalar loop.  Using this as scalar cost for a single
3418      iteration for now.
3419
3420      TODO: Add outer loop support.
3421
3422      TODO: Consider assigning different costs to different scalar
3423      statements.  */
3424
3425   scalar_single_iter_cost
3426     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3427
3428   /* Add additional cost for the peeled instructions in prologue and epilogue
3429      loop.
3430
3431      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3432      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3433
3434      TODO: Build an expression that represents peel_iters for prologue and
3435      epilogue to be used in a run-time test.  */
3436
3437   if (npeel  < 0)
3438     {
3439       peel_iters_prologue = vf/2;
3440       dump_printf (MSG_NOTE, "cost model: "
3441                    "prologue peel iters set to vf/2.\n");
3442
3443       /* If peeling for alignment is unknown, loop bound of main loop becomes
3444          unknown.  */
3445       peel_iters_epilogue = vf/2;
3446       dump_printf (MSG_NOTE, "cost model: "
3447                    "epilogue peel iters set to vf/2 because "
3448                    "peeling for alignment is unknown.\n");
3449
3450       /* If peeled iterations are unknown, count a taken branch and a not taken
3451          branch per peeled loop. Even if scalar loop iterations are known,
3452          vector iterations are not known since peeled prologue iterations are
3453          not known. Hence guards remain the same.  */
3454       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3455                             NULL, 0, vect_prologue);
3456       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3457                             NULL, 0, vect_prologue);
3458       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3459                             NULL, 0, vect_epilogue);
3460       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3461                             NULL, 0, vect_epilogue);
3462       stmt_info_for_cost *si;
3463       int j;
3464       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3465         {
3466           struct _stmt_vec_info *stmt_info
3467             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3468           (void) add_stmt_cost (target_cost_data,
3469                                 si->count * peel_iters_prologue,
3470                                 si->kind, stmt_info, si->misalign,
3471                                 vect_prologue);
3472           (void) add_stmt_cost (target_cost_data,
3473                                 si->count * peel_iters_epilogue,
3474                                 si->kind, stmt_info, si->misalign,
3475                                 vect_epilogue);
3476         }
3477     }
3478   else
3479     {
3480       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3481       stmt_info_for_cost *si;
3482       int j;
3483       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3484
3485       prologue_cost_vec.create (2);
3486       epilogue_cost_vec.create (2);
3487       peel_iters_prologue = npeel;
3488
3489       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3490                                           &peel_iters_epilogue,
3491                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3492                                             (loop_vinfo),
3493                                           &prologue_cost_vec,
3494                                           &epilogue_cost_vec);
3495
3496       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3497         {
3498           struct _stmt_vec_info *stmt_info
3499             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3500           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3501                                 si->misalign, vect_prologue);
3502         }
3503
3504       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3505         {
3506           struct _stmt_vec_info *stmt_info
3507             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3508           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3509                                 si->misalign, vect_epilogue);
3510         }
3511
3512       prologue_cost_vec.release ();
3513       epilogue_cost_vec.release ();
3514     }
3515
3516   /* FORNOW: The scalar outside cost is incremented in one of the
3517      following ways:
3518
3519      1. The vectorizer checks for alignment and aliasing and generates
3520      a condition that allows dynamic vectorization.  A cost model
3521      check is ANDED with the versioning condition.  Hence scalar code
3522      path now has the added cost of the versioning check.
3523
3524        if (cost > th & versioning_check)
3525          jmp to vector code
3526
3527      Hence run-time scalar is incremented by not-taken branch cost.
3528
3529      2. The vectorizer then checks if a prologue is required.  If the
3530      cost model check was not done before during versioning, it has to
3531      be done before the prologue check.
3532
3533        if (cost <= th)
3534          prologue = scalar_iters
3535        if (prologue == 0)
3536          jmp to vector code
3537        else
3538          execute prologue
3539        if (prologue == num_iters)
3540          go to exit
3541
3542      Hence the run-time scalar cost is incremented by a taken branch,
3543      plus a not-taken branch, plus a taken branch cost.
3544
3545      3. The vectorizer then checks if an epilogue is required.  If the
3546      cost model check was not done before during prologue check, it
3547      has to be done with the epilogue check.
3548
3549        if (prologue == 0)
3550          jmp to vector code
3551        else
3552          execute prologue
3553        if (prologue == num_iters)
3554          go to exit
3555        vector code:
3556          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3557            jmp to epilogue
3558
3559      Hence the run-time scalar cost should be incremented by 2 taken
3560      branches.
3561
3562      TODO: The back end may reorder the BBS's differently and reverse
3563      conditions/branch directions.  Change the estimates below to
3564      something more reasonable.  */
3565
3566   /* If the number of iterations is known and we do not do versioning, we can
3567      decide whether to vectorize at compile time.  Hence the scalar version
3568      do not carry cost model guard costs.  */
3569   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3570       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3571     {
3572       /* Cost model check occurs at versioning.  */
3573       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3574         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3575       else
3576         {
3577           /* Cost model check occurs at prologue generation.  */
3578           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3579             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3580               + vect_get_stmt_cost (cond_branch_not_taken);
3581           /* Cost model check occurs at epilogue generation.  */
3582           else
3583             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3584         }
3585     }
3586
3587   /* Complete the target-specific cost calculations.  */
3588   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3589                &vec_inside_cost, &vec_epilogue_cost);
3590
3591   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3592
3593   if (dump_enabled_p ())
3594     {
3595       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3596       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3597                    vec_inside_cost);
3598       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3599                    vec_prologue_cost);
3600       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3601                    vec_epilogue_cost);
3602       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3603                    scalar_single_iter_cost);
3604       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3605                    scalar_outside_cost);
3606       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3607                    vec_outside_cost);
3608       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3609                    peel_iters_prologue);
3610       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3611                    peel_iters_epilogue);
3612     }
3613
3614   /* Calculate number of iterations required to make the vector version
3615      profitable, relative to the loop bodies only.  The following condition
3616      must hold true:
3617      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3618      where
3619      SIC = scalar iteration cost, VIC = vector iteration cost,
3620      VOC = vector outside cost, VF = vectorization factor,
3621      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3622      SOC = scalar outside cost for run time cost model check.  */
3623
3624   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3625     {
3626       if (vec_outside_cost <= 0)
3627         min_profitable_iters = 0;
3628       else
3629         {
3630           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3631                                   - vec_inside_cost * peel_iters_prologue
3632                                   - vec_inside_cost * peel_iters_epilogue)
3633                                  / ((scalar_single_iter_cost * vf)
3634                                     - vec_inside_cost);
3635
3636           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3637               <= (((int) vec_inside_cost * min_profitable_iters)
3638                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3639             min_profitable_iters++;
3640         }
3641     }
3642   /* vector version will never be profitable.  */
3643   else
3644     {
3645       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3646         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3647                     "did not happen for a simd loop");
3648
3649       if (dump_enabled_p ())
3650         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3651                          "cost model: the vector iteration cost = %d "
3652                          "divided by the scalar iteration cost = %d "
3653                          "is greater or equal to the vectorization factor = %d"
3654                          ".\n",
3655                          vec_inside_cost, scalar_single_iter_cost, vf);
3656       *ret_min_profitable_niters = -1;
3657       *ret_min_profitable_estimate = -1;
3658       return;
3659     }
3660
3661   dump_printf (MSG_NOTE,
3662                "  Calculated minimum iters for profitability: %d\n",
3663                min_profitable_iters);
3664
3665   /* We want the vectorized loop to execute at least once.  */
3666   if (min_profitable_iters < (vf + peel_iters_prologue))
3667     min_profitable_iters = vf + peel_iters_prologue;
3668
3669   if (dump_enabled_p ())
3670     dump_printf_loc (MSG_NOTE, vect_location,
3671                      "  Runtime profitability threshold = %d\n",
3672                      min_profitable_iters);
3673
3674   *ret_min_profitable_niters = min_profitable_iters;
3675
3676   /* Calculate number of iterations required to make the vector version
3677      profitable, relative to the loop bodies only.
3678
3679      Non-vectorized variant is SIC * niters and it must win over vector
3680      variant on the expected loop trip count.  The following condition must hold true:
3681      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3682
3683   if (vec_outside_cost <= 0)
3684     min_profitable_estimate = 0;
3685   else
3686     {
3687       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3688                                  - vec_inside_cost * peel_iters_prologue
3689                                  - vec_inside_cost * peel_iters_epilogue)
3690                                  / ((scalar_single_iter_cost * vf)
3691                                    - vec_inside_cost);
3692     }
3693   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3694   if (dump_enabled_p ())
3695     dump_printf_loc (MSG_NOTE, vect_location,
3696                      "  Static estimate profitability threshold = %d\n",
3697                      min_profitable_estimate);
3698
3699   *ret_min_profitable_estimate = min_profitable_estimate;
3700 }
3701
3702 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3703    vector elements (not bits) for a vector with NELT elements.  */
3704 static void
3705 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3706                               vec_perm_indices *sel)
3707 {
3708   unsigned int i;
3709
3710   for (i = 0; i < nelt; i++)
3711     sel->quick_push ((i + offset) & (2 * nelt - 1));
3712 }
3713
3714 /* Checks whether the target supports whole-vector shifts for vectors of mode
3715    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3716    it supports vec_perm_const with masks for all necessary shift amounts.  */
3717 static bool
3718 have_whole_vector_shift (machine_mode mode)
3719 {
3720   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3721     return true;
3722
3723   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3724     return false;
3725
3726   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3727   auto_vec_perm_indices sel (nelt);
3728
3729   for (i = nelt/2; i >= 1; i/=2)
3730     {
3731       sel.truncate (0);
3732       calc_vec_perm_mask_for_shift (i, nelt, &sel);
3733       if (!can_vec_perm_p (mode, false, &sel))
3734         return false;
3735     }
3736   return true;
3737 }
3738
3739 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3740    functions. Design better to avoid maintenance issues.  */
3741
3742 /* Function vect_model_reduction_cost.
3743
3744    Models cost for a reduction operation, including the vector ops
3745    generated within the strip-mine loop, the initial definition before
3746    the loop, and the epilogue code that must be generated.  */
3747
3748 static void
3749 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3750                            int ncopies)
3751 {
3752   int prologue_cost = 0, epilogue_cost = 0;
3753   enum tree_code code;
3754   optab optab;
3755   tree vectype;
3756   gimple *orig_stmt;
3757   machine_mode mode;
3758   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3759   struct loop *loop = NULL;
3760   void *target_cost_data;
3761
3762   if (loop_vinfo)
3763     {
3764       loop = LOOP_VINFO_LOOP (loop_vinfo);
3765       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3766     }
3767   else
3768     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3769
3770   /* Condition reductions generate two reductions in the loop.  */
3771   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3772     ncopies *= 2;
3773
3774   /* Cost of reduction op inside loop.  */
3775   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3776                                         stmt_info, 0, vect_body);
3777
3778   vectype = STMT_VINFO_VECTYPE (stmt_info);
3779   mode = TYPE_MODE (vectype);
3780   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3781
3782   if (!orig_stmt)
3783     orig_stmt = STMT_VINFO_STMT (stmt_info);
3784
3785   code = gimple_assign_rhs_code (orig_stmt);
3786
3787   /* Add in cost for initial definition.
3788      For cond reduction we have four vectors: initial index, step, initial
3789      result of the data reduction, initial value of the index reduction.  */
3790   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3791                        == COND_REDUCTION ? 4 : 1;
3792   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3793                                   scalar_to_vec, stmt_info, 0,
3794                                   vect_prologue);
3795
3796   /* Determine cost of epilogue code.
3797
3798      We have a reduction operator that will reduce the vector in one statement.
3799      Also requires scalar extract.  */
3800
3801   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3802     {
3803       if (reduc_code != ERROR_MARK)
3804         {
3805           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3806             {
3807               /* An EQ stmt and an COND_EXPR stmt.  */
3808               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3809                                               vector_stmt, stmt_info, 0,
3810                                               vect_epilogue);
3811               /* Reduction of the max index and a reduction of the found
3812                  values.  */
3813               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3814                                               vec_to_scalar, stmt_info, 0,
3815                                               vect_epilogue);
3816               /* A broadcast of the max value.  */
3817               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3818                                               scalar_to_vec, stmt_info, 0,
3819                                               vect_epilogue);
3820             }
3821           else
3822             {
3823               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3824                                               stmt_info, 0, vect_epilogue);
3825               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3826                                               vec_to_scalar, stmt_info, 0,
3827                                               vect_epilogue);
3828             }
3829         }
3830       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3831         {
3832           unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
3833           /* Extraction of scalar elements.  */
3834           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits,
3835                                           vec_to_scalar, stmt_info, 0,
3836                                           vect_epilogue);
3837           /* Scalar max reductions via COND_EXPR / MAX_EXPR.  */
3838           epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits - 3,
3839                                           scalar_stmt, stmt_info, 0,
3840                                           vect_epilogue);
3841         }
3842       else
3843         {
3844           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3845           tree bitsize =
3846             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3847           int element_bitsize = tree_to_uhwi (bitsize);
3848           int nelements = vec_size_in_bits / element_bitsize;
3849
3850           if (code == COND_EXPR)
3851             code = MAX_EXPR;
3852
3853           optab = optab_for_tree_code (code, vectype, optab_default);
3854
3855           /* We have a whole vector shift available.  */
3856           if (optab != unknown_optab
3857               && VECTOR_MODE_P (mode)
3858               && optab_handler (optab, mode) != CODE_FOR_nothing
3859               && have_whole_vector_shift (mode))
3860             {
3861               /* Final reduction via vector shifts and the reduction operator.
3862                  Also requires scalar extract.  */
3863               epilogue_cost += add_stmt_cost (target_cost_data,
3864                                               exact_log2 (nelements) * 2,
3865                                               vector_stmt, stmt_info, 0,
3866                                               vect_epilogue);
3867               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3868                                               vec_to_scalar, stmt_info, 0,
3869                                               vect_epilogue);
3870             }
3871           else
3872             /* Use extracts and reduction op for final reduction.  For N
3873                elements, we have N extracts and N-1 reduction ops.  */
3874             epilogue_cost += add_stmt_cost (target_cost_data,
3875                                             nelements + nelements - 1,
3876                                             vector_stmt, stmt_info, 0,
3877                                             vect_epilogue);
3878         }
3879     }
3880
3881   if (dump_enabled_p ())
3882     dump_printf (MSG_NOTE,
3883                  "vect_model_reduction_cost: inside_cost = %d, "
3884                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3885                  prologue_cost, epilogue_cost);
3886 }
3887
3888
3889 /* Function vect_model_induction_cost.
3890    Models cost for the induction operation described by STMT_INFO, where
3891    NCOPIES is the number of vector_stmt costs recorded for the loop body.  */
3892
3893 static void
3894 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3895 {
3896   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3897   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3898   unsigned inside_cost, prologue_cost;
3899   /* Pure SLP statements are skipped: nothing is recorded for them here.  */
3900   if (PURE_SLP_STMT (stmt_info))
3901     return;
3902
3903   /* Loop body cost: NCOPIES vector statements.  */
3904   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3905                                stmt_info, 0, vect_body);
3906
3907   /* Prologue cost: two scalar_to_vec operations (vec_init and vec_step).  */
3908   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3909                                  stmt_info, 0, vect_prologue);
3910   /* The values returned by add_stmt_cost are only used for the dump.  */
3911   if (dump_enabled_p ())
3912     dump_printf_loc (MSG_NOTE, vect_location,
3913                      "vect_model_induction_cost: inside_cost = %d, "
3914                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3915 }
3916
3917
3918
3919 /* Function get_initial_def_for_reduction
3920
3921    Input:
3922    STMT - a stmt that performs a reduction operation in the loop.
3923    INIT_VAL - the initial value of the reduction variable
3924
3925    Output:
3926    ADJUSTMENT_DEF - if not NULL, receives the value with which the caller must
3927         adjust the final result in the epilog, or NULL if none (see below).
3928    Return a vector variable, initialized according to the operation that STMT
3929         performs. This vector will be used as the initial value of the
3930         vector of partial results.
3931
3932    Option1 (adjust in epilog): Initialize the vector as follows:
3933      add/bit or/xor:    [0,0,...,0,0]
3934      mult:              [1,1,...,1,1]; bit and: [-1,-1,...,-1,-1]
3935      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3936    and when necessary (e.g. add/mult case) let the caller know
3937    that it needs to adjust the result by init_val.
3938
3939    Option2: Initialize the vector as follows:
3940      add/bit or/xor:    [init_val,0,0,...,0]
3941      mult:              [init_val,1,1,...,1]; bit and: [init_val,-1,...,-1]
3942      min/max/cond_expr: [init_val,init_val,...,init_val]
3943    and no adjustments are needed.
3944
3945    For example, for the following code:
3946
3947    s = init_val;
3948    for (i=0;i<n;i++)
3949      s = s + a[i];
3950
3951    STMT is 's = s + a[i]', and the reduction variable is 's'.
3952    For a vector of 4 units, we want to return either [init_val,0,0,0],
3953    or [0,0,0,0] and let the caller know that it needs to adjust
3954    the result at the end by 'init_val'.
3955
3956    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3957    initialization vector is simpler (same element in all entries), if
3958    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3959    New statements go on the preheader edge of LOOP_VINFO_LOOP.
3960    A cost model should help decide between these two schemes.  */
3961
3962 tree
3963 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3964                                tree *adjustment_def)
3965 {
3966   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3967   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3968   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3969   tree scalar_type = TREE_TYPE (init_val);
3970   tree vectype = get_vectype_for_scalar_type (scalar_type);
3971   int nunits;
3972   enum tree_code code = gimple_assign_rhs_code (stmt);
3973   tree def_for_init;
3974   tree init_def;
3975   int i;
3976   bool nested_in_vect_loop = false;
3977   REAL_VALUE_TYPE real_init_val = dconst0;
3978   int int_init_val = 0;
3979   gimple *def_stmt = NULL;
3980   gimple_seq stmts = NULL;
3981   /* The scalar type of INIT_VAL must have a vector type.  */
3982   gcc_assert (vectype);
3983   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3984   /* Only pointer, integral and scalar float types are expected here.  */
3985   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3986               || SCALAR_FLOAT_TYPE_P (scalar_type));
3987   /* STMT is either nested (per nested_in_vect_loop_p) or directly in LOOP.  */
3988   if (nested_in_vect_loop_p (loop, stmt))
3989     nested_in_vect_loop = true;
3990   else
3991     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3992
3993   /* In case of double reduction we only create a vector variable to be put
3994      in the reduction phi node.  The actual statement creation is done in
3995      vect_create_epilog_for_reduction.  */
3996   if (adjustment_def && nested_in_vect_loop
3997       && TREE_CODE (init_val) == SSA_NAME
3998       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3999       && gimple_code (def_stmt) == GIMPLE_PHI
4000       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4001       && vinfo_for_stmt (def_stmt)
4002       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4003           == vect_double_reduction_def)
4004     {
4005       *adjustment_def = NULL;
4006       return vect_create_destination_var (init_val, vectype);
4007     }
4008
4009   /* In case of a nested reduction do not use an adjustment def as
4010      that case is not supported by the epilogue generation correctly
4011      if ncopies is not one.  */
4012   if (adjustment_def && nested_in_vect_loop)
4013     {
4014       *adjustment_def = NULL;
4015       return vect_get_vec_def_for_operand (init_val, stmt);
4016     }
4017
4018   switch (code)
4019     {
4020     case WIDEN_SUM_EXPR:
4021     case DOT_PROD_EXPR:
4022     case SAD_EXPR:
4023     case PLUS_EXPR:
4024     case MINUS_EXPR:
4025     case BIT_IOR_EXPR:
4026     case BIT_XOR_EXPR:
4027     case MULT_EXPR:
4028     case BIT_AND_EXPR:
4029       {
4030         /* ADJUSTMENT_DEF is NULL when called from
4031            vect_create_epilog_for_reduction to vectorize double reduction.  */
4032         if (adjustment_def)
4033           *adjustment_def = init_val;
4034         /* The neutral element is 0 (dconst0) unless changed below.  */
4035         if (code == MULT_EXPR)
4036           {
4037             real_init_val = dconst1;
4038             int_init_val = 1;
4039           }
4040
4041         if (code == BIT_AND_EXPR)
4042           int_init_val = -1;
4043
4044         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4045           def_for_init = build_real (scalar_type, real_init_val);
4046         else
4047           def_for_init = build_int_cst (scalar_type, int_init_val);
4048
4049         if (adjustment_def)
4050           /* Option1: every element is the neutral value DEF_FOR_INIT.  */
4051           init_def = gimple_build_vector_from_val (&stmts, vectype,
4052                                                    def_for_init);
4053         else
4054           {
4055             /* Option2: the first element is INIT_VAL, the rest neutral.  */
4056             auto_vec<tree, 32> elts (nunits);
4057             elts.quick_push (init_val);
4058             for (i = 1; i < nunits; ++i)
4059               elts.quick_push (def_for_init);
4060             init_def = gimple_build_vector (&stmts, vectype, elts);
4061           }
4062       }
4063       break;
4064     /* For these codes no adjustment is requested from the caller.  */
4065     case MIN_EXPR:
4066     case MAX_EXPR:
4067     case COND_EXPR:
4068       {
4069         if (adjustment_def)
4070           {
4071             *adjustment_def = NULL_TREE;
4072             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4073               {
4074                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4075                 break;
4076               }
4077           }
4078         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4079         init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4080       }
4081       break;
4082
4083     default:
4084       gcc_unreachable ();
4085     }
4086   /* Emit any statements built above on the preheader edge of LOOP.  */
4087   if (stmts)
4088     gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4089   return init_def;
4090 }
4091
4092 /* Push onto VEC_OPRNDS the initial defs for the reduction PHIs in SLP_NODE.
4093    NUMBER_OF_VECTORS is the number of vector defs to create.  */
4094 /* CODE picks the neutral element; REDUC_CHAIN flags a reduction chain.  */
4095 static void
4096 get_initial_defs_for_reduction (slp_tree slp_node,
4097                                 vec<tree> *vec_oprnds,
4098                                 unsigned int number_of_vectors,
4099                                 enum tree_code code, bool reduc_chain)
4100 {
4101   vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4102   gimple *stmt = stmts[0];
4103   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4104   unsigned nunits;
4105   unsigned j, number_of_places_left_in_vector;
4106   tree vector_type, scalar_type;
4107   tree vop;
4108   int group_size = stmts.length ();
4109   unsigned int vec_num, i;
4110   unsigned number_of_copies = 1;
4111   vec<tree> voprnds;
4112   voprnds.create (number_of_vectors);
4113   tree neutral_op = NULL;
4114   struct loop *loop;
4115   /* Results go to VEC_OPRNDS via quick_push; the caller reserves the space.  */
4116   vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4117   scalar_type = TREE_TYPE (vector_type);
4118   nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4119
4120   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4121   /* PE is the preheader edge of the loop containing the first stmt.  */
4122   loop = (gimple_bb (stmt))->loop_father;
4123   gcc_assert (loop);
4124   edge pe = loop_preheader_edge (loop);
4125
4126   /* Select the neutral element of CODE, if it has one.  */
4127   /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4128      we need either neutral operands or the original operands.  See
4129      get_initial_def_for_reduction() for details.  */
4130   switch (code)
4131     {
4132     case WIDEN_SUM_EXPR:
4133     case DOT_PROD_EXPR:
4134     case SAD_EXPR:
4135     case PLUS_EXPR:
4136     case MINUS_EXPR:
4137     case BIT_IOR_EXPR:
4138     case BIT_XOR_EXPR:
4139       neutral_op = build_zero_cst (scalar_type);
4140       break;
4141
4142     case MULT_EXPR:
4143       neutral_op = build_one_cst (scalar_type);
4144       break;
4145
4146     case BIT_AND_EXPR:
4147       neutral_op = build_all_ones_cst (scalar_type);
4148       break;
4149
4150     /* For MIN/MAX we don't have an easy neutral operand but
4151        the initial values can be used fine here.  Only for
4152        a reduction chain we have to force a neutral element.  */
4153     case MAX_EXPR:
4154     case MIN_EXPR:
4155       if (! reduc_chain)
4156         neutral_op = NULL;
4157       else
4158         neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4159       break;
4160     /* Other codes have no neutral element and cannot form a chain here.  */
4161     default:
4162       gcc_assert (! reduc_chain);
4163       neutral_op = NULL;
4164     }
4165
4166   /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4167      created vectors. It is greater than 1 if unrolling is performed.
4168
4169      For example, we have two scalar operands, s1 and s2 (e.g., group of
4170      strided accesses of size two), while NUNITS is four (i.e., four scalars
4171      of this type can be packed in a vector).  The output vector will contain
4172      two copies of each scalar operand: {s1, s2, s1, s2}.  (NUMBER_OF_COPIES
4173      will be 2).
4174
4175      If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4176      containing the operands.
4177
4178      For example, NUNITS is four as before, and the group size is 8
4179      (s1, s2, ..., s8).  We will create two vectors {s1, s2, s3, s4} and
4180      {s5, s6, s7, s8}.  */
4181
4182   number_of_copies = nunits * number_of_vectors / group_size;
4183   /* Each vector is filled from its last element down to its first.  */
4184   number_of_places_left_in_vector = nunits;
4185   auto_vec<tree, 32> elts (nunits);
4186   elts.quick_grow (nunits);
4187   for (j = 0; j < number_of_copies; j++)
4188     {
4189       for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4190         {
4191           tree op;
4192           /* Get the def before the loop.  In reduction chain we have only
4193              one initial value.  */
4194           if ((j != (number_of_copies - 1)
4195                || (reduc_chain && i != 0))
4196               && neutral_op)
4197             op = neutral_op;
4198           else
4199             op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4200
4201           /* Create 'vect_ = {op0,op1,...,opn}'.  */
4202           number_of_places_left_in_vector--;
4203           elts[number_of_places_left_in_vector] = op;
4204           /* Once the vector is full, build it and start a new one.  */
4205           if (number_of_places_left_in_vector == 0)
4206             {
4207               gimple_seq ctor_seq = NULL;
4208               tree init = gimple_build_vector (&ctor_seq, vector_type, elts);
4209               if (ctor_seq != NULL)
4210                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4211               voprnds.quick_push (init);
4212
4213               number_of_places_left_in_vector = nunits;
4214             }
4215         }
4216     }
4217
4218   /* Since the vectors are created in the reverse order, we should invert
4219      them.  */
4220   vec_num = voprnds.length ();
4221   for (j = vec_num; j != 0; j--)
4222     {
4223       vop = voprnds[j - 1];
4224       vec_oprnds->quick_push (vop);
4225     }
4226
4227   voprnds.release ();
4228
4229   /* In case that VF is greater than the unrolling factor needed for the SLP
4230      group of stmts, NUMBER_OF_VECTORS to be created is greater than
4231      NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4232      to pad with neutral vectors or, lacking a neutral element, replicate.  */
4233   tree neutral_vec = NULL;
4234   while (number_of_vectors > vec_oprnds->length ())
4235     {
4236       if (neutral_op)
4237         {
4238           if (!neutral_vec)
4239             {
4240               gimple_seq ctor_seq = NULL;
4241               neutral_vec = gimple_build_vector_from_val
4242                 (&ctor_seq, vector_type, neutral_op);
4243               if (ctor_seq != NULL)
4244                 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4245             }
4246           vec_oprnds->quick_push (neutral_vec);
4247         }
4248       else
4249         {
4250           for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4251             vec_oprnds->quick_push (vop);
4252         }
4253     }
4254 }
4255
4256
4257 /* Function vect_create_epilog_for_reduction
4258
4259    Create code at the loop-epilog to finalize the result of a reduction
4260    computation.
4261
4262    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4263      reduction statements.
4264    STMT is the scalar reduction stmt that is being vectorized.
4265    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4266      number of elements that we can fit in a vectype (nunits).  In this case
4267      we have to generate more than one vector stmt - i.e - we need to "unroll"
4268      the vector stmt by a factor VF/nunits.  For more details see documentation
4269      in vectorizable_operation.
4270    REDUC_CODE is the tree-code for the epilog reduction.
4271    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4272      computation.
4273    REDUC_INDEX is the index of the operand in the right hand side of the
4274      statement that is defined by REDUCTION_PHI.
4275    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4276    SLP_NODE is an SLP node containing a group of reduction statements. The
4277      first one in this group is STMT.
4278
4279    This function:
4280    1. Creates the reduction def-use cycles: sets the arguments for
4281       REDUCTION_PHIS:
4282       The loop-entry argument is the vectorized initial-value of the reduction.
4283       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4284       sums.
4285    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4286       by applying the operation specified by REDUC_CODE if available, or by
4287       other means (whole-vector shifts or a scalar loop).
4288       The function also creates a new phi node at the loop exit to preserve
4289       loop-closed form, as illustrated below.
4290
4291      The flow at the entry to this function:
4292
4293         loop:
4294           vec_def = phi <null, null>            # REDUCTION_PHI
4295           VECT_DEF = vector_stmt                # vectorized form of STMT
4296           s_loop = scalar_stmt                  # (scalar) STMT
4297         loop_exit:
4298           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4299           use <s_out0>
4300           use <s_out0>
4301
4302      The above is transformed by this function into:
4303
4304         loop:
4305           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4306           VECT_DEF = vector_stmt                # vectorized form of STMT
4307           s_loop = scalar_stmt                  # (scalar) STMT
4308         loop_exit:
4309           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4310           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4311           v_out2 = reduce <v_out1>
4312           s_out3 = extract_field <v_out2, 0>
4313           s_out4 = adjust_result <s_out3>
4314           use <s_out4>
4315           use <s_out4>
4316 */
4317
4318 static void
4319 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4320                                   gimple *reduc_def_stmt,
4321                                   int ncopies, enum tree_code reduc_code,
4322                                   vec<gimple *> reduction_phis,
4323                                   bool double_reduc,
4324                                   slp_tree slp_node,
4325                                   slp_instance slp_node_instance)
4326 {
4327   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4328   stmt_vec_info prev_phi_info;
4329   tree vectype;
4330   machine_mode mode;
4331   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4332   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4333   basic_block exit_bb;
4334   tree scalar_dest;
4335   tree scalar_type;
4336   gimple *new_phi = NULL, *phi;
4337   gimple_stmt_iterator exit_gsi;
4338   tree vec_dest;
4339   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4340   gimple *epilog_stmt = NULL;
4341   enum tree_code code = gimple_assign_rhs_code (stmt);
4342   gimple *exit_phi;
4343   tree bitsize;
4344   tree adjustment_def = NULL;
4345   tree vec_initial_def = NULL;
4346   tree expr, def, initial_def = NULL;
4347   tree orig_name, scalar_result;
4348   imm_use_iterator imm_iter, phi_imm_iter;
4349   use_operand_p use_p, phi_use_p;
4350   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4351   bool nested_in_vect_loop = false;
4352   auto_vec<gimple *> new_phis;
4353   auto_vec<gimple *> inner_phis;
4354   enum vect_def_type dt = vect_unknown_def_type;
4355   int j, i;
4356   auto_vec<tree> scalar_results;
4357   unsigned int group_size = 1, k, ratio;
4358   auto_vec<tree> vec_initial_defs;
4359   auto_vec<gimple *> phis;
4360   bool slp_reduc = false;
4361   tree new_phi_result;
4362   gimple *inner_phi = NULL;
4363   tree induction_index = NULL_TREE;
4364
4365   if (slp_node)
4366     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4367
4368   if (nested_in_vect_loop_p (loop, stmt))
4369     {
4370       outer_loop = loop;
4371       loop = loop->inner;
4372       nested_in_vect_loop = true;
4373       gcc_assert (!slp_node);
4374     }
4375
4376   vectype = STMT_VINFO_VECTYPE (stmt_info);
4377   gcc_assert (vectype);
4378   mode = TYPE_MODE (vectype);
4379
4380   /* 1. Create the reduction def-use cycle:
4381      Set the arguments of REDUCTION_PHIS, i.e., transform
4382
4383         loop:
4384           vec_def = phi <null, null>            # REDUCTION_PHI
4385           VECT_DEF = vector_stmt                # vectorized form of STMT
4386           ...
4387
4388      into:
4389
4390         loop:
4391           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4392           VECT_DEF = vector_stmt                # vectorized form of STMT
4393           ...
4394
4395      (in case of SLP, do it for all the phis). */
4396
4397   /* Get the loop-entry arguments.  */
4398   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4399   if (slp_node)
4400     {
4401       unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4402       vec_initial_defs.reserve (vec_num);
4403       get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4404                                       &vec_initial_defs, vec_num, code,
4405                                       GROUP_FIRST_ELEMENT (stmt_info));
4406     }
4407   else
4408     {
4409       /* Get at the scalar def before the loop, that defines the initial value
4410          of the reduction variable.  */
4411       gimple *def_stmt;
4412       initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4413                                            loop_preheader_edge (loop));
4414       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4415       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4416                                                        &adjustment_def);
4417       vec_initial_defs.create (1);
4418       vec_initial_defs.quick_push (vec_initial_def);
4419     }
4420
4421   /* Set phi nodes arguments.  */
4422   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4423     {
4424       tree vec_init_def = vec_initial_defs[i];
4425       tree def = vect_defs[i];
4426       for (j = 0; j < ncopies; j++)
4427         {
4428           if (j != 0)
4429             {
4430               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4431               if (nested_in_vect_loop)
4432                 vec_init_def
4433                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4434                                                     vec_init_def);
4435             }
4436
4437           /* Set the loop-entry arg of the reduction-phi.  */
4438
4439           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4440               == INTEGER_INDUC_COND_REDUCTION)
4441             {
4442               /* Initialise the reduction phi to zero.  This prevents non-zero
4443                  initial values interfering with the reduction op.  */
4444               gcc_assert (ncopies == 1);
4445               gcc_assert (i == 0);
4446
4447               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4448               tree zero_vec = build_zero_cst (vec_init_def_type);
4449
4450               add_phi_arg (as_a <gphi *> (phi), zero_vec,
4451                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4452             }
4453           else
4454             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4455                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4456
4457           /* Set the loop-latch arg for the reduction-phi.  */
4458           if (j > 0)
4459             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4460
4461           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4462                        UNKNOWN_LOCATION);
4463
4464           if (dump_enabled_p ())
4465             {
4466               dump_printf_loc (MSG_NOTE, vect_location,
4467                                "transform reduction: created def-use cycle: ");
4468               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4469               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4470             }
4471         }
4472     }
4473
4474   /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4475      which is updated with the current index of the loop for every match of
4476      the original loop's cond_expr (VEC_STMT).  This results in a vector
4477      containing the last time the condition passed for that vector lane.
4478      The first match will be a 1 to allow 0 to be used for non-matching
4479      indexes.  If there are no matches at all then the vector will be all
4480      zeroes.  */
4481   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4482     {
4483       tree indx_before_incr, indx_after_incr;
4484       int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4485       int k;
4486
4487       gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4488       gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4489
4490       int scalar_precision
4491         = GET_MODE_PRECISION (TYPE_MODE (TREE_TYPE (vectype)));
4492       tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4493       tree cr_index_vector_type = build_vector_type
4494         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4495
4496       /* First we create a simple vector induction variable which starts
4497          with the values {1,2,3,...} (SERIES_VECT) and increments by the
4498          vector size (STEP).  */
4499
4500       /* Create a {1,2,3,...} vector.  */
4501       auto_vec<tree, 32> vtemp (nunits_out);
4502       for (k = 0; k < nunits_out; ++k)
4503         vtemp.quick_push (build_int_cst (cr_index_scalar_type, k + 1));
4504       tree series_vect = build_vector (cr_index_vector_type, vtemp);
4505
4506       /* Create a vector of the step value.  */
4507       tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4508       tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4509
4510       /* Create an induction variable.  */
4511       gimple_stmt_iterator incr_gsi;
4512       bool insert_after;
4513       standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4514       create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4515                  insert_after, &indx_before_incr, &indx_after_incr);
4516
4517       /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4518          filled with zeros (VEC_ZERO).  */
4519
4520       /* Create a vector of 0s.  */
4521       tree zero = build_zero_cst (cr_index_scalar_type);
4522       tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4523
4524       /* Create a vector phi node.  */
4525       tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4526       new_phi = create_phi_node (new_phi_tree, loop->header);
4527       set_vinfo_for_stmt (new_phi,
4528                           new_stmt_vec_info (new_phi, loop_vinfo));
4529       add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4530                    loop_preheader_edge (loop), UNKNOWN_LOCATION);
4531
4532       /* Now take the condition from the loops original cond_expr
4533          (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4534          every match uses values from the induction variable
4535          (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4536          (NEW_PHI_TREE).
4537          Finally, we update the phi (NEW_PHI_TREE) to take the value of
4538          the new cond_expr (INDEX_COND_EXPR).  */
4539
4540       /* Duplicate the condition from vec_stmt.  */
4541       tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4542
4543       /* Create a conditional, where the condition is taken from vec_stmt
4544          (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4545          else is the phi (NEW_PHI_TREE).  */
4546       tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4547                                      ccompare, indx_before_incr,
4548                                      new_phi_tree);
4549       induction_index = make_ssa_name (cr_index_vector_type);
4550       gimple *index_condition = gimple_build_assign (induction_index,
4551                                                      index_cond_expr);
4552       gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4553       stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4554                                                         loop_vinfo);
4555       STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4556       set_vinfo_for_stmt (index_condition, index_vec_info);
4557
4558       /* Update the phi with the vec cond.  */
4559       add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4560                    loop_latch_edge (loop), UNKNOWN_LOCATION);
4561     }
4562
4563   /* 2. Create epilog code.
4564         The reduction epilog code operates across the elements of the vector
4565         of partial results computed by the vectorized loop.
4566         The reduction epilog code consists of:
4567
4568         step 1: compute the scalar result in a vector (v_out2)
4569         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4570         step 3: adjust the scalar result (s_out3) if needed.
4571
4572         Step 1 can be accomplished using one the following three schemes:
4573           (scheme 1) using reduc_code, if available.
4574           (scheme 2) using whole-vector shifts, if available.
4575           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4576                      combined.
4577
4578           The overall epilog code looks like this:
4579
4580           s_out0 = phi <s_loop>         # original EXIT_PHI
4581           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4582           v_out2 = reduce <v_out1>              # step 1
4583           s_out3 = extract_field <v_out2, 0>    # step 2
4584           s_out4 = adjust_result <s_out3>       # step 3
4585
4586           (step 3 is optional, and steps 1 and 2 may be combined).
4587           Lastly, the uses of s_out0 are replaced by s_out4.  */
4588
4589
4590   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4591          v_out1 = phi <VECT_DEF>
4592          Store them in NEW_PHIS.  */
4593
4594   exit_bb = single_exit (loop)->dest;
4595   prev_phi_info = NULL;
4596   new_phis.create (vect_defs.length ());
4597   FOR_EACH_VEC_ELT (vect_defs, i, def)
4598     {
4599       for (j = 0; j < ncopies; j++)
4600         {
4601           tree new_def = copy_ssa_name (def);
4602           phi = create_phi_node (new_def, exit_bb);
4603           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4604           if (j == 0)
4605             new_phis.quick_push (phi);
4606           else
4607             {
4608               def = vect_get_vec_def_for_stmt_copy (dt, def);
4609               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4610             }
4611
4612           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4613           prev_phi_info = vinfo_for_stmt (phi);
4614         }
4615     }
4616
4617   /* The epilogue is created for the outer-loop, i.e., for the loop being
4618      vectorized.  Create exit phis for the outer loop.  */
4619   if (double_reduc)
4620     {
4621       loop = outer_loop;
4622       exit_bb = single_exit (loop)->dest;
4623       inner_phis.create (vect_defs.length ());
4624       FOR_EACH_VEC_ELT (new_phis, i, phi)
4625         {
4626           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4627           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4628           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4629                            PHI_RESULT (phi));
4630           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4631                                                             loop_vinfo));
4632           inner_phis.quick_push (phi);
4633           new_phis[i] = outer_phi;
4634           prev_phi_info = vinfo_for_stmt (outer_phi);
4635           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4636             {
4637               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4638               new_result = copy_ssa_name (PHI_RESULT (phi));
4639               outer_phi = create_phi_node (new_result, exit_bb);
4640               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4641                                PHI_RESULT (phi));
4642               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4643                                                                 loop_vinfo));
4644               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4645               prev_phi_info = vinfo_for_stmt (outer_phi);
4646             }
4647         }
4648     }
4649
4650   exit_gsi = gsi_after_labels (exit_bb);
4651
4652   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4653          (i.e. when reduc_code is not available) and in the final adjustment
4654          code (if needed).  Also get the original scalar reduction variable as
4655          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4656          represents a reduction pattern), the tree-code and scalar-def are
4657          taken from the original stmt that the pattern-stmt (STMT) replaces.
4658          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4659          are taken from STMT.  */
4660
4661   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4662   if (!orig_stmt)
4663     {
4664       /* Regular reduction  */
4665       orig_stmt = stmt;
4666     }
4667   else
4668     {
4669       /* Reduction pattern  */
4670       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4671       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4672       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4673     }
4674
4675   code = gimple_assign_rhs_code (orig_stmt);
4676   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4677      partial results are added and not subtracted.  */
4678   if (code == MINUS_EXPR)
4679     code = PLUS_EXPR;
4680
4681   scalar_dest = gimple_assign_lhs (orig_stmt);
4682   scalar_type = TREE_TYPE (scalar_dest);
4683   scalar_results.create (group_size);
4684   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4685   bitsize = TYPE_SIZE (scalar_type);
4686
4687   /* In case this is a reduction in an inner-loop while vectorizing an outer
4688      loop - we don't need to extract a single scalar result at the end of the
4689      inner-loop (unless it is double reduction, i.e., the use of reduction is
4690      outside the outer-loop).  The final vector of partial results will be used
4691      in the vectorized outer-loop, or reduced to a scalar result at the end of
4692      the outer-loop.  */
4693   if (nested_in_vect_loop && !double_reduc)
4694     goto vect_finalize_reduction;
4695
4696   /* SLP reduction without reduction chain, e.g.,
4697      # a1 = phi <a2, a0>
4698      # b1 = phi <b2, b0>
4699      a2 = operation (a1)
4700      b2 = operation (b1)  */
4701   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4702
4703   /* In case of reduction chain, e.g.,
4704      # a1 = phi <a3, a0>
4705      a2 = operation (a1)
4706      a3 = operation (a2),
4707
4708      we may end up with more than one vector result.  Here we reduce them to
4709      one vector.  */
4710   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4711     {
4712       tree first_vect = PHI_RESULT (new_phis[0]);
4713       gassign *new_vec_stmt = NULL;
4714       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4715       for (k = 1; k < new_phis.length (); k++)
4716         {
4717           gimple *next_phi = new_phis[k];
4718           tree second_vect = PHI_RESULT (next_phi);
4719           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4720           new_vec_stmt = gimple_build_assign (tem, code,
4721                                               first_vect, second_vect);
4722           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4723           first_vect = tem;
4724         }
4725
4726       new_phi_result = first_vect;
4727       if (new_vec_stmt)
4728         {
4729           new_phis.truncate (0);
4730           new_phis.safe_push (new_vec_stmt);
4731         }
4732     }
4733   /* Likewise if we couldn't use a single defuse cycle.  */
4734   else if (ncopies > 1)
4735     {
4736       gcc_assert (new_phis.length () == 1);
4737       tree first_vect = PHI_RESULT (new_phis[0]);
4738       gassign *new_vec_stmt = NULL;
4739       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4740       gimple *next_phi = new_phis[0];
4741       for (int k = 1; k < ncopies; ++k)
4742         {
4743           next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4744           tree second_vect = PHI_RESULT (next_phi);
4745           tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4746           new_vec_stmt = gimple_build_assign (tem, code,
4747                                               first_vect, second_vect);
4748           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4749           first_vect = tem;
4750         }
4751       new_phi_result = first_vect;
4752       new_phis.truncate (0);
4753       new_phis.safe_push (new_vec_stmt);
4754     }
4755   else
4756     new_phi_result = PHI_RESULT (new_phis[0]);
4757
4758   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4759       && reduc_code != ERROR_MARK)
4760     {
4761       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4762          various data values where the condition matched and another vector
4763          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4764          need to extract the last matching index (which will be the index with
4765          highest value) and use this to index into the data vector.
4766          For the case where there were no matches, the data vector will contain
4767          all default values and the index vector will be all zeros.  */
4768
4769       /* Get various versions of the type of the vector of indexes.  */
4770       tree index_vec_type = TREE_TYPE (induction_index);
4771       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4772       tree index_scalar_type = TREE_TYPE (index_vec_type);
4773       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4774         (index_vec_type);
4775
4776       /* Get an unsigned integer version of the type of the data vector.  */
4777       int scalar_precision
4778         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4779       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4780       tree vectype_unsigned = build_vector_type
4781         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4782
4783       /* First we need to create a vector (ZERO_VEC) of zeros and another
4784          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4785          can create using a MAX reduction and then expanding.
4786          In the case where the loop never made any matches, the max index will
4787          be zero.  */
4788
4789       /* Vector of {0, 0, 0,...}.  */
4790       tree zero_vec = make_ssa_name (vectype);
4791       tree zero_vec_rhs = build_zero_cst (vectype);
4792       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4793       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4794
4795       /* Find maximum value from the vector of found indexes.  */
4796       tree max_index = make_ssa_name (index_scalar_type);
4797       gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4798                                                     induction_index);
4799       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4800
4801       /* Vector of {max_index, max_index, max_index,...}.  */
4802       tree max_index_vec = make_ssa_name (index_vec_type);
4803       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4804                                                       max_index);
4805       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4806                                                         max_index_vec_rhs);
4807       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4808
4809       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4810          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4811          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4812          otherwise.  Only one value should match, resulting in a vector
4813          (VEC_COND) with one data value and the rest zeros.
4814          In the case where the loop never made any matches, every index will
4815          match, resulting in a vector with all data values (which will all be
4816          the default value).  */
4817
4818       /* Compare the max index vector to the vector of found indexes to find
4819          the position of the max value.  */
4820       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4821       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4822                                                       induction_index,
4823                                                       max_index_vec);
4824       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4825
4826       /* Use the compare to choose either values from the data vector or
4827          zero.  */
4828       tree vec_cond = make_ssa_name (vectype);
4829       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4830                                                    vec_compare, new_phi_result,
4831                                                    zero_vec);
4832       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4833
4834       /* Finally we need to extract the data value from the vector (VEC_COND)
4835          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do an OR
4836          reduction, but because this doesn't exist, we can use a MAX reduction
4837          instead.  The data value might be signed or a float so we need to cast
4838          it first.
4839          In the case where the loop never made any matches, the data values are
4840          all identical, and so will reduce down correctly.  */
4841
4842       /* Make the matched data values unsigned.  */
4843       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4844       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4845                                        vec_cond);
4846       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4847                                                         VIEW_CONVERT_EXPR,
4848                                                         vec_cond_cast_rhs);
4849       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4850
4851       /* Reduce down to a scalar value.  */
4852       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4853       optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4854                                       optab_default);
4855       gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4856                   != CODE_FOR_nothing);
4857       gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4858                                                      REDUC_MAX_EXPR,
4859                                                      vec_cond_cast);
4860       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4861
4862       /* Convert the reduced value back to the result type and set as the
4863          result.  */
4864       gimple_seq stmts = NULL;
4865       new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4866                                data_reduc);
4867       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4868       scalar_results.safe_push (new_temp);
4869     }
4870   else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4871            && reduc_code == ERROR_MARK)
4872     {
4873       /* Condition reduction without supported REDUC_MAX_EXPR.  Generate
4874          idx = 0;
4875          idx_val = induction_index[0];
4876          val = data_reduc[0];
4877          for (idx = 0, val = init, i = 0; i < nelts; ++i)
4878            if (induction_index[i] > idx_val)
4879              val = data_reduc[i], idx_val = induction_index[i];
4880          return val;  */
4881
4882       tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4883       tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4884       unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4885       unsigned HOST_WIDE_INT v_size
4886         = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4887       tree idx_val = NULL_TREE, val = NULL_TREE;
4888       for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4889         {
4890           tree old_idx_val = idx_val;
4891           tree old_val = val;
4892           idx_val = make_ssa_name (idx_eltype);
4893           epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4894                                              build3 (BIT_FIELD_REF, idx_eltype,
4895                                                      induction_index,
4896                                                      bitsize_int (el_size),
4897                                                      bitsize_int (off)));
4898           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4899           val = make_ssa_name (data_eltype);
4900           epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4901                                              build3 (BIT_FIELD_REF,
4902                                                      data_eltype,
4903                                                      new_phi_result,
4904                                                      bitsize_int (el_size),
4905                                                      bitsize_int (off)));
4906           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4907           if (off != 0)
4908             {
4909               tree new_idx_val = idx_val;
4910               tree new_val = val;
4911               if (off != v_size - el_size)
4912                 {
4913                   new_idx_val = make_ssa_name (idx_eltype);
4914                   epilog_stmt = gimple_build_assign (new_idx_val,
4915                                                      MAX_EXPR, idx_val,
4916                                                      old_idx_val);
4917                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4918                 }
4919               new_val = make_ssa_name (data_eltype);
4920               epilog_stmt = gimple_build_assign (new_val,
4921                                                  COND_EXPR,
4922                                                  build2 (GT_EXPR,
4923                                                          boolean_type_node,
4924                                                          idx_val,
4925                                                          old_idx_val),
4926                                                  val, old_val);
4927               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4928               idx_val = new_idx_val;
4929               val = new_val;
4930             }
4931         }
4932       /* Convert the reduced value back to the result type and set as the
4933          result.  */
4934       gimple_seq stmts = NULL;
4935       val = gimple_convert (&stmts, scalar_type, val);
4936       gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4937       scalar_results.safe_push (val);
4938     }
4939
4940   /* 2.3 Create the reduction code, using one of the three schemes described
4941          above. In SLP we simply need to extract all the elements from the
4942          vector (without reducing them), so we use scalar shifts.  */
4943   else if (reduc_code != ERROR_MARK && !slp_reduc)
4944     {
4945       tree tmp;
4946       tree vec_elem_type;
4947
4948       /* Case 1:  Create:
4949          v_out2 = reduc_expr <v_out1>  */
4950
4951       if (dump_enabled_p ())
4952         dump_printf_loc (MSG_NOTE, vect_location,
4953                          "Reduce using direct vector reduction.\n");
4954
4955       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4956       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4957         {
4958           tree tmp_dest =
4959               vect_create_destination_var (scalar_dest, vec_elem_type);
4960           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4961           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4962           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4963           gimple_assign_set_lhs (epilog_stmt, new_temp);
4964           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4965
4966           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4967         }
4968       else
4969         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4970
4971       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4972       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4973       gimple_assign_set_lhs (epilog_stmt, new_temp);
4974       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4975
4976       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4977           == INTEGER_INDUC_COND_REDUCTION)
4978         {
4979           /* Earlier we set the initial value to be zero.  Check the result
4980              and if it is zero then replace with the original initial
4981              value.  */
4982           tree zero = build_zero_cst (scalar_type);
4983           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4984
4985           tmp = make_ssa_name (new_scalar_dest);
4986           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4987                                              initial_def, new_temp);
4988           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4989           new_temp = tmp;
4990         }
4991
4992       scalar_results.safe_push (new_temp);
4993     }
4994   else
4995     {
4996       bool reduce_with_shift = have_whole_vector_shift (mode);
4997       int element_bitsize = tree_to_uhwi (bitsize);
4998       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4999       tree vec_temp;
5000
5001       /* COND reductions all do the final reduction with MAX_EXPR.  */
5002       if (code == COND_EXPR)
5003         code = MAX_EXPR;
5004
5005       /* Regardless of whether we have a whole vector shift, if we're
5006          emulating the operation via tree-vect-generic, we don't want
5007          to use it.  Only the first round of the reduction is likely
5008          to still be profitable via emulation.  */
5009       /* ??? It might be better to emit a reduction tree code here, so that
5010          tree-vect-generic can expand the first round via bit tricks.  */
5011       if (!VECTOR_MODE_P (mode))
5012         reduce_with_shift = false;
5013       else
5014         {
5015           optab optab = optab_for_tree_code (code, vectype, optab_default);
5016           if (optab_handler (optab, mode) == CODE_FOR_nothing)
5017             reduce_with_shift = false;
5018         }
5019
5020       if (reduce_with_shift && !slp_reduc)
5021         {
5022           int nelements = vec_size_in_bits / element_bitsize;
5023           auto_vec_perm_indices sel (nelements);
5024
5025           int elt_offset;
5026
5027           tree zero_vec = build_zero_cst (vectype);
5028           /* Case 2: Create:
5029              for (offset = nelements/2; offset >= 1; offset/=2)
5030                 {
5031                   Create:  va' = vec_shift <va, offset>
5032                   Create:  va = vop <va, va'>
5033                 }  */
5034
5035           tree rhs;
5036
5037           if (dump_enabled_p ())
5038             dump_printf_loc (MSG_NOTE, vect_location,
5039                              "Reduce using vector shifts\n");
5040
5041           vec_dest = vect_create_destination_var (scalar_dest, vectype);
5042           new_temp = new_phi_result;
5043           for (elt_offset = nelements / 2;
5044                elt_offset >= 1;
5045                elt_offset /= 2)
5046             {
5047               sel.truncate (0);
5048               calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5049               tree mask = vect_gen_perm_mask_any (vectype, sel);
5050               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5051                                                  new_temp, zero_vec, mask);
5052               new_name = make_ssa_name (vec_dest, epilog_stmt);
5053               gimple_assign_set_lhs (epilog_stmt, new_name);
5054               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5055
5056               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5057                                                  new_temp);
5058               new_temp = make_ssa_name (vec_dest, epilog_stmt);
5059               gimple_assign_set_lhs (epilog_stmt, new_temp);
5060               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5061             }
5062
5063           /* 2.4  Extract the final scalar result.  Create:
5064              s_out3 = extract_field <v_out2, bitpos>  */
5065
5066           if (dump_enabled_p ())
5067             dump_printf_loc (MSG_NOTE, vect_location,
5068                              "extract scalar result\n");
5069
5070           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5071                         bitsize, bitsize_zero_node);
5072           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5073           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5074           gimple_assign_set_lhs (epilog_stmt, new_temp);
5075           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5076           scalar_results.safe_push (new_temp);
5077         }
5078       else
5079         {
5080           /* Case 3: Create:
5081              s = extract_field <v_out2, 0>
5082              for (offset = element_size;
5083                   offset < vector_size;
5084                   offset += element_size;)
5085                {
5086                  Create:  s' = extract_field <v_out2, offset>
5087                  Create:  s = op <s, s'>  // For non SLP cases
5088                }  */
5089
5090           if (dump_enabled_p ())
5091             dump_printf_loc (MSG_NOTE, vect_location,
5092                              "Reduce using scalar code.\n");
5093
5094           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5095           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5096             {
5097               int bit_offset;
5098               if (gimple_code (new_phi) == GIMPLE_PHI)
5099                 vec_temp = PHI_RESULT (new_phi);
5100               else
5101                 vec_temp = gimple_assign_lhs (new_phi);
5102               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5103                             bitsize_zero_node);
5104               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5105               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5106               gimple_assign_set_lhs (epilog_stmt, new_temp);
5107               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5108
5109               /* In SLP we don't need to apply reduction operation, so we just
5110                  collect s' values in SCALAR_RESULTS.  */
5111               if (slp_reduc)
5112                 scalar_results.safe_push (new_temp);
5113
5114               for (bit_offset = element_bitsize;
5115                    bit_offset < vec_size_in_bits;
5116                    bit_offset += element_bitsize)
5117                 {
5118                   tree bitpos = bitsize_int (bit_offset);
5119                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5120                                      bitsize, bitpos);
5121
5122                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5123                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5124                   gimple_assign_set_lhs (epilog_stmt, new_name);
5125                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5126
5127                   if (slp_reduc)
5128                     {
5129                       /* In SLP we don't need to apply reduction operation, so
5130                          we just collect s' values in SCALAR_RESULTS.  */
5131                       new_temp = new_name;
5132                       scalar_results.safe_push (new_name);
5133                     }
5134                   else
5135                     {
5136                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5137                                                          new_name, new_temp);
5138                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5139                       gimple_assign_set_lhs (epilog_stmt, new_temp);
5140                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5141                     }
5142                 }
5143             }
5144
5145           /* The only case where we need to reduce scalar results in SLP, is
5146              unrolling.  If the size of SCALAR_RESULTS is greater than
5147              GROUP_SIZE, we reduce them combining elements modulo
5148              GROUP_SIZE.  */
5149           if (slp_reduc)
5150             {
5151               tree res, first_res, new_res;
5152               gimple *new_stmt;
5153
5154               /* Reduce multiple scalar results in case of SLP unrolling.  */
5155               for (j = group_size; scalar_results.iterate (j, &res);
5156                    j++)
5157                 {
5158                   first_res = scalar_results[j % group_size];
5159                   new_stmt = gimple_build_assign (new_scalar_dest, code,
5160                                                   first_res, res);
5161                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
5162                   gimple_assign_set_lhs (new_stmt, new_res);
5163                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5164                   scalar_results[j % group_size] = new_res;
5165                 }
5166             }
5167           else
5168             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5169             scalar_results.safe_push (new_temp);
5170         }
5171
5172       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5173           == INTEGER_INDUC_COND_REDUCTION)
5174         {
5175           /* Earlier we set the initial value to be zero.  Check the result
5176              and if it is zero then replace with the original initial
5177              value.  */
5178           tree zero = build_zero_cst (scalar_type);
5179           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
5180
5181           tree tmp = make_ssa_name (new_scalar_dest);
5182           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5183                                              initial_def, new_temp);
5184           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5185           scalar_results[0] = tmp;
5186         }
5187     }
5188
5189 vect_finalize_reduction:
5190
5191   if (double_reduc)
5192     loop = loop->inner;
5193
5194   /* 2.5 Adjust the final result by the initial value of the reduction
5195          variable. (When such adjustment is not needed, then
5196          'adjustment_def' is zero).  For example, if code is PLUS we create:
5197          new_temp = loop_exit_def + adjustment_def  */
5198
5199   if (adjustment_def)
5200     {
5201       gcc_assert (!slp_reduc);
5202       if (nested_in_vect_loop)
5203         {
5204           new_phi = new_phis[0];
5205           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5206           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5207           new_dest = vect_create_destination_var (scalar_dest, vectype);
5208         }
5209       else
5210         {
5211           new_temp = scalar_results[0];
5212           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5213           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5214           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5215         }
5216
5217       epilog_stmt = gimple_build_assign (new_dest, expr);
5218       new_temp = make_ssa_name (new_dest, epilog_stmt);
5219       gimple_assign_set_lhs (epilog_stmt, new_temp);
5220       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5221       if (nested_in_vect_loop)
5222         {
5223           set_vinfo_for_stmt (epilog_stmt,
5224                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5225           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5226                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5227
5228           if (!double_reduc)
5229             scalar_results.quick_push (new_temp);
5230           else
5231             scalar_results[0] = new_temp;
5232         }
5233       else
5234         scalar_results[0] = new_temp;
5235
5236       new_phis[0] = epilog_stmt;
5237     }
5238
5239   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5240           phis with new adjusted scalar results, i.e., replace use <s_out0>
5241           with use <s_out4>.
5242
5243      Transform:
5244         loop_exit:
5245           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5246           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5247           v_out2 = reduce <v_out1>
5248           s_out3 = extract_field <v_out2, 0>
5249           s_out4 = adjust_result <s_out3>
5250           use <s_out0>
5251           use <s_out0>
5252
5253      into:
5254
5255         loop_exit:
5256           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5257           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5258           v_out2 = reduce <v_out1>
5259           s_out3 = extract_field <v_out2, 0>
5260           s_out4 = adjust_result <s_out3>
5261           use <s_out4>
5262           use <s_out4> */
5263
5264
5265   /* In SLP reduction chain we reduce vector results into one vector if
5266      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5267      the last stmt in the reduction chain, since we are looking for the loop
5268      exit phi node.  */
5269   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5270     {
5271       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5272       /* Handle reduction patterns.  */
5273       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5274         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5275
5276       scalar_dest = gimple_assign_lhs (dest_stmt);
5277       group_size = 1;
5278     }
5279
5280   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5281      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5282      need to match SCALAR_RESULTS with corresponding statements.  The first
5283      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5284      the first vector stmt, etc.
5285      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5286   if (group_size > new_phis.length ())
5287     {
5288       ratio = group_size / new_phis.length ();
5289       gcc_assert (!(group_size % new_phis.length ()));
5290     }
5291   else
5292     ratio = 1;
5293
5294   for (k = 0; k < group_size; k++)
5295     {
5296       if (k % ratio == 0)
5297         {
5298           epilog_stmt = new_phis[k / ratio];
5299           reduction_phi = reduction_phis[k / ratio];
5300           if (double_reduc)
5301             inner_phi = inner_phis[k / ratio];
5302         }
5303
5304       if (slp_reduc)
5305         {
5306           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5307
5308           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5309           /* SLP statements can't participate in patterns.  */
5310           gcc_assert (!orig_stmt);
5311           scalar_dest = gimple_assign_lhs (current_stmt);
5312         }
5313
5314       phis.create (3);
5315       /* Find the loop-closed-use at the loop exit of the original scalar
5316          result.  (The reduction result is expected to have two immediate uses -
5317          one at the latch block, and one at the loop exit).  */
5318       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5319         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5320             && !is_gimple_debug (USE_STMT (use_p)))
5321           phis.safe_push (USE_STMT (use_p));
5322
5323       /* While we expect to have found an exit_phi because of loop-closed-ssa
5324          form we can end up without one if the scalar cycle is dead.  */
5325
5326       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5327         {
5328           if (outer_loop)
5329             {
5330               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5331               gphi *vect_phi;
5332
5333               /* FORNOW. Currently not supporting the case that an inner-loop
5334                  reduction is not used in the outer-loop (but only outside the
5335                  outer-loop), unless it is double reduction.  */
5336               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5337                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5338                           || double_reduc);
5339
5340               if (double_reduc)
5341                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5342               else
5343                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5344               if (!double_reduc
5345                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5346                       != vect_double_reduction_def)
5347                 continue;
5348
5349               /* Handle double reduction:
5350
5351                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5352                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5353                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5354                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5355
5356                  At that point the regular reduction (stmt2 and stmt3) is
5357                  already vectorized, as well as the exit phi node, stmt4.
5358                  Here we vectorize the phi node of double reduction, stmt1, and
5359                  update all relevant statements.  */
5360
5361               /* Go through all the uses of s2 to find double reduction phi
5362                  node, i.e., stmt1 above.  */
5363               orig_name = PHI_RESULT (exit_phi);
5364               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5365                 {
5366                   stmt_vec_info use_stmt_vinfo;
5367                   stmt_vec_info new_phi_vinfo;
5368                   tree vect_phi_init, preheader_arg, vect_phi_res;
5369                   basic_block bb = gimple_bb (use_stmt);
5370                   gimple *use;
5371
5372                   /* Check that USE_STMT is really double reduction phi
5373                      node.  */
5374                   if (gimple_code (use_stmt) != GIMPLE_PHI
5375                       || gimple_phi_num_args (use_stmt) != 2
5376                       || bb->loop_father != outer_loop)
5377                     continue;
5378                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5379                   if (!use_stmt_vinfo
5380                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5381                           != vect_double_reduction_def)
5382                     continue;
5383
5384                   /* Create vector phi node for double reduction:
5385                      vs1 = phi <vs0, vs2>
5386                      vs1 was created previously in this function by a call to
5387                        vect_get_vec_def_for_operand and is stored in
5388                        vec_initial_def;
5389                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5390                      vs0 is created here.  */
5391
5392                   /* Create vector phi node.  */
5393                   vect_phi = create_phi_node (vec_initial_def, bb);
5394                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5395                                     loop_vec_info_for_loop (outer_loop));
5396                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5397
5398                   /* Create vs0 - initial def of the double reduction phi.  */
5399                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5400                                              loop_preheader_edge (outer_loop));
5401                   vect_phi_init = get_initial_def_for_reduction
5402                     (stmt, preheader_arg, NULL);
5403
5404                   /* Update phi node arguments with vs0 and vs2.  */
5405                   add_phi_arg (vect_phi, vect_phi_init,
5406                                loop_preheader_edge (outer_loop),
5407                                UNKNOWN_LOCATION);
5408                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5409                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5410                   if (dump_enabled_p ())
5411                     {
5412                       dump_printf_loc (MSG_NOTE, vect_location,
5413                                        "created double reduction phi node: ");
5414                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5415                     }
5416
5417                   vect_phi_res = PHI_RESULT (vect_phi);
5418
5419                   /* Replace the use, i.e., set the correct vs1 in the regular
5420                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5421                      loop is redundant.  */
5422                   use = reduction_phi;
5423                   for (j = 0; j < ncopies; j++)
5424                     {
5425                       edge pr_edge = loop_preheader_edge (loop);
5426                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5427                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5428                     }
5429                 }
5430             }
5431         }
5432
5433       phis.release ();
5434       if (nested_in_vect_loop)
5435         {
5436           if (double_reduc)
5437             loop = outer_loop;
5438           else
5439             continue;
5440         }
5441
5442       phis.create (3);
5443       /* Find the loop-closed-use at the loop exit of the original scalar
5444          result.  (The reduction result is expected to have two immediate uses,
5445          one at the latch block, and one at the loop exit).  For double
5446          reductions we are looking for exit phis of the outer loop.  */
5447       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5448         {
5449           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5450             {
5451               if (!is_gimple_debug (USE_STMT (use_p)))
5452                 phis.safe_push (USE_STMT (use_p));
5453             }
5454           else
5455             {
5456               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5457                 {
5458                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5459
5460                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5461                     {
5462                       if (!flow_bb_inside_loop_p (loop,
5463                                              gimple_bb (USE_STMT (phi_use_p)))
5464                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5465                         phis.safe_push (USE_STMT (phi_use_p));
5466                     }
5467                 }
5468             }
5469         }
5470
5471       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5472         {
5473           /* Replace the uses:  */
5474           orig_name = PHI_RESULT (exit_phi);
5475           scalar_result = scalar_results[k];
5476           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5477             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5478               SET_USE (use_p, scalar_result);
5479         }
5480
5481       phis.release ();
5482     }
5483 }
5484
5485
5486 /* Function is_nonwrapping_integer_induction.
5487
5488    Check if STMT (which is part of loop LOOP) both increments and
5489    does not cause overflow.  */
5490
5491 static bool
5492 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5493 {
5494   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5495   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5496   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5497   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5498   widest_int ni, max_loop_value, lhs_max;
5499   bool overflow = false;
5500
5501   /* Make sure the loop is integer based.  */
5502   if (TREE_CODE (base) != INTEGER_CST
5503       || TREE_CODE (step) != INTEGER_CST)
5504     return false;
5505
5506   /* Check that the induction increments.  */
5507   if (tree_int_cst_sgn (step) == -1)
5508     return false;
5509
5510   /* Check that the max size of the loop will not wrap.  */
5511
5512   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5513     return true;
5514
5515   if (! max_stmt_executions (loop, &ni))
5516     return false;
5517
5518   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5519                             &overflow);
5520   if (overflow)
5521     return false;
5522
5523   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5524                             TYPE_SIGN (lhs_type), &overflow);
5525   if (overflow)
5526     return false;
5527
5528   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5529           <= TYPE_PRECISION (lhs_type));
5530 }
5531
5532 /* Function vectorizable_reduction.
5533
5534    Check if STMT performs a reduction operation that can be vectorized.
5535    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5536    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5537    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5538
5539    This function also handles reduction idioms (patterns) that have been
5540    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5541    of this form:
5542      X = pattern_expr (arg0, arg1, ..., X)
5543    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5544    sequence that had been detected and replaced by the pattern-stmt (STMT).
5545
5546    This function also handles reduction of condition expressions, for example:
5547      for (int i = 0; i < N; i++)
5548        if (a[i] < value)
5549          last = a[i];
5550    This is handled by vectorising the loop and creating an additional vector
5551    containing the loop indexes for which "a[i] < value" was true.  In the
5552    function epilogue this is reduced to a single max value and then used to
5553    index into the vector of results.
5554
5555    In some cases of reduction patterns, the type of the reduction variable X is
5556    different than the type of the other arguments of STMT.
5557    In such cases, the vectype that is used when transforming STMT into a vector
5558    stmt is different than the vectype that is used to determine the
5559    vectorization factor, because it consists of a different number of elements
5560    than the actual number of elements that are being operated upon in parallel.
5561
5562    For example, consider an accumulation of shorts into an int accumulator.
5563    On some targets it's possible to vectorize this pattern operating on 8
5564    shorts at a time (hence, the vectype for purposes of determining the
5565    vectorization factor should be V8HI); on the other hand, the vectype that
5566    is used to create the vector form is actually V4SI (the type of the result).
5567
5568    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5569    indicates what is the actual level of parallelism (V8HI in the example), so
5570    that the right vectorization factor would be derived.  This vectype
5571    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5572    be used to create the vectorized stmt.  The right vectype for the vectorized
5573    stmt is obtained from the type of the result X:
5574         get_vectype_for_scalar_type (TREE_TYPE (X))
5575
5576    This means that, contrary to "regular" reductions (or "regular" stmts in
5577    general), the following equation:
5578       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5579    does *NOT* necessarily hold for reduction patterns.  */
5580
5581 bool
5582 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5583                         gimple **vec_stmt, slp_tree slp_node,
5584                         slp_instance slp_node_instance)
5585 {
5586   tree vec_dest;
5587   tree scalar_dest;
5588   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5589   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5590   tree vectype_in = NULL_TREE;
5591   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5592   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5593   enum tree_code code, orig_code, epilog_reduc_code;
5594   machine_mode vec_mode;
5595   int op_type;
5596   optab optab, reduc_optab;
5597   tree new_temp = NULL_TREE;
5598   gimple *def_stmt;
5599   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5600   tree scalar_type;
5601   bool is_simple_use;
5602   gimple *orig_stmt;
5603   stmt_vec_info orig_stmt_info = NULL;
5604   int i;
5605   int ncopies;
5606   int epilog_copies;
5607   stmt_vec_info prev_stmt_info, prev_phi_info;
5608   bool single_defuse_cycle = false;
5609   gimple *new_stmt = NULL;
5610   int j;
5611   tree ops[3];
5612   enum vect_def_type dts[3];
5613   bool nested_cycle = false, found_nested_cycle_def = false;
5614   bool double_reduc = false;
5615   basic_block def_bb;
5616   struct loop * def_stmt_loop, *outer_loop = NULL;
5617   tree def_arg;
5618   gimple *def_arg_stmt;
5619   auto_vec<tree> vec_oprnds0;
5620   auto_vec<tree> vec_oprnds1;
5621   auto_vec<tree> vec_oprnds2;
5622   auto_vec<tree> vect_defs;
5623   auto_vec<gimple *> phis;
5624   int vec_num;
5625   tree def0, tem;
5626   bool first_p = true;
5627   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5628   tree cond_reduc_val = NULL_TREE;
5629
5630   /* Make sure it was already recognized as a reduction computation.  */
5631   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5632       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5633     return false;
5634
5635   if (nested_in_vect_loop_p (loop, stmt))
5636     {
5637       outer_loop = loop;
5638       loop = loop->inner;
5639       nested_cycle = true;
5640     }
5641
5642   /* In case of reduction chain we switch to the first stmt in the chain, but
5643      we don't update STMT_INFO, since only the last stmt is marked as reduction
5644      and has reduction properties.  */
5645   if (GROUP_FIRST_ELEMENT (stmt_info)
5646       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5647     {
5648       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5649       first_p = false;
5650     }
5651
5652   if (gimple_code (stmt) == GIMPLE_PHI)
5653     {
5654       /* Analysis is fully done on the reduction stmt invocation.  */
5655       if (! vec_stmt)
5656         {
5657           if (slp_node)
5658             slp_node_instance->reduc_phis = slp_node;
5659
5660           STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5661           return true;
5662         }
5663
5664       gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5665       if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5666         reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5667
5668       gcc_assert (is_gimple_assign (reduc_stmt));
5669       for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5670         {
5671           tree op = gimple_op (reduc_stmt, k);
5672           if (op == gimple_phi_result (stmt))
5673             continue;
5674           if (k == 1
5675               && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5676             continue;
5677           tem = get_vectype_for_scalar_type (TREE_TYPE (op));
5678           if (! vectype_in
5679               || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
5680             vectype_in = tem;
5681           break;
5682         }
5683       gcc_assert (vectype_in);
5684
5685       if (slp_node)
5686         ncopies = 1;
5687       else
5688         ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5689
5690       use_operand_p use_p;
5691       gimple *use_stmt;
5692       if (ncopies > 1
5693           && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5694               <= vect_used_only_live)
5695           && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5696           && (use_stmt == reduc_stmt
5697               || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5698                   == reduc_stmt)))
5699         single_defuse_cycle = true;
5700
5701       /* Create the destination vector  */
5702       scalar_dest = gimple_assign_lhs (reduc_stmt);
5703       vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5704
5705       if (slp_node)
5706         /* The size vect_schedule_slp_instance computes is off for us.  */
5707         vec_num = ((LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5708                     * SLP_TREE_SCALAR_STMTS (slp_node).length ())
5709                    / TYPE_VECTOR_SUBPARTS (vectype_in));
5710       else
5711         vec_num = 1;
5712
5713       /* Generate the reduction PHIs upfront.  */
5714       prev_phi_info = NULL;
5715       for (j = 0; j < ncopies; j++)
5716         {
5717           if (j == 0 || !single_defuse_cycle)
5718             {
5719               for (i = 0; i < vec_num; i++)
5720                 {
5721                   /* Create the reduction-phi that defines the reduction
5722                      operand.  */
5723                   gimple *new_phi = create_phi_node (vec_dest, loop->header);
5724                   set_vinfo_for_stmt (new_phi,
5725                                       new_stmt_vec_info (new_phi, loop_vinfo));
5726
5727                   if (slp_node)
5728                     SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5729                   else
5730                     {
5731                       if (j == 0)
5732                         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5733                       else
5734                         STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5735                       prev_phi_info = vinfo_for_stmt (new_phi);
5736                     }
5737                 }
5738             }
5739         }
5740
5741       return true;
5742     }
5743
5744   /* 1. Is vectorizable reduction?  */
5745   /* Not supportable if the reduction variable is used in the loop, unless
5746      it's a reduction chain.  */
5747   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5748       && !GROUP_FIRST_ELEMENT (stmt_info))
5749     return false;
5750
5751   /* Reductions that are not used even in an enclosing outer-loop,
5752      are expected to be "live" (used out of the loop).  */
5753   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5754       && !STMT_VINFO_LIVE_P (stmt_info))
5755     return false;
5756
5757   /* 2. Has this been recognized as a reduction pattern?
5758
5759      Check if STMT represents a pattern that has been recognized
5760      in earlier analysis stages.  For stmts that represent a pattern,
5761      the STMT_VINFO_RELATED_STMT field records the last stmt in
5762      the original sequence that constitutes the pattern.  */
5763
5764   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5765   if (orig_stmt)
5766     {
5767       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5768       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5769       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5770     }
5771
5772   /* 3. Check the operands of the operation.  The first operands are defined
5773         inside the loop body. The last operand is the reduction variable,
5774         which is defined by the loop-header-phi.  */
5775
5776   gcc_assert (is_gimple_assign (stmt));
5777
5778   /* Flatten RHS.  */
5779   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5780     {
5781     case GIMPLE_BINARY_RHS:
5782       code = gimple_assign_rhs_code (stmt);
5783       op_type = TREE_CODE_LENGTH (code);
5784       gcc_assert (op_type == binary_op);
5785       ops[0] = gimple_assign_rhs1 (stmt);
5786       ops[1] = gimple_assign_rhs2 (stmt);
5787       break;
5788
5789     case GIMPLE_TERNARY_RHS:
5790       code = gimple_assign_rhs_code (stmt);
5791       op_type = TREE_CODE_LENGTH (code);
5792       gcc_assert (op_type == ternary_op);
5793       ops[0] = gimple_assign_rhs1 (stmt);
5794       ops[1] = gimple_assign_rhs2 (stmt);
5795       ops[2] = gimple_assign_rhs3 (stmt);
5796       break;
5797
5798     case GIMPLE_UNARY_RHS:
5799       return false;
5800
5801     default:
5802       gcc_unreachable ();
5803     }
5804
5805   if (code == COND_EXPR && slp_node)
5806     return false;
5807
5808   scalar_dest = gimple_assign_lhs (stmt);
5809   scalar_type = TREE_TYPE (scalar_dest);
5810   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5811       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5812     return false;
5813
5814   /* Do not try to vectorize bit-precision reductions.  */
5815   if (!type_has_mode_precision_p (scalar_type))
5816     return false;
5817
5818   /* All uses but the last are expected to be defined in the loop.
5819      The last use is the reduction variable.  In case of nested cycle this
5820      assumption is not true: we use reduc_index to record the index of the
5821      reduction variable.  */
5822   gimple *reduc_def_stmt = NULL;
5823   int reduc_index = -1;
5824   for (i = 0; i < op_type; i++)
5825     {
5826       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5827       if (i == 0 && code == COND_EXPR)
5828         continue;
5829
5830       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5831                                           &def_stmt, &dts[i], &tem);
5832       dt = dts[i];
5833       gcc_assert (is_simple_use);
5834       if (dt == vect_reduction_def)
5835         {
5836           reduc_def_stmt = def_stmt;
5837           reduc_index = i;
5838           continue;
5839         }
5840       else
5841         {
5842           if (!vectype_in)
5843             vectype_in = tem;
5844         }
5845
5846       if (dt != vect_internal_def
5847           && dt != vect_external_def
5848           && dt != vect_constant_def
5849           && dt != vect_induction_def
5850           && !(dt == vect_nested_cycle && nested_cycle))
5851         return false;
5852
5853       if (dt == vect_nested_cycle)
5854         {
5855           found_nested_cycle_def = true;
5856           reduc_def_stmt = def_stmt;
5857           reduc_index = i;
5858         }
5859
5860       if (i == 1 && code == COND_EXPR)
5861         {
5862           /* Record how value of COND_EXPR is defined.  */
5863           if (dt == vect_constant_def)
5864             {
5865               cond_reduc_dt = dt;
5866               cond_reduc_val = ops[i];
5867             }
5868           if (dt == vect_induction_def && def_stmt != NULL
5869               && is_nonwrapping_integer_induction (def_stmt, loop))
5870             cond_reduc_dt = dt;
5871         }
5872     }
5873
5874   if (!vectype_in)
5875     vectype_in = vectype_out;
5876
5877   /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5878      directly used in stmt.  */
5879   if (reduc_index == -1)
5880     {
5881       if (orig_stmt)
5882         reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5883       else
5884         reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5885     }
5886
5887   if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5888     return false;
5889
5890   if (!(reduc_index == -1
5891         || dts[reduc_index] == vect_reduction_def
5892         || dts[reduc_index] == vect_nested_cycle
5893         || ((dts[reduc_index] == vect_internal_def
5894              || dts[reduc_index] == vect_external_def
5895              || dts[reduc_index] == vect_constant_def
5896              || dts[reduc_index] == vect_induction_def)
5897             && nested_cycle && found_nested_cycle_def)))
5898     {
5899       /* For pattern recognized stmts, orig_stmt might be a reduction,
5900          but some helper statements for the pattern might not, or
5901          might be COND_EXPRs with reduction uses in the condition.  */
5902       gcc_assert (orig_stmt);
5903       return false;
5904     }
5905
5906   stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5907   enum vect_reduction_type v_reduc_type
5908     = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5909   gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5910
5911   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5912   /* If we have a condition reduction, see if we can simplify it further.  */
5913   if (v_reduc_type == COND_REDUCTION)
5914     {
5915       if (cond_reduc_dt == vect_induction_def)
5916         {
5917           if (dump_enabled_p ())
5918             dump_printf_loc (MSG_NOTE, vect_location,
5919                              "condition expression based on "
5920                              "integer induction.\n");
5921           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5922             = INTEGER_INDUC_COND_REDUCTION;
5923         }
5924
5925       /* Loop peeling modifies initial value of reduction PHI, which
5926          makes the reduction stmt to be transformed different to the
5927          original stmt analyzed.  We need to record reduction code for
5928          CONST_COND_REDUCTION type reduction at analyzing stage, thus
5929          it can be used directly at transform stage.  */
5930       if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
5931           || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
5932         {
5933           /* Also set the reduction type to CONST_COND_REDUCTION.  */
5934           gcc_assert (cond_reduc_dt == vect_constant_def);
5935           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
5936         }
5937       else if (cond_reduc_dt == vect_constant_def)
5938         {
5939           enum vect_def_type cond_initial_dt;
5940           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5941           tree cond_initial_val
5942             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
5943
5944           gcc_assert (cond_reduc_val != NULL_TREE);
5945           vect_is_simple_use (cond_initial_val, loop_vinfo,
5946                               &def_stmt, &cond_initial_dt);
5947           if (cond_initial_dt == vect_constant_def
5948               && types_compatible_p (TREE_TYPE (cond_initial_val),
5949                                      TREE_TYPE (cond_reduc_val)))
5950             {
5951               tree e = fold_binary (LE_EXPR, boolean_type_node,
5952                                     cond_initial_val, cond_reduc_val);
5953               if (e && (integer_onep (e) || integer_zerop (e)))
5954                 {
5955                   if (dump_enabled_p ())
5956                     dump_printf_loc (MSG_NOTE, vect_location,
5957                                      "condition expression based on "
5958                                      "compile time constant.\n");
5959                   /* Record reduction code at analysis stage.  */
5960                   STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
5961                     = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
5962                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5963                     = CONST_COND_REDUCTION;
5964                 }
5965             }
5966         }
5967     }
5968
5969   if (orig_stmt)
5970     gcc_assert (tmp == orig_stmt
5971                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5972   else
5973     /* We changed STMT to be the first stmt in reduction chain, hence we
5974        check that in this case the first element in the chain is STMT.  */
5975     gcc_assert (stmt == tmp
5976                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5977
5978   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5979     return false;
5980
5981   if (slp_node)
5982     ncopies = 1;
5983   else
5984     ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5985
5986   gcc_assert (ncopies >= 1);
5987
5988   vec_mode = TYPE_MODE (vectype_in);
5989
5990   if (code == COND_EXPR)
5991     {
5992       /* Only call during the analysis stage, otherwise we'll lose
5993          STMT_VINFO_TYPE.  */
5994       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5995                                                 ops[reduc_index], 0, NULL))
5996         {
5997           if (dump_enabled_p ())
5998             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5999                              "unsupported condition in reduction\n");
6000           return false;
6001         }
6002     }
6003   else
6004     {
6005       /* 4. Supportable by target?  */
6006
6007       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6008           || code == LROTATE_EXPR || code == RROTATE_EXPR)
6009         {
6010           /* Shifts and rotates are only supported by vectorizable_shifts,
6011              not vectorizable_reduction.  */
6012           if (dump_enabled_p ())
6013             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6014                              "unsupported shift or rotation.\n");
6015           return false;
6016         }
6017
6018       /* 4.1. check support for the operation in the loop  */
6019       optab = optab_for_tree_code (code, vectype_in, optab_default);
6020       if (!optab)
6021         {
6022           if (dump_enabled_p ())
6023             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6024                              "no optab.\n");
6025
6026           return false;
6027         }
6028
6029       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6030         {
6031           if (dump_enabled_p ())
6032             dump_printf (MSG_NOTE, "op not supported by target.\n");
6033
6034           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6035               || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6036             return false;
6037
6038           if (dump_enabled_p ())
6039             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6040         }
6041
6042       /* Worthwhile without SIMD support?  */
6043       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6044           && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6045         {
6046           if (dump_enabled_p ())
6047             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6048                              "not worthwhile without SIMD support.\n");
6049
6050           return false;
6051         }
6052     }
6053
6054   /* 4.2. Check support for the epilog operation.
6055
6056           If STMT represents a reduction pattern, then the type of the
6057           reduction variable may be different than the type of the rest
6058           of the arguments.  For example, consider the case of accumulation
6059           of shorts into an int accumulator; The original code:
6060                         S1: int_a = (int) short_a;
6061           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
6062
6063           was replaced with:
6064                         STMT: int_acc = widen_sum <short_a, int_acc>
6065
6066           This means that:
6067           1. The tree-code that is used to create the vector operation in the
6068              epilog code (that reduces the partial results) is not the
6069              tree-code of STMT, but is rather the tree-code of the original
6070              stmt from the pattern that STMT is replacing.  I.e, in the example
6071              above we want to use 'widen_sum' in the loop, but 'plus' in the
6072              epilog.
6073           2. The type (mode) we use to check available target support
6074              for the vector operation to be created in the *epilog*, is
6075              determined by the type of the reduction variable (in the example
6076              above we'd check this: optab_handler (plus_optab, vect_int_mode)).
6077              However the type (mode) we use to check available target support
6078              for the vector operation to be created *inside the loop*, is
6079              determined by the type of the other arguments to STMT (in the
6080              example we'd check this: optab_handler (widen_sum_optab,
6081              vect_short_mode)).
6082
6083           This is contrary to "regular" reductions, in which the types of all
6084           the arguments are the same as the type of the reduction variable.
6085           For "regular" reductions we can therefore use the same vector type
6086           (and also the same tree-code) when generating the epilog code and
6087           when generating the code inside the loop.  */
6088
6089   if (orig_stmt)
6090     {
6091       /* This is a reduction pattern: get the vectype from the type of the
6092          reduction variable, and get the tree-code from orig_stmt.  */
6093       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6094                   == TREE_CODE_REDUCTION);
6095       orig_code = gimple_assign_rhs_code (orig_stmt);
6096       gcc_assert (vectype_out);
6097       vec_mode = TYPE_MODE (vectype_out);
6098     }
6099   else
6100     {
6101       /* Regular reduction: the same vectype and tree-code as used for
6102          the vector code inside the loop can be used for the epilog code.  */
6103       orig_code = code;
6104
6105       if (code == MINUS_EXPR)
6106         orig_code = PLUS_EXPR;
6107
6108       /* For simple condition reductions, replace with the actual expression
6109          we want to base our reduction around.  */
6110       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6111         {
6112           orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6113           gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6114         }
6115       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6116                  == INTEGER_INDUC_COND_REDUCTION)
6117         orig_code = MAX_EXPR;
6118     }
6119
6120   if (nested_cycle)
6121     {
6122       def_bb = gimple_bb (reduc_def_stmt);
6123       def_stmt_loop = def_bb->loop_father;
6124       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6125                                        loop_preheader_edge (def_stmt_loop));
6126       if (TREE_CODE (def_arg) == SSA_NAME
6127           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6128           && gimple_code (def_arg_stmt) == GIMPLE_PHI
6129           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6130           && vinfo_for_stmt (def_arg_stmt)
6131           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6132               == vect_double_reduction_def)
6133         double_reduc = true;
6134     }
6135
6136   epilog_reduc_code = ERROR_MARK;
6137
6138   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6139     {
6140       if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
6141         {
6142           reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
6143                                          optab_default);
6144           if (!reduc_optab)
6145             {
6146               if (dump_enabled_p ())
6147                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6148                                  "no optab for reduction.\n");
6149
6150               epilog_reduc_code = ERROR_MARK;
6151             }
6152           else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
6153             {
6154               if (dump_enabled_p ())
6155                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6156                                  "reduc op not supported by target.\n");
6157
6158               epilog_reduc_code = ERROR_MARK;
6159             }
6160         }
6161       else
6162         {
6163           if (!nested_cycle || double_reduc)
6164             {
6165               if (dump_enabled_p ())
6166                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6167                                  "no reduc code for scalar code.\n");
6168
6169               return false;
6170             }
6171         }
6172     }
6173   else
6174     {
6175       int scalar_precision
6176         = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6177       cr_index_scalar_type = make_unsigned_type (scalar_precision);
6178       cr_index_vector_type = build_vector_type
6179         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
6180
6181       optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
6182                                    optab_default);
6183       if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
6184           != CODE_FOR_nothing)
6185         epilog_reduc_code = REDUC_MAX_EXPR;
6186     }
6187
6188   if ((double_reduc
6189        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6190       && ncopies > 1)
6191     {
6192       if (dump_enabled_p ())
6193         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6194                          "multiple types in double reduction or condition "
6195                          "reduction.\n");
6196       return false;
6197     }
6198
6199   /* In case of widening multiplication by a constant, we update the type
6200      of the constant to be the type of the other operand.  We check that the
6201      constant fits the type in the pattern recognition pass.  */
6202   if (code == DOT_PROD_EXPR
6203       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6204     {
6205       if (TREE_CODE (ops[0]) == INTEGER_CST)
6206         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6207       else if (TREE_CODE (ops[1]) == INTEGER_CST)
6208         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6209       else
6210         {
6211           if (dump_enabled_p ())
6212             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6213                              "invalid types in dot-prod\n");
6214
6215           return false;
6216         }
6217     }
6218
6219   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6220     {
6221       widest_int ni;
6222
6223       if (! max_loop_iterations (loop, &ni))
6224         {
6225           if (dump_enabled_p ())
6226             dump_printf_loc (MSG_NOTE, vect_location,
6227                              "loop count not known, cannot create cond "
6228                              "reduction.\n");
6229           return false;
6230         }
6231       /* Convert backedges to iterations.  */
6232       ni += 1;
6233
6234       /* The additional index will be the same type as the condition.  Check
6235          that the loop can fit into this less one (because we'll use up the
6236          zero slot for when there are no matches).  */
6237       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6238       if (wi::geu_p (ni, wi::to_widest (max_index)))
6239         {
6240           if (dump_enabled_p ())
6241             dump_printf_loc (MSG_NOTE, vect_location,
6242                              "loop size is greater than data size.\n");
6243           return false;
6244         }
6245     }
6246
6247   /* In case the vectorization factor (VF) is bigger than the number
6248      of elements that we can fit in a vectype (nunits), we have to generate
6249      more than one vector stmt - i.e - we need to "unroll" the
6250      vector stmt by a factor VF/nunits.  For more details see documentation
6251      in vectorizable_operation.  */
6252
6253   /* If the reduction is used in an outer loop we need to generate
6254      VF intermediate results, like so (e.g. for ncopies=2):
6255         r0 = phi (init, r0)
6256         r1 = phi (init, r1)
6257         r0 = x0 + r0;
6258         r1 = x1 + r1;
6259     (i.e. we generate VF results in 2 registers).
6260     In this case we have a separate def-use cycle for each copy, and therefore
6261     for each copy we get the vector def for the reduction variable from the
6262     respective phi node created for this copy.
6263
6264     Otherwise (the reduction is unused in the loop nest), we can combine
6265     together intermediate results, like so (e.g. for ncopies=2):
6266         r = phi (init, r)
6267         r = x0 + r;
6268         r = x1 + r;
6269    (i.e. we generate VF/2 results in a single register).
6270    In this case for each copy we get the vector def for the reduction variable
6271    from the vectorized reduction operation generated in the previous iteration.
6272
6273    This only works when we see both the reduction PHI and its only consumer
6274    in vectorizable_reduction and there are no intermediate stmts
6275    participating.  */
6276   use_operand_p use_p;
6277   gimple *use_stmt;
6278   if (ncopies > 1
6279       && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6280       && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6281       && (use_stmt == stmt
6282           || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6283     {
6284       single_defuse_cycle = true;
6285       epilog_copies = 1;
6286     }
6287   else
6288     epilog_copies = ncopies;
6289
6290   /* If the reduction stmt is one of the patterns that have lane
6291      reduction embedded we cannot handle the case of ! single_defuse_cycle.  */
6292   if ((ncopies > 1
6293        && ! single_defuse_cycle)
6294       && (code == DOT_PROD_EXPR
6295           || code == WIDEN_SUM_EXPR
6296           || code == SAD_EXPR))
6297     {
6298       if (dump_enabled_p ())
6299         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6300                          "multi def-use cycle not possible for lane-reducing "
6301                          "reduction operation\n");
6302       return false;
6303     }
6304
6305   if (!vec_stmt) /* transformation not required.  */
6306     {
6307       if (first_p)
6308         vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
6309       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6310       return true;
6311     }
6312
6313   /* Transform.  */
6314
6315   if (dump_enabled_p ())
6316     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6317
6318   /* FORNOW: Multiple types are not supported for condition.  */
6319   if (code == COND_EXPR)
6320     gcc_assert (ncopies == 1);
6321
6322   /* Create the destination vector  */
6323   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6324
6325   prev_stmt_info = NULL;
6326   prev_phi_info = NULL;
6327   if (slp_node)
6328     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6329   else
6330     {
6331       vec_num = 1;
6332       vec_oprnds0.create (1);
6333       vec_oprnds1.create (1);
6334       if (op_type == ternary_op)
6335         vec_oprnds2.create (1);
6336     }
6337
6338   phis.create (vec_num);
6339   vect_defs.create (vec_num);
6340   if (!slp_node)
6341     vect_defs.quick_push (NULL_TREE);
6342
6343   if (slp_node)
6344     phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6345   else
6346     phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6347
6348   for (j = 0; j < ncopies; j++)
6349     {
6350       if (code == COND_EXPR)
6351         {
6352           gcc_assert (!slp_node);
6353           vectorizable_condition (stmt, gsi, vec_stmt,
6354                                   PHI_RESULT (phis[0]),
6355                                   reduc_index, NULL);
6356           /* Multiple types are not supported for condition.  */
6357           break;
6358         }
6359
6360       /* Handle uses.  */
6361       if (j == 0)
6362         {
6363           if (slp_node)
6364             {
6365               /* Get vec defs for all the operands except the reduction index,
6366                  ensuring the ordering of the ops in the vector is kept.  */
6367               auto_vec<tree, 3> slp_ops;
6368               auto_vec<vec<tree>, 3> vec_defs;
6369
6370               slp_ops.quick_push (ops[0]);
6371               slp_ops.quick_push (ops[1]);
6372               if (op_type == ternary_op)
6373                 slp_ops.quick_push (ops[2]);
6374
6375               vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6376
6377               vec_oprnds0.safe_splice (vec_defs[0]);
6378               vec_defs[0].release ();
6379               vec_oprnds1.safe_splice (vec_defs[1]);
6380               vec_defs[1].release ();
6381               if (op_type == ternary_op)
6382                 {
6383                   vec_oprnds2.safe_splice (vec_defs[2]);
6384                   vec_defs[2].release ();
6385                 }
6386             }
6387           else
6388             {
6389               vec_oprnds0.quick_push
6390                 (vect_get_vec_def_for_operand (ops[0], stmt));
6391               vec_oprnds1.quick_push
6392                 (vect_get_vec_def_for_operand (ops[1], stmt));
6393               if (op_type == ternary_op)
6394                 vec_oprnds2.quick_push
6395                   (vect_get_vec_def_for_operand (ops[2], stmt));
6396             }
6397         }
6398       else
6399         {
6400           if (!slp_node)
6401             {
6402               gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6403
6404               if (single_defuse_cycle && reduc_index == 0)
6405                 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6406               else
6407                 vec_oprnds0[0]
6408                   = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6409               if (single_defuse_cycle && reduc_index == 1)
6410                 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6411               else
6412                 vec_oprnds1[0]
6413                   = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6414               if (op_type == ternary_op)
6415                 {
6416                   if (single_defuse_cycle && reduc_index == 2)
6417                     vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6418                   else
6419                     vec_oprnds2[0]
6420                       = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6421                 }
6422             }
6423         }
6424
6425       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6426         {
6427           tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6428           if (op_type == ternary_op)
6429             vop[2] = vec_oprnds2[i];
6430
6431           new_temp = make_ssa_name (vec_dest, new_stmt);
6432           new_stmt = gimple_build_assign (new_temp, code,
6433                                           vop[0], vop[1], vop[2]);
6434           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6435
6436           if (slp_node)
6437             {
6438               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6439               vect_defs.quick_push (new_temp);
6440             }
6441           else
6442             vect_defs[0] = new_temp;
6443         }
6444
6445       if (slp_node)
6446         continue;
6447
6448       if (j == 0)
6449         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6450       else
6451         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6452
6453       prev_stmt_info = vinfo_for_stmt (new_stmt);
6454     }
6455
6456   /* Finalize the reduction-phi (set its arguments) and create the
6457      epilog reduction code.  */
6458   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6459     vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6460
6461   vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6462                                     epilog_copies,
6463                                     epilog_reduc_code, phis,
6464                                     double_reduc, slp_node, slp_node_instance);
6465
6466   return true;
6467 }
6468
6469 /* Function vect_min_worthwhile_factor.
6470
6471    For a loop where we could vectorize the operation indicated by CODE,
6472    return the minimum vectorization factor that makes it worthwhile to use
6473    generic vectors.  Codes not listed below return INT_MAX.  */
6474 int
6475 vect_min_worthwhile_factor (enum tree_code code)
6476 {
6477   switch (code)
6478     {
6479     case PLUS_EXPR:
6480     case MINUS_EXPR:
6481     case NEGATE_EXPR:
6482       return 4;  /* Add, subtract and negate share a minimum of 4.  */
6483
6484     case BIT_AND_EXPR:
6485     case BIT_IOR_EXPR:
6486     case BIT_XOR_EXPR:
6487     case BIT_NOT_EXPR:
6488       return 2;  /* The bitwise codes share a minimum of 2.  */
6489
6490     default:
6491       return INT_MAX;  /* Any other CODE: effectively never worthwhile.  */
6492     }
6493 }
6494
6495 /* Return true if VINFO describes loop vectorization and that loop's
6496    vectorization factor is at least vect_min_worthwhile_factor (CODE), i.e.
6497    it is worth decomposing CODE operations into scalar operations.  */
6498
6499 bool
6500 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6501 {
6502   loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6503   return (loop_vinfo  /* Null LOOP_VINFO: the result is false.  */
6504           && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6505               >= vect_min_worthwhile_factor (code)));
6506 }
6507
6508 /* Function vectorizable_induction
6509
6510    Check if PHI performs an induction computation that can be vectorized.
6511    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6512    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6513    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6514
6515 bool
6516 vectorizable_induction (gimple *phi,
6517                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6518                         gimple **vec_stmt, slp_tree slp_node)
6519 {
6520   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6521   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6522   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6523   unsigned ncopies;
6524   bool nested_in_vect_loop = false;
6525   struct loop *iv_loop;
6526   tree vec_def;
6527   edge pe = loop_preheader_edge (loop);
6528   basic_block new_bb;
6529   tree new_vec, vec_init, vec_step, t;
6530   tree new_name;
6531   gimple *new_stmt;
6532   gphi *induction_phi;
6533   tree induc_def, vec_dest;
6534   tree init_expr, step_expr;
6535   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6536   unsigned i;
6537   tree expr;
6538   gimple_seq stmts;
6539   imm_use_iterator imm_iter;
6540   use_operand_p use_p;
6541   gimple *exit_phi;
6542   edge latch_e;
6543   tree loop_arg;
6544   gimple_stmt_iterator si;
6545   basic_block bb = gimple_bb (phi);
6546
6547   if (gimple_code (phi) != GIMPLE_PHI)
6548     return false;
6549
6550   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6551     return false;
6552
6553   /* Make sure it was recognized as induction computation.  */
6554   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6555     return false;
6556
6557   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6558   unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6559
6560   if (slp_node)
6561     ncopies = 1;
6562   else
6563     ncopies = vect_get_num_copies (loop_vinfo, vectype);
6564   gcc_assert (ncopies >= 1);
6565
6566   /* FORNOW. These restrictions should be relaxed.  */
6567   if (nested_in_vect_loop_p (loop, phi))
6568     {
6569       imm_use_iterator imm_iter;
6570       use_operand_p use_p;
6571       gimple *exit_phi;
6572       edge latch_e;
6573       tree loop_arg;
6574
6575       if (ncopies > 1)
6576         {
6577           if (dump_enabled_p ())
6578             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6579                              "multiple types in nested loop.\n");
6580           return false;
6581         }
6582
6583       /* FORNOW: outer loop induction with SLP not supported.  */
6584       if (STMT_SLP_TYPE (stmt_info))
6585         return false;
6586
6587       exit_phi = NULL;
6588       latch_e = loop_latch_edge (loop->inner);
6589       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6590       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6591         {
6592           gimple *use_stmt = USE_STMT (use_p);
6593           if (is_gimple_debug (use_stmt))
6594             continue;
6595
6596           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6597             {
6598               exit_phi = use_stmt;
6599               break;
6600             }
6601         }
6602       if (exit_phi)
6603         {
6604           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6605           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6606                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6607             {
6608               if (dump_enabled_p ())
6609                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6610                                  "inner-loop induction only used outside "
6611                                  "of the outer vectorized loop.\n");
6612               return false;
6613             }
6614         }
6615
6616       nested_in_vect_loop = true;
6617       iv_loop = loop->inner;
6618     }
6619   else
6620     iv_loop = loop;
6621   gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6622
6623   if (!vec_stmt) /* transformation not required.  */
6624     {
6625       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6626       if (dump_enabled_p ())
6627         dump_printf_loc (MSG_NOTE, vect_location,
6628                          "=== vectorizable_induction ===\n");
6629       vect_model_induction_cost (stmt_info, ncopies);
6630       return true;
6631     }
6632
6633   /* Transform.  */
6634
6635   /* Compute a vector variable, initialized with the first VF values of
6636      the induction variable.  E.g., for an iv with IV_PHI='X' and
6637      evolution S, for a vector of 4 units, we want to compute:
6638      [X, X + S, X + 2*S, X + 3*S].  */
6639
6640   if (dump_enabled_p ())
6641     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6642
6643   latch_e = loop_latch_edge (iv_loop);
6644   loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6645
6646   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6647   gcc_assert (step_expr != NULL_TREE);
6648
6649   pe = loop_preheader_edge (iv_loop);
6650   init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6651                                      loop_preheader_edge (iv_loop));
6652
6653   /* Convert the step to the desired type.  */
6654   stmts = NULL;
6655   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6656   if (stmts)
6657     {
6658       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6659       gcc_assert (!new_bb);
6660     }
6661
6662   /* Find the first insertion point in the BB.  */
6663   si = gsi_after_labels (bb);
6664
6665   /* For SLP induction we have to generate several IVs as for example
6666      with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6667      [i + 2*S, i + 3*S, i + 3*S, i + 3*S].  The step is the same uniform
6668      [VF*S, VF*S, VF*S, VF*S] for all.  */
6669   if (slp_node)
6670     {
6671       /* Convert the init to the desired type.  */
6672       stmts = NULL;
6673       init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6674       if (stmts)
6675         {
6676           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6677           gcc_assert (!new_bb);
6678         }
6679
6680       /* Generate [VF*S, VF*S, ... ].  */
6681       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6682         {
6683           expr = build_int_cst (integer_type_node, vf);
6684           expr = fold_convert (TREE_TYPE (step_expr), expr);
6685         }
6686       else
6687         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6688       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6689                               expr, step_expr);
6690       if (! CONSTANT_CLASS_P (new_name))
6691         new_name = vect_init_vector (phi, new_name,
6692                                      TREE_TYPE (step_expr), NULL);
6693       new_vec = build_vector_from_val (vectype, new_name);
6694       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6695
6696       /* Now generate the IVs.  */
6697       unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6698       unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6699       unsigned elts = nunits * nvects;
6700       unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6701       gcc_assert (elts % group_size == 0);
6702       tree elt = init_expr;
6703       unsigned ivn;
6704       for (ivn = 0; ivn < nivs; ++ivn)
6705         {
6706           auto_vec<tree, 32> elts (nunits);
6707           stmts = NULL;
6708           for (unsigned eltn = 0; eltn < nunits; ++eltn)
6709             {
6710               if (ivn*nunits + eltn >= group_size
6711                   && (ivn*nunits + eltn) % group_size == 0)
6712                 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6713                                     elt, step_expr);
6714               elts.quick_push (elt);
6715             }
6716           vec_init = gimple_build_vector (&stmts, vectype, elts);
6717           if (stmts)
6718             {
6719               new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6720               gcc_assert (!new_bb);
6721             }
6722
6723           /* Create the induction-phi that defines the induction-operand.  */
6724           vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6725           induction_phi = create_phi_node (vec_dest, iv_loop->header);
6726           set_vinfo_for_stmt (induction_phi,
6727                               new_stmt_vec_info (induction_phi, loop_vinfo));
6728           induc_def = PHI_RESULT (induction_phi);
6729
6730           /* Create the iv update inside the loop  */
6731           vec_def = make_ssa_name (vec_dest);
6732           new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6733           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6734           set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6735
6736           /* Set the arguments of the phi node:  */
6737           add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6738           add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6739                        UNKNOWN_LOCATION);
6740
6741           SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6742         }
6743
6744       /* Re-use IVs when we can.  */
6745       if (ivn < nvects)
6746         {
6747           unsigned vfp
6748             = least_common_multiple (group_size, nunits) / group_size;
6749           /* Generate [VF'*S, VF'*S, ... ].  */
6750           if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6751             {
6752               expr = build_int_cst (integer_type_node, vfp);
6753               expr = fold_convert (TREE_TYPE (step_expr), expr);
6754             }
6755           else
6756             expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6757           new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6758                                   expr, step_expr);
6759           if (! CONSTANT_CLASS_P (new_name))
6760             new_name = vect_init_vector (phi, new_name,
6761                                          TREE_TYPE (step_expr), NULL);
6762           new_vec = build_vector_from_val (vectype, new_name);
6763           vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6764           for (; ivn < nvects; ++ivn)
6765             {
6766               gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6767               tree def;
6768               if (gimple_code (iv) == GIMPLE_PHI)
6769                 def = gimple_phi_result (iv);
6770               else
6771                 def = gimple_assign_lhs (iv);
6772               new_stmt = gimple_build_assign (make_ssa_name (vectype),
6773                                               PLUS_EXPR,
6774                                               def, vec_step);
6775               if (gimple_code (iv) == GIMPLE_PHI)
6776                 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6777               else
6778                 {
6779                   gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6780                   gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6781                 }
6782               set_vinfo_for_stmt (new_stmt,
6783                                   new_stmt_vec_info (new_stmt, loop_vinfo));
6784               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6785             }
6786         }
6787
6788       return true;
6789     }
6790
6791   /* Create the vector that holds the initial_value of the induction.  */
6792   if (nested_in_vect_loop)
6793     {
6794       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
6795          been created during vectorization of previous stmts.  We obtain it
6796          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
6797       vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6798       /* If the initial value is not of proper type, convert it.  */
6799       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6800         {
6801           new_stmt
6802             = gimple_build_assign (vect_get_new_ssa_name (vectype,
6803                                                           vect_simple_var,
6804                                                           "vec_iv_"),
6805                                    VIEW_CONVERT_EXPR,
6806                                    build1 (VIEW_CONVERT_EXPR, vectype,
6807                                            vec_init));
6808           vec_init = gimple_assign_lhs (new_stmt);
6809           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6810                                                  new_stmt);
6811           gcc_assert (!new_bb);
6812           set_vinfo_for_stmt (new_stmt,
6813                               new_stmt_vec_info (new_stmt, loop_vinfo));
6814         }
6815     }
6816   else
6817     {
6818       /* iv_loop is the loop to be vectorized. Create:
6819          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
6820       stmts = NULL;
6821       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6822
6823       auto_vec<tree, 32> elts (nunits);
6824       elts.quick_push (new_name);
6825       for (i = 1; i < nunits; i++)
6826         {
6827           /* Create: new_name_i = new_name + step_expr  */
6828           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6829                                    new_name, step_expr);
6830           elts.quick_push (new_name);
6831         }
6832       /* Create a vector from [new_name_0, new_name_1, ...,
6833          new_name_nunits-1]  */
6834       vec_init = gimple_build_vector (&stmts, vectype, elts);
6835       if (stmts)
6836         {
6837           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6838           gcc_assert (!new_bb);
6839         }
6840     }
6841
6842
6843   /* Create the vector that holds the step of the induction.  */
6844   if (nested_in_vect_loop)
6845     /* iv_loop is nested in the loop to be vectorized. Generate:
6846        vec_step = [S, S, S, S]  */
6847     new_name = step_expr;
6848   else
6849     {
6850       /* iv_loop is the loop to be vectorized. Generate:
6851           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
6852       gimple_seq seq = NULL;
6853       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6854         {
6855           expr = build_int_cst (integer_type_node, vf);
6856           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6857         }
6858       else
6859         expr = build_int_cst (TREE_TYPE (step_expr), vf);
6860       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6861                                expr, step_expr);
6862       if (seq)
6863         {
6864           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6865           gcc_assert (!new_bb);
6866         }
6867     }
6868
6869   t = unshare_expr (new_name);
6870   gcc_assert (CONSTANT_CLASS_P (new_name)
6871               || TREE_CODE (new_name) == SSA_NAME);
6872   new_vec = build_vector_from_val (vectype, t);
6873   vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6874
6875
6876   /* Create the following def-use cycle:
6877      loop prolog:
6878          vec_init = ...
6879          vec_step = ...
6880      loop:
6881          vec_iv = PHI <vec_init, vec_loop>
6882          ...
6883          STMT
6884          ...
6885          vec_loop = vec_iv + vec_step;  */
6886
6887   /* Create the induction-phi that defines the induction-operand.  */
6888   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6889   induction_phi = create_phi_node (vec_dest, iv_loop->header);
6890   set_vinfo_for_stmt (induction_phi,
6891                       new_stmt_vec_info (induction_phi, loop_vinfo));
6892   induc_def = PHI_RESULT (induction_phi);
6893
6894   /* Create the iv update inside the loop  */
6895   vec_def = make_ssa_name (vec_dest);
6896   new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6897   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6898   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6899
6900   /* Set the arguments of the phi node:  */
6901   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6902   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6903                UNKNOWN_LOCATION);
6904
6905   STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
6906
6907   /* In case that vectorization factor (VF) is bigger than the number
6908      of elements that we can fit in a vectype (nunits), we have to generate
6909      more than one vector stmt - i.e - we need to "unroll" the
6910      vector stmt by a factor VF/nunits.  For more details see documentation
6911      in vectorizable_operation.  */
6912
6913   if (ncopies > 1)
6914     {
6915       gimple_seq seq = NULL;
6916       stmt_vec_info prev_stmt_vinfo;
6917       /* FORNOW. This restriction should be relaxed.  */
6918       gcc_assert (!nested_in_vect_loop);
6919
6920       /* Create the vector that holds the step of the induction.  */
6921       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6922         {
6923           expr = build_int_cst (integer_type_node, nunits);
6924           expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6925         }
6926       else
6927         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
6928       new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6929                                expr, step_expr);
6930       if (seq)
6931         {
6932           new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6933           gcc_assert (!new_bb);
6934         }
6935
6936       t = unshare_expr (new_name);
6937       gcc_assert (CONSTANT_CLASS_P (new_name)
6938                   || TREE_CODE (new_name) == SSA_NAME);
6939       new_vec = build_vector_from_val (vectype, t);
6940       vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6941
6942       vec_def = induc_def;
6943       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
6944       for (i = 1; i < ncopies; i++)
6945         {
6946           /* vec_i = vec_prev + vec_step  */
6947           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
6948                                           vec_def, vec_step);
6949           vec_def = make_ssa_name (vec_dest, new_stmt);
6950           gimple_assign_set_lhs (new_stmt, vec_def);
6951
6952           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6953           set_vinfo_for_stmt (new_stmt,
6954                               new_stmt_vec_info (new_stmt, loop_vinfo));
6955           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
6956           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
6957         }
6958     }
6959
6960   if (nested_in_vect_loop)
6961     {
6962       /* Find the loop-closed exit-phi of the induction, and record
6963          the final vector of induction results:  */
6964       exit_phi = NULL;
6965       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6966         {
6967           gimple *use_stmt = USE_STMT (use_p);
6968           if (is_gimple_debug (use_stmt))
6969             continue;
6970
6971           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
6972             {
6973               exit_phi = use_stmt;
6974               break;
6975             }
6976         }
6977       if (exit_phi)
6978         {
6979           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
6980           /* FORNOW. Currently not supporting the case that an inner-loop induction
6981              is not used in the outer-loop (i.e. only outside the outer-loop).  */
6982           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
6983                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
6984
6985           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
6986           if (dump_enabled_p ())
6987             {
6988               dump_printf_loc (MSG_NOTE, vect_location,
6989                                "vector of inductions after inner-loop:");
6990               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
6991             }
6992         }
6993     }
6994
6995
6996   if (dump_enabled_p ())
6997     {
6998       dump_printf_loc (MSG_NOTE, vect_location,
6999                        "transform induction: created def-use cycle: ");
7000       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7001       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7002                         SSA_NAME_DEF_STMT (vec_def), 0);
7003     }
7004
7005   return true;
7006 }
7007
7008 /* Function vectorizable_live_operation.
7009
7010    STMT computes a value that is used outside the loop.  Check if
7011    it can be supported.

        If VEC_STMT is null only the check is performed.  Otherwise the scalar
        result is extracted from the vectorized definition with a
        BIT_FIELD_REF whose stmts are inserted on the single exit edge of the
        loop, and the non-debug uses of STMT's result outside the loop are
        replaced by the extracted value.

        SLP_NODE is the SLP node containing STMT, or null when STMT is not
        handled through SLP; SLP_INDEX is then the index of STMT among the
        scalar stmts of that node and must not be negative.  GSI is unused.

        Return false for reduction definitions and for stmts for which
        nested_in_vect_loop_p holds, true otherwise.  A live stmt that is not
        relevant is left in place, unvectorized.  */
7012
7013 bool
7014 vectorizable_live_operation (gimple *stmt,
7015                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7016                              slp_tree slp_node, int slp_index,
7017                              gimple **vec_stmt)
7018 {
7019   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7020   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7021   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7022   imm_use_iterator imm_iter;
7023   tree lhs, lhs_type, bitsize, vec_bitsize;
7024   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7025   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
7026   int ncopies;
7027   gimple *use_stmt;
7029
7030   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7031
7032   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7033     return false;
7034
7035   /* FORNOW.  CHECKME.  */
7036   if (nested_in_vect_loop_p (loop, stmt))
7037     return false;
7038
7039   /* If STMT is not relevant and it is a simple assignment and its inputs are
7040      invariant then it can remain in place, unvectorized.  The original last
7041      scalar value that it computes will be used.  */
7042   if (!STMT_VINFO_RELEVANT_P (stmt_info))
7043     {
7044       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7045       if (dump_enabled_p ())
7046         dump_printf_loc (MSG_NOTE, vect_location,
7047                          "statement is simple and uses invariant.  Leaving in "
7048                          "place.\n");
7049       return true;
7050     }
7051
7052   if (slp_node)
7053     ncopies = 1;
7054   else
7055     ncopies = vect_get_num_copies (loop_vinfo, vectype);
7056
7057   if (!vec_stmt)
7058     /* No transformation required.  */
7059     return true;
7060
7061   /* If stmt has a related stmt, then use that for getting the lhs.  */
7062   if (is_pattern_stmt_p (stmt_info))
7063     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7064
7065   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7066         : gimple_get_lhs (stmt);
7067   lhs_type = TREE_TYPE (lhs);
7068
       /* Width in bits of one vector element.  For boolean vectors the
          precision of the element type is used instead of its size.  */
7069   bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7070              ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7071              : TYPE_SIZE (TREE_TYPE (vectype)));
7072   vec_bitsize = TYPE_SIZE (vectype);
7073
7074   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
7075   tree vec_lhs, bitstart;
7076   if (slp_node)
7077     {
7078       gcc_assert (slp_index >= 0);
7079
7080       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7081       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7082
7083       /* Get the last occurrence of the scalar index from the concatenation of
7084          all the slp vectors. Calculate which slp vector it is and the index
7085          within.  */
7086       int pos = (num_vec * nunits) - num_scalar + slp_index;
7087       int vec_entry = pos / nunits;
7088       int vec_index = pos % nunits;
7089
7090       /* Get the correct slp vectorized stmt.  It may be a PHI, e.g. for an
              SLP induction, in which case the result has to be read with
              gimple_phi_result, exactly as done for the scalar stmt above.  */
7091       gimple *slp_vec_stmt = SLP_TREE_VEC_STMTS (slp_node)[vec_entry];
           if (is_a <gphi *> (slp_vec_stmt))
             vec_lhs = gimple_phi_result (slp_vec_stmt);
           else
             vec_lhs = gimple_get_lhs (slp_vec_stmt);
7092
7093       /* Get entry to use.  */
7094       bitstart = bitsize_int (vec_index);
7095       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7096     }
7097   else
7098     {
7099       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7100       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7101
7102       /* For multiple copies, get the last copy.  */
7103       for (int i = 1; i < ncopies; ++i)
7104         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7105                                                   vec_lhs);
7106
7107       /* Get the last lane in the vector.  */
7108       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7109     }
7110
7111   /* Create a new vectorized stmt for the uses of STMT and insert outside the
7112      loop.  */
7113   gimple_seq stmts = NULL;
7114   tree bftype = TREE_TYPE (vectype);
7115   if (VECTOR_BOOLEAN_TYPE_P (vectype))
7116     bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7117   tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7118   new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7119                                    true, NULL_TREE);
7120   if (stmts)
7121     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7122
7123   /* Replace use of lhs with newly computed result.  If the use stmt is a
7124      single arg PHI, just replace all uses of PHI result.  It's necessary
7125      because lcssa PHI defining lhs may be before newly inserted stmt.  */
7126   use_operand_p use_p;
7127   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7128     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7129         && !is_gimple_debug (use_stmt))
7130     {
7131       if (gimple_code (use_stmt) == GIMPLE_PHI
7132           && gimple_phi_num_args (use_stmt) == 1)
7133         {
7134           replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7135         }
7136       else
7137         {
7138           FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7139             SET_USE (use_p, new_tree);
7140         }
7141       update_stmt (use_stmt);
7142     }
7143
7144   return true;
7145 }
7146
7147 /* Reset the value of each debug bind stmt located outside LOOP that
7148    uses an SSA name defined by STMT.  STMT may be a PHI or an ordinary
7149    stmt.  Non-debug uses and debug stmts inside LOOP are left alone;
7150    any other kind of debug stmt using such a name outside LOOP is not
7151    expected.  */
7152
7153 static void
7154 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7155 {
7156   ssa_op_iter def_iter;
7157   imm_use_iterator use_iter;
7158   def_operand_p def;
7159   gimple *use_stmt;
7160
7161   FOR_EACH_PHI_OR_STMT_DEF (def, stmt, def_iter, SSA_OP_DEF)
7162     {
7163       FOR_EACH_IMM_USE_STMT (use_stmt, use_iter, DEF_FROM_PTR (def))
7164         {
7165           /* Only debug stmts placed outside LOOP are of interest.  */
7166           if (!is_gimple_debug (use_stmt)
7167               || flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
7168             continue;
7169
7170           /* Debug binds are the only debug stmts handled here.  */
7171           if (!gimple_debug_bind_p (use_stmt))
7172             gcc_unreachable ();
7173
7174           if (dump_enabled_p ())
7175             dump_printf_loc (MSG_NOTE, vect_location,
7176                              "killing debug use\n");
7177
7178           /* Drop the bound value and let update_stmt process the
7179              modified stmt.  */
7180           gimple_debug_bind_reset_value (use_stmt);
7181           update_stmt (use_stmt);
7182         }
7183     }
7184 }
7185
7186 /* Return true if LOOP_VINFO_NITERS, i.e. LOOP_VINFO_NITERSM1 + 1, of the
7187    loop described by LOOP_VINFO is known to be computed without overflow,
7188    and false when that cannot be shown.  */
7189
7190 static bool
7191 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7192 {
7193   /* With a compile-time constant count, NITERSM1 < NITERS shows that the
7194      increment did not wrap.  */
7195   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7196     {
7197       tree niters = LOOP_VINFO_NITERS (loop_vinfo);
7198       tree nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7199
7200       gcc_assert (TREE_CODE (niters) == INTEGER_CST);
7201       gcc_assert (TREE_CODE (nitersm1) == INTEGER_CST);
7202       if (wi::to_widest (nitersm1) < wi::to_widest (niters))
7203         return true;
7204     }
7205
7206   /* Otherwise use the upper bound reported by get_max_loop_iterations:
7207      a bound strictly below the maximum value of the niters type is
7208      accepted.  */
7209   widest_int max_iters;
7210   if (!get_max_loop_iterations (LOOP_VINFO_LOOP (loop_vinfo), &max_iters))
7211     return false;
7212
7213   tree niters_type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7214   widest_int type_max
7215     = widest_int::from (wi::max_value (niters_type), TYPE_SIGN (niters_type));
7216   return max_iters < type_max;
7217 }
7218
7219 /* Scale profiling counters by estimation for LOOP which is vectorized
7220    by factor VF.

        The iteration estimate after vectorization is obtained from
        niter_for_unrolled_loop.  The header and preheader counts are used as
        the basis of the scaling; the corresponding frequencies are used
        instead when neither count compares greater than zero.  When the
        resulting header value is positive the loop is rescaled through
        scale_loop_frequencies.  Afterwards the count and probability of the
        single exit edge and of the single edge entering the latch are
        rewritten from the new estimate.  */
7221
7222 static void
7223 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7224 {
7225   edge preheader = loop_preheader_edge (loop);
7226   /* Reduce loop iterations by the vectorization factor.  */
7227   gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7228   profile_count freq_h = loop->header->count, freq_e = preheader->count;
7229
7230   /* Use frequency only if counts are zero.  */
       /* NOTE(review): the tests are spelled !(x > 0) rather than x <= 0;
          presumably the two differ for profile_count values that are not
          initialized -- confirm before simplifying.  */
7231   if (!(freq_h > 0) && !(freq_e > 0))
7232     {
7233       freq_h = profile_count::from_gcov_type (loop->header->frequency);
7234       freq_e = profile_count::from_gcov_type (EDGE_FREQUENCY (preheader));
7235     }
7236   if (freq_h > 0)
7237     {
7238       profile_probability p;
7239
7240       /* Avoid dropping loop body profile counter to 0 because of zero count
7241          in loop's preheader.  */
7242       if (!(freq_e > profile_count::from_gcov_type (1)))
7243        freq_e = profile_count::from_gcov_type (1);
           /* P is the entry value scaled by NEW_EST_NITER + 1, expressed as a
              probability relative to the header value; it is the factor
              handed to scale_loop_frequencies.  */
7244       p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7245       scale_loop_frequencies (loop, p);
7246     }
7247
       /* EXIT_BB is the single predecessor of the latch.
          NOTE(review): presumably this is the block holding the loop exit
          test, i.e. the source of EXIT_E -- confirm.  */
7248   basic_block exit_bb = single_pred (loop->latch);
7249   edge exit_e = single_exit (loop);
       /* The exit edge receives the count of the preheader edge and the
          probability 1 / (NEW_EST_NITER + 1).  */
7250   exit_e->count = loop_preheader_edge (loop)->count;
7251   exit_e->probability = profile_probability::always ()
7252                                  .apply_scale (1, new_est_niter + 1);
7253
       /* The edge entering the latch gets the inverted exit probability and
          whatever count of EXIT_BB is not taken by the exit edge.  PROB keeps
          the previous probability so that the latch block can be rescaled by
          the ratio of new to old probability when both are initialized.  */
7254   edge exit_l = single_pred_edge (loop->latch);
7255   profile_probability prob = exit_l->probability;
7256   exit_l->probability = exit_e->probability.invert ();
7257   exit_l->count = exit_bb->count - exit_e->count;
7258   if (prob.initialized_p () && exit_l->probability.initialized_p ())
7259     scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7260 }
7261
7262 /* Function vect_transform_loop.
7263
7264    The analysis phase has determined that the loop is vectorizable.
7265    Vectorize the loop - created vectorized stmts to replace the scalar
7266    stmts in the loop, and update the loop exit condition.
7267    Returns scalar epilogue loop if any.  */
7268
7269 struct loop *
7270 vect_transform_loop (loop_vec_info loop_vinfo)
7271 {
7272   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7273   struct loop *epilogue = NULL;
7274   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7275   int nbbs = loop->num_nodes;
7276   int i;
7277   tree niters_vector = NULL;
7278   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7279   bool grouped_store;
7280   bool slp_scheduled = false;
7281   gimple *stmt, *pattern_stmt;
7282   gimple_seq pattern_def_seq = NULL;
7283   gimple_stmt_iterator pattern_def_si = gsi_none ();
7284   bool transform_pattern_stmt = false;
7285   bool check_profitability = false;
7286   int th;
7287
7288   if (dump_enabled_p ())
7289     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
7290
7291   /* Use the more conservative vectorization threshold.  If the number
7292      of iterations is constant assume the cost check has been performed
7293      by our caller.  If the threshold makes all loops profitable that
7294      run at least the vectorization factor number of times checking
7295      is pointless, too.  */
7296   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7297   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7298       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7299     {
7300       if (dump_enabled_p ())
7301         dump_printf_loc (MSG_NOTE, vect_location,
7302                          "Profitability threshold is %d loop iterations.\n",
7303                          th);
7304       check_profitability = true;
7305     }
7306
7307   /* Make sure there exists a single-predecessor exit bb.  Do this before
7308      versioning.   */
7309   edge e = single_exit (loop);
7310   if (! single_pred_p (e->dest))
7311     {
7312       split_loop_exit_edge (e);
7313       if (dump_enabled_p ())
7314         dump_printf (MSG_NOTE, "split exit edge\n");
7315     }
7316
7317   /* Version the loop first, if required, so the profitability check
7318      comes first.  */
7319
7320   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7321     {
7322       vect_loop_versioning (loop_vinfo, th, check_profitability);
7323       check_profitability = false;
7324     }
7325
7326   /* Make sure there exists a single-predecessor exit bb also on the
7327      scalar loop copy.  Do this after versioning but before peeling
7328      so CFG structure is fine for both scalar and if-converted loop
7329      to make slpeel_duplicate_current_defs_from_edges face matched
7330      loop closed PHI nodes on the exit.  */
7331   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7332     {
7333       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7334       if (! single_pred_p (e->dest))
7335         {
7336           split_loop_exit_edge (e);
7337           if (dump_enabled_p ())
7338             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7339         }
7340     }
7341
7342   tree niters = vect_build_loop_niters (loop_vinfo);
7343   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7344   tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7345   bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7346   epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
7347                               check_profitability, niters_no_overflow);
7348   if (niters_vector == NULL_TREE)
7349     {
7350       if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7351         niters_vector
7352           = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7353                            LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
7354       else
7355         vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7356                                      niters_no_overflow);
7357     }
7358
7359   /* 1) Make sure the loop header has exactly two entries
7360      2) Make sure we have a preheader basic block.  */
7361
7362   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7363
7364   split_edge (loop_preheader_edge (loop));
7365
7366   /* FORNOW: the vectorizer supports only loops which body consist
7367      of one basic block (header + empty latch). When the vectorizer will
7368      support more involved loop forms, the order by which the BBs are
7369      traversed need to be reconsidered.  */
7370
7371   for (i = 0; i < nbbs; i++)
7372     {
7373       basic_block bb = bbs[i];
7374       stmt_vec_info stmt_info;
7375
7376       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7377            gsi_next (&si))
7378         {
7379           gphi *phi = si.phi ();
7380           if (dump_enabled_p ())
7381             {
7382               dump_printf_loc (MSG_NOTE, vect_location,
7383                                "------>vectorizing phi: ");
7384               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7385             }
7386           stmt_info = vinfo_for_stmt (phi);
7387           if (!stmt_info)
7388             continue;
7389
7390           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7391             vect_loop_kill_debug_uses (loop, phi);
7392
7393           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7394               && !STMT_VINFO_LIVE_P (stmt_info))
7395             continue;
7396
7397           if (STMT_VINFO_VECTYPE (stmt_info)
7398               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7399                   != (unsigned HOST_WIDE_INT) vf)
7400               && dump_enabled_p ())
7401             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7402
7403           if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7404                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7405                || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7406               && ! PURE_SLP_STMT (stmt_info))
7407             {
7408               if (dump_enabled_p ())
7409                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7410               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7411             }
7412         }
7413
7414       pattern_stmt = NULL;
7415       for (gimple_stmt_iterator si = gsi_start_bb (bb);
7416            !gsi_end_p (si) || transform_pattern_stmt;)
7417         {
7418           bool is_store;
7419
7420           if (transform_pattern_stmt)
7421             stmt = pattern_stmt;
7422           else
7423             {
7424               stmt = gsi_stmt (si);
7425               /* During vectorization remove existing clobber stmts.  */
7426               if (gimple_clobber_p (stmt))
7427                 {
7428                   unlink_stmt_vdef (stmt);
7429                   gsi_remove (&si, true);
7430                   release_defs (stmt);
7431                   continue;
7432                 }
7433             }
7434
7435           if (dump_enabled_p ())
7436             {
7437               dump_printf_loc (MSG_NOTE, vect_location,
7438                                "------>vectorizing statement: ");
7439               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7440             }
7441
7442           stmt_info = vinfo_for_stmt (stmt);
7443
7444           /* vector stmts created in the outer-loop during vectorization of
7445              stmts in an inner-loop may not have a stmt_info, and do not
7446              need to be vectorized.  */
7447           if (!stmt_info)
7448             {
7449               gsi_next (&si);
7450               continue;
7451             }
7452
7453           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7454             vect_loop_kill_debug_uses (loop, stmt);
7455
7456           if (!STMT_VINFO_RELEVANT_P (stmt_info)
7457               && !STMT_VINFO_LIVE_P (stmt_info))
7458             {
7459               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7460                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7461                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7462                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7463                 {
7464                   stmt = pattern_stmt;
7465                   stmt_info = vinfo_for_stmt (stmt);
7466                 }
7467               else
7468                 {
7469                   gsi_next (&si);
7470                   continue;
7471                 }
7472             }
7473           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7474                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7475                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7476                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7477             transform_pattern_stmt = true;
7478
7479           /* If pattern statement has def stmts, vectorize them too.  */
7480           if (is_pattern_stmt_p (stmt_info))
7481             {
7482               if (pattern_def_seq == NULL)
7483                 {
7484                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7485                   pattern_def_si = gsi_start (pattern_def_seq);
7486                 }
7487               else if (!gsi_end_p (pattern_def_si))
7488                 gsi_next (&pattern_def_si);
7489               if (pattern_def_seq != NULL)
7490                 {
7491                   gimple *pattern_def_stmt = NULL;
7492                   stmt_vec_info pattern_def_stmt_info = NULL;
7493
7494                   while (!gsi_end_p (pattern_def_si))
7495                     {
7496                       pattern_def_stmt = gsi_stmt (pattern_def_si);
7497                       pattern_def_stmt_info
7498                         = vinfo_for_stmt (pattern_def_stmt);
7499                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7500                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7501                         break;
7502                       gsi_next (&pattern_def_si);
7503                     }
7504
7505                   if (!gsi_end_p (pattern_def_si))
7506                     {
7507                       if (dump_enabled_p ())
7508                         {
7509                           dump_printf_loc (MSG_NOTE, vect_location,
7510                                            "==> vectorizing pattern def "
7511                                            "stmt: ");
7512                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7513                                             pattern_def_stmt, 0);
7514                         }
7515
7516                       stmt = pattern_def_stmt;
7517                       stmt_info = pattern_def_stmt_info;
7518                     }
7519                   else
7520                     {
7521                       pattern_def_si = gsi_none ();
7522                       transform_pattern_stmt = false;
7523                     }
7524                 }
7525               else
7526                 transform_pattern_stmt = false;
7527             }
7528
7529           if (STMT_VINFO_VECTYPE (stmt_info))
7530             {
7531               unsigned int nunits
7532                 = (unsigned int)
7533                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7534               if (!STMT_SLP_TYPE (stmt_info)
7535                   && nunits != (unsigned int) vf
7536                   && dump_enabled_p ())
7537                   /* For SLP VF is set according to unrolling factor, and not
7538                      to vector size, hence for SLP this print is not valid.  */
7539                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7540             }
7541
7542           /* SLP. Schedule all the SLP instances when the first SLP stmt is
7543              reached.  */
7544           if (STMT_SLP_TYPE (stmt_info))
7545             {
7546               if (!slp_scheduled)
7547                 {
7548                   slp_scheduled = true;
7549
7550                   if (dump_enabled_p ())
7551                     dump_printf_loc (MSG_NOTE, vect_location,
7552                                      "=== scheduling SLP instances ===\n");
7553
7554                   vect_schedule_slp (loop_vinfo);
7555                 }
7556
7557               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
7558               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7559                 {
7560                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7561                     {
7562                       pattern_def_seq = NULL;
7563                       gsi_next (&si);
7564                     }
7565                   continue;
7566                 }
7567             }
7568
7569           /* -------- vectorize statement ------------ */
7570           if (dump_enabled_p ())
7571             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7572
7573           grouped_store = false;
7574           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7575           if (is_store)
7576             {
7577               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7578                 {
7579                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7580                      interleaving chain was completed - free all the stores in
7581                      the chain.  */
7582                   gsi_next (&si);
7583                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7584                 }
7585               else
7586                 {
7587                   /* Free the attached stmt_vec_info and remove the stmt.  */
7588                   gimple *store = gsi_stmt (si);
7589                   free_stmt_vec_info (store);
7590                   unlink_stmt_vdef (store);
7591                   gsi_remove (&si, true);
7592                   release_defs (store);
7593                 }
7594
7595               /* Stores can only appear at the end of pattern statements.  */
7596               gcc_assert (!transform_pattern_stmt);
7597               pattern_def_seq = NULL;
7598             }
7599           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7600             {
7601               pattern_def_seq = NULL;
7602               gsi_next (&si);
7603             }
7604         }                       /* stmts in BB */
7605     }                           /* BBs in loop */
7606
7607   slpeel_make_loop_iterate_ntimes (loop, niters_vector);
7608
7609   scale_profile_for_vect_loop (loop, vf);
7610
7611   /* The minimum number of iterations performed by the epilogue.  This
7612      is 1 when peeling for gaps because we always need a final scalar
7613      iteration.  */
7614   int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7615   /* +1 to convert latch counts to loop iteration counts,
7616      -min_epilogue_iters to remove iterations that cannot be performed
7617        by the vector code.  */
7618   int bias = 1 - min_epilogue_iters;
7619   /* In these calculations the "- 1" converts loop iteration counts
7620      back to latch counts.  */
7621   if (loop->any_upper_bound)
7622     loop->nb_iterations_upper_bound
7623       = wi::udiv_floor (loop->nb_iterations_upper_bound + bias, vf) - 1;
7624   if (loop->any_likely_upper_bound)
7625     loop->nb_iterations_likely_upper_bound
7626       = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, vf) - 1;
7627   if (loop->any_estimate)
7628     loop->nb_iterations_estimate
7629       = wi::udiv_floor (loop->nb_iterations_estimate + bias, vf) - 1;
7630
7631   if (dump_enabled_p ())
7632     {
7633       if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7634         {
7635           dump_printf_loc (MSG_NOTE, vect_location,
7636                            "LOOP VECTORIZED\n");
7637           if (loop->inner)
7638             dump_printf_loc (MSG_NOTE, vect_location,
7639                              "OUTER LOOP VECTORIZED\n");
7640           dump_printf (MSG_NOTE, "\n");
7641         }
7642       else
7643         dump_printf_loc (MSG_NOTE, vect_location,
7644                          "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
7645                          current_vector_size);
7646     }
7647
7648   /* Free SLP instances here because otherwise stmt reference counting
7649      won't work.  */
7650   slp_instance instance;
7651   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7652     vect_free_slp_instance (instance);
7653   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7654   /* Clear-up safelen field since its value is invalid after vectorization
7655      since vectorized loop can have loop-carried dependencies.  */
7656   loop->safelen = 0;
7657
7658   /* Don't vectorize epilogue for epilogue.  */
7659   if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7660     epilogue = NULL;
7661
7662   if (epilogue)
7663     {
7664         unsigned int vector_sizes
7665           = targetm.vectorize.autovectorize_vector_sizes ();
7666         vector_sizes &= current_vector_size - 1;
7667
7668         if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7669           epilogue = NULL;
7670         else if (!vector_sizes)
7671           epilogue = NULL;
7672         else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7673                  && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
7674           {
7675             int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
7676             int ratio = current_vector_size / smallest_vec_size;
7677             int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
7678               - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
7679             eiters = eiters % vf;
7680
7681             epilogue->nb_iterations_upper_bound = eiters - 1;
7682
7683             if (eiters < vf / ratio)
7684               epilogue = NULL;
7685             }
7686     }
7687
7688   if (epilogue)
7689     {
7690       epilogue->force_vectorize = loop->force_vectorize;
7691       epilogue->safelen = loop->safelen;
7692       epilogue->dont_vectorize = false;
7693
7694       /* We may need to if-convert epilogue to vectorize it.  */
7695       if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7696         tree_if_conversion (epilogue);
7697     }
7698
7699   return epilogue;
7700 }
7701
7702 /* The code below is trying to perform simple optimization - revert
7703    if-conversion for masked stores, i.e. if the mask of a store is zero
7704    do not perform it and all stored value producers also if possible.
7705    For example,
7706      for (i=0; i<n; i++)
7707        if (c[i])
7708         {
7709           p1[i] += 1;
7710           p2[i] = p3[i] +2;
7711         }
7712    this transformation will produce the following semi-hammock:
7713
7714    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7715      {
7716        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7717        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7718        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7719        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7720        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7721        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7722      }
7723 */
7724
7725 void
7726 optimize_mask_stores (struct loop *loop)
7727 {
7728   basic_block *bbs = get_loop_body (loop);
7729   unsigned nbbs = loop->num_nodes;
7730   unsigned i;
7731   basic_block bb;
7732   struct loop *bb_loop;
7733   gimple_stmt_iterator gsi;
7734   gimple *stmt;
7735   auto_vec<gimple *> worklist;
7736
7737   vect_location = find_loop_location (loop);
7738   /* Pick up all masked stores in loop if any.  */
7739   for (i = 0; i < nbbs; i++)
7740     {
7741       bb = bbs[i];
7742       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7743            gsi_next (&gsi))
7744         {
7745           stmt = gsi_stmt (gsi);
7746           if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7747             worklist.safe_push (stmt);
7748         }
7749     }
7750
7751   free (bbs);
7752   if (worklist.is_empty ())
7753     return;
7754
7755   /* Loop has masked stores.  */
7756   while (!worklist.is_empty ())
7757     {
7758       gimple *last, *last_store;
7759       edge e, efalse;
7760       tree mask;
7761       basic_block store_bb, join_bb;
7762       gimple_stmt_iterator gsi_to;
7763       tree vdef, new_vdef;
7764       gphi *phi;
7765       tree vectype;
7766       tree zero;
7767
7768       last = worklist.pop ();
7769       mask = gimple_call_arg (last, 2);
7770       bb = gimple_bb (last);
7771       /* Create then_bb and if-then structure in CFG, then_bb belongs to
7772          the same loop as if_bb.  It could be different to LOOP when two
7773          level loop-nest is vectorized and mask_store belongs to the inner
7774          one.  */
7775       e = split_block (bb, last);
7776       bb_loop = bb->loop_father;
7777       gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7778       join_bb = e->dest;
7779       store_bb = create_empty_bb (bb);
7780       add_bb_to_loop (store_bb, bb_loop);
7781       e->flags = EDGE_TRUE_VALUE;
7782       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7783       /* Put STORE_BB to likely part.  */
7784       efalse->probability = profile_probability::unlikely ();
7785       store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
7786       make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7787       if (dom_info_available_p (CDI_DOMINATORS))
7788         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7789       if (dump_enabled_p ())
7790         dump_printf_loc (MSG_NOTE, vect_location,
7791                          "Create new block %d to sink mask stores.",
7792                          store_bb->index);
7793       /* Create vector comparison with boolean result.  */
7794       vectype = TREE_TYPE (mask);
7795       zero = build_zero_cst (vectype);
7796       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7797       gsi = gsi_last_bb (bb);
7798       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7799       /* Create new PHI node for vdef of the last masked store:
7800          .MEM_2 = VDEF <.MEM_1>
7801          will be converted to
7802          .MEM.3 = VDEF <.MEM_1>
7803          and new PHI node will be created in join bb
7804          .MEM_2 = PHI <.MEM_1, .MEM_3>
7805       */
7806       vdef = gimple_vdef (last);
7807       new_vdef = make_ssa_name (gimple_vop (cfun), last);
7808       gimple_set_vdef (last, new_vdef);
7809       phi = create_phi_node (vdef, join_bb);
7810       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7811
7812       /* Put all masked stores with the same mask to STORE_BB if possible.  */
7813       while (true)
7814         {
7815           gimple_stmt_iterator gsi_from;
7816           gimple *stmt1 = NULL;
7817
7818           /* Move masked store to STORE_BB.  */
7819           last_store = last;
7820           gsi = gsi_for_stmt (last);
7821           gsi_from = gsi;
7822           /* Shift GSI to the previous stmt for further traversal.  */
7823           gsi_prev (&gsi);
7824           gsi_to = gsi_start_bb (store_bb);
7825           gsi_move_before (&gsi_from, &gsi_to);
7826           /* Setup GSI_TO to the non-empty block start.  */
7827           gsi_to = gsi_start_bb (store_bb);
7828           if (dump_enabled_p ())
7829             {
7830               dump_printf_loc (MSG_NOTE, vect_location,
7831                                "Move stmt to created bb\n");
7832               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7833             }
7834           /* Move all stored value producers if possible.  */
7835           while (!gsi_end_p (gsi))
7836             {
7837               tree lhs;
7838               imm_use_iterator imm_iter;
7839               use_operand_p use_p;
7840               bool res;
7841
7842               /* Skip debug statements.  */
7843               if (is_gimple_debug (gsi_stmt (gsi)))
7844                 {
7845                   gsi_prev (&gsi);
7846                   continue;
7847                 }
7848               stmt1 = gsi_stmt (gsi);
7849               /* Do not consider statements writing to memory or having
7850                  volatile operand.  */
7851               if (gimple_vdef (stmt1)
7852                   || gimple_has_volatile_ops (stmt1))
7853                 break;
7854               gsi_from = gsi;
7855               gsi_prev (&gsi);
7856               lhs = gimple_get_lhs (stmt1);
7857               if (!lhs)
7858                 break;
7859
7860               /* LHS of vectorized stmt must be SSA_NAME.  */
7861               if (TREE_CODE (lhs) != SSA_NAME)
7862                 break;
7863
7864               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7865                 {
7866                   /* Remove dead scalar statement.  */
7867                   if (has_zero_uses (lhs))
7868                     {
7869                       gsi_remove (&gsi_from, true);
7870                       continue;
7871                     }
7872                 }
7873
7874               /* Check that LHS does not have uses outside of STORE_BB.  */
7875               res = true;
7876               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7877                 {
7878                   gimple *use_stmt;
7879                   use_stmt = USE_STMT (use_p);
7880                   if (is_gimple_debug (use_stmt))
7881                     continue;
7882                   if (gimple_bb (use_stmt) != store_bb)
7883                     {
7884                       res = false;
7885                       break;
7886                     }
7887                 }
7888               if (!res)
7889                 break;
7890
7891               if (gimple_vuse (stmt1)
7892                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
7893                 break;
7894
7895               /* Can move STMT1 to STORE_BB.  */
7896               if (dump_enabled_p ())
7897                 {
7898                   dump_printf_loc (MSG_NOTE, vect_location,
7899                                    "Move stmt to created bb\n");
7900                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7901                 }
7902               gsi_move_before (&gsi_from, &gsi_to);
7903               /* Shift GSI_TO for further insertion.  */
7904               gsi_prev (&gsi_to);
7905             }
7906           /* Put other masked stores with the same mask to STORE_BB.  */
7907           if (worklist.is_empty ()
7908               || gimple_call_arg (worklist.last (), 2) != mask
7909               || worklist.last () != stmt1)
7910             break;
7911           last = worklist.pop ();
7912         }
7913       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
7914     }
7915 }