gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2017 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "backend.h"
26 #include "target.h"
27 #include "rtl.h"
28 #include "tree.h"
29 #include "gimple.h"
30 #include "cfghooks.h"
31 #include "tree-pass.h"
32 #include "ssa.h"
33 #include "optabs-tree.h"
34 #include "diagnostic-core.h"
35 #include "fold-const.h"
36 #include "stor-layout.h"
37 #include "cfganal.h"
38 #include "gimplify.h"
39 #include "gimple-iterator.h"
40 #include "gimplify-me.h"
41 #include "tree-ssa-loop-ivopts.h"
42 #include "tree-ssa-loop-manip.h"
43 #include "tree-ssa-loop-niter.h"
44 #include "tree-ssa-loop.h"
45 #include "cfgloop.h"
46 #include "params.h"
47 #include "tree-scalar-evolution.h"
48 #include "tree-vectorizer.h"
49 #include "gimple-fold.h"
50 #include "cgraph.h"
51 #include "tree-cfg.h"
52 #include "tree-if-conv.h"
54 /* Loop Vectorization Pass.
56 This pass tries to vectorize loops.
58 For example, the vectorizer transforms the following simple loop:
60 short a[N]; short b[N]; short c[N]; int i;
62 for (i=0; i<N; i++){
63 a[i] = b[i] + c[i];
 66    as if it were manually vectorized by rewriting the source code into:
68 typedef int __attribute__((mode(V8HI))) v8hi;
69 short a[N]; short b[N]; short c[N]; int i;
70 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
71 v8hi va, vb, vc;
73 for (i=0; i<N/8; i++){
74 vb = pb[i];
75 vc = pc[i];
76 va = vb + vc;
77 pa[i] = va;
80 The main entry to this pass is vectorize_loops(), in which
81 the vectorizer applies a set of analyses on a given set of loops,
82 followed by the actual vectorization transformation for the loops that
 83    have successfully passed the analysis phase.
84 Throughout this pass we make a distinction between two types of
85 data: scalars (which are represented by SSA_NAMES), and memory references
86 ("data-refs"). These two types of data require different handling both
87 during analysis and transformation. The types of data-refs that the
 88    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
89 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
90 accesses are required to have a simple (consecutive) access pattern.
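     (A hedged illustration of the supported forms: an access such as a[i],
     where 'a' is an array DECL, is the ARRAY_REF case, while p[i] through a
     pointer 'p' is the INDIRECT_REF-style case; in both, the address must
     advance consecutively with 'i', so an access like a[2*i] would not
     qualify.)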
92 Analysis phase:
93 ===============
94 The driver for the analysis phase is vect_analyze_loop().
95 It applies a set of analyses, some of which rely on the scalar evolution
96 analyzer (scev) developed by Sebastian Pop.
98 During the analysis phase the vectorizer records some information
99 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
100 loop, as well as general information about the loop as a whole, which is
101 recorded in a "loop_vec_info" struct attached to each loop.
103 Transformation phase:
104 =====================
105 The loop transformation phase scans all the stmts in the loop, and
106 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
107 the loop that needs to be vectorized. It inserts the vector code sequence
108 just before the scalar stmt S, and records a pointer to the vector code
109 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
110 attached to S). This pointer will be used for the vectorization of following
111 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
112 otherwise, we rely on dead code elimination for removing it.
114 For example, say stmt S1 was vectorized into stmt VS1:
116 VS1: vb = px[i];
117 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
118 S2: a = b;
120 To vectorize stmt S2, the vectorizer first finds the stmt that defines
121 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
122 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
123 resulting sequence would be:
125 VS1: vb = px[i];
126 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
127 VS2: va = vb;
128 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 130    Operands that are not SSA_NAMEs are data-refs that appear in
131 load/store operations (like 'x[i]' in S1), and are handled differently.
133 Target modeling:
134 =================
 135    Currently the only target-specific information that is used is the
 136    size of the vector (in bytes): "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 137    Targets that can support different vector sizes will, for now, need
 138    to specify a single value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
139 flexibility will be added in the future.
 141    Since we only vectorize operations whose vector form can be
142 expressed using existing tree codes, to verify that an operation is
143 supported, the vectorizer checks the relevant optab at the relevant
 144    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
145 the value found is CODE_FOR_nothing, then there's no target support, and
146 we can't vectorize the stmt.
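     As a rough sketch (reusing only the example optab and mode mentioned
     above, not a general recipe), the support check amounts to:

       if (optab_handler (add_optab, V8HImode) == CODE_FOR_nothing)
         return false;    (no target instruction for a V8HI addition)

     and any other handler value means the target can carry out the
     vectorized operation.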
148 For additional information on this project see:
149 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
152 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
154 /* Function vect_determine_vectorization_factor
156 Determine the vectorization factor (VF). VF is the number of data elements
157 that are operated upon in parallel in a single iteration of the vectorized
 158    loop.  For example, when vectorizing a loop that operates on 4-byte elements,
 159    on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
160 elements can fit in a single vector register.
162 We currently support vectorization of loops in which all types operated upon
163 are of the same size. Therefore this function currently sets VF according to
164 the size of the types operated upon, and fails if there are multiple sizes
165 in the loop.
167 VF is also the factor by which the loop iterations are strip-mined, e.g.:
168 original loop:
169 for (i=0; i<N; i++){
170 a[i] = b[i] + c[i];
173 vectorized loop:
174 for (i=0; i<N; i+=VF){
175 a[i:VF] = b[i:VF] + c[i:VF];
179 static bool
180 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
182 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
183 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
184 unsigned nbbs = loop->num_nodes;
185 unsigned int vectorization_factor = 0;
186 tree scalar_type = NULL_TREE;
187 gphi *phi;
188 tree vectype;
189 unsigned int nunits;
190 stmt_vec_info stmt_info;
191 unsigned i;
192 HOST_WIDE_INT dummy;
193 gimple *stmt, *pattern_stmt = NULL;
194 gimple_seq pattern_def_seq = NULL;
195 gimple_stmt_iterator pattern_def_si = gsi_none ();
196 bool analyze_pattern_stmt = false;
197 bool bool_result;
198 auto_vec<stmt_vec_info> mask_producers;
200 if (dump_enabled_p ())
201 dump_printf_loc (MSG_NOTE, vect_location,
202 "=== vect_determine_vectorization_factor ===\n");
204 for (i = 0; i < nbbs; i++)
206 basic_block bb = bbs[i];
208 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
209 gsi_next (&si))
211 phi = si.phi ();
212 stmt_info = vinfo_for_stmt (phi);
213 if (dump_enabled_p ())
215 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
216 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
219 gcc_assert (stmt_info);
221 if (STMT_VINFO_RELEVANT_P (stmt_info)
222 || STMT_VINFO_LIVE_P (stmt_info))
224 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
225 scalar_type = TREE_TYPE (PHI_RESULT (phi));
227 if (dump_enabled_p ())
229 dump_printf_loc (MSG_NOTE, vect_location,
230 "get vectype for scalar type: ");
231 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
232 dump_printf (MSG_NOTE, "\n");
235 vectype = get_vectype_for_scalar_type (scalar_type);
236 if (!vectype)
238 if (dump_enabled_p ())
240 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
241 "not vectorized: unsupported "
242 "data-type ");
243 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
244 scalar_type);
245 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
247 return false;
249 STMT_VINFO_VECTYPE (stmt_info) = vectype;
251 if (dump_enabled_p ())
253 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
254 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
255 dump_printf (MSG_NOTE, "\n");
258 nunits = TYPE_VECTOR_SUBPARTS (vectype);
259 if (dump_enabled_p ())
260 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
261 nunits);
263 if (!vectorization_factor
264 || (nunits > vectorization_factor))
265 vectorization_factor = nunits;
269 for (gimple_stmt_iterator si = gsi_start_bb (bb);
270 !gsi_end_p (si) || analyze_pattern_stmt;)
272 tree vf_vectype;
274 if (analyze_pattern_stmt)
275 stmt = pattern_stmt;
276 else
277 stmt = gsi_stmt (si);
279 stmt_info = vinfo_for_stmt (stmt);
281 if (dump_enabled_p ())
283 dump_printf_loc (MSG_NOTE, vect_location,
284 "==> examining statement: ");
285 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
288 gcc_assert (stmt_info);
290 /* Skip stmts which do not need to be vectorized. */
291 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
292 && !STMT_VINFO_LIVE_P (stmt_info))
293 || gimple_clobber_p (stmt))
295 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
296 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
297 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
298 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
300 stmt = pattern_stmt;
301 stmt_info = vinfo_for_stmt (pattern_stmt);
302 if (dump_enabled_p ())
304 dump_printf_loc (MSG_NOTE, vect_location,
305 "==> examining pattern statement: ");
306 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
309 else
311 if (dump_enabled_p ())
312 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
313 gsi_next (&si);
314 continue;
317 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
318 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
319 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
320 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
321 analyze_pattern_stmt = true;
323 /* If a pattern statement has def stmts, analyze them too. */
324 if (is_pattern_stmt_p (stmt_info))
326 if (pattern_def_seq == NULL)
328 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
329 pattern_def_si = gsi_start (pattern_def_seq);
331 else if (!gsi_end_p (pattern_def_si))
332 gsi_next (&pattern_def_si);
333 if (pattern_def_seq != NULL)
335 gimple *pattern_def_stmt = NULL;
336 stmt_vec_info pattern_def_stmt_info = NULL;
338 while (!gsi_end_p (pattern_def_si))
340 pattern_def_stmt = gsi_stmt (pattern_def_si);
341 pattern_def_stmt_info
342 = vinfo_for_stmt (pattern_def_stmt);
343 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
344 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
345 break;
346 gsi_next (&pattern_def_si);
349 if (!gsi_end_p (pattern_def_si))
351 if (dump_enabled_p ())
353 dump_printf_loc (MSG_NOTE, vect_location,
354 "==> examining pattern def stmt: ");
355 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
356 pattern_def_stmt, 0);
359 stmt = pattern_def_stmt;
360 stmt_info = pattern_def_stmt_info;
362 else
364 pattern_def_si = gsi_none ();
365 analyze_pattern_stmt = false;
368 else
369 analyze_pattern_stmt = false;
372 if (gimple_get_lhs (stmt) == NULL_TREE
373 /* MASK_STORE has no lhs, but is ok. */
374 && (!is_gimple_call (stmt)
375 || !gimple_call_internal_p (stmt)
376 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
378 if (is_gimple_call (stmt))
380 /* Ignore calls with no lhs. These must be calls to
 381    #pragma omp simd functions, and the vectorization factor
 382    they really need can't be determined until
383 vectorizable_simd_clone_call. */
384 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
386 pattern_def_seq = NULL;
387 gsi_next (&si);
389 continue;
391 if (dump_enabled_p ())
393 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
394 "not vectorized: irregular stmt.");
395 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
398 return false;
401 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
403 if (dump_enabled_p ())
405 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
406 "not vectorized: vector stmt in loop:");
407 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
409 return false;
412 bool_result = false;
414 if (STMT_VINFO_VECTYPE (stmt_info))
 416    /* The only case when a vectype has already been set is for stmts
417 that contain a dataref, or for "pattern-stmts" (stmts
418 generated by the vectorizer to represent/replace a certain
419 idiom). */
420 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
421 || is_pattern_stmt_p (stmt_info)
422 || !gsi_end_p (pattern_def_si));
423 vectype = STMT_VINFO_VECTYPE (stmt_info);
425 else
427 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
428 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
429 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
430 else
431 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
433 /* Bool ops don't participate in vectorization factor
 434    computation.  For comparisons, use the compared types to
 435    compute a factor.  */
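       /* Hedged illustration only: for 'flag = a < b' with 32-bit ints
	  'a' and 'b', the factor is derived from the compared int operands
	  rather than from the boolean result 'flag'.  */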
436 if (VECT_SCALAR_BOOLEAN_TYPE_P (scalar_type)
437 && is_gimple_assign (stmt)
438 && gimple_assign_rhs_code (stmt) != COND_EXPR)
440 if (STMT_VINFO_RELEVANT_P (stmt_info)
441 || STMT_VINFO_LIVE_P (stmt_info))
442 mask_producers.safe_push (stmt_info);
443 bool_result = true;
445 if (TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
446 == tcc_comparison
447 && !VECT_SCALAR_BOOLEAN_TYPE_P
448 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
449 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
450 else
452 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
454 pattern_def_seq = NULL;
455 gsi_next (&si);
457 continue;
461 if (dump_enabled_p ())
463 dump_printf_loc (MSG_NOTE, vect_location,
464 "get vectype for scalar type: ");
465 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
466 dump_printf (MSG_NOTE, "\n");
468 vectype = get_vectype_for_scalar_type (scalar_type);
469 if (!vectype)
471 if (dump_enabled_p ())
473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
474 "not vectorized: unsupported "
475 "data-type ");
476 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
477 scalar_type);
478 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
480 return false;
483 if (!bool_result)
484 STMT_VINFO_VECTYPE (stmt_info) = vectype;
486 if (dump_enabled_p ())
488 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
489 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
490 dump_printf (MSG_NOTE, "\n");
 494    /* Don't try to compute VF from scalar types if the stmt
 495       produces a boolean vector.  Use the result vectype instead.  */
496 if (VECTOR_BOOLEAN_TYPE_P (vectype))
497 vf_vectype = vectype;
498 else
500 /* The vectorization factor is according to the smallest
501 scalar type (or the largest vector size, but we only
502 support one vector size per loop). */
503 if (!bool_result)
504 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
505 &dummy);
506 if (dump_enabled_p ())
508 dump_printf_loc (MSG_NOTE, vect_location,
509 "get vectype for scalar type: ");
510 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
511 dump_printf (MSG_NOTE, "\n");
513 vf_vectype = get_vectype_for_scalar_type (scalar_type);
515 if (!vf_vectype)
517 if (dump_enabled_p ())
519 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
520 "not vectorized: unsupported data-type ");
521 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
522 scalar_type);
523 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
525 return false;
528 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
529 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
531 if (dump_enabled_p ())
533 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
534 "not vectorized: different sized vector "
535 "types in statement, ");
536 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
537 vectype);
538 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
539 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
540 vf_vectype);
541 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
543 return false;
546 if (dump_enabled_p ())
548 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
549 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
550 dump_printf (MSG_NOTE, "\n");
553 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
554 if (dump_enabled_p ())
555 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
556 if (!vectorization_factor
557 || (nunits > vectorization_factor))
558 vectorization_factor = nunits;
560 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
562 pattern_def_seq = NULL;
563 gsi_next (&si);
568 /* TODO: Analyze cost. Decide if worth while to vectorize. */
569 if (dump_enabled_p ())
570 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
571 vectorization_factor);
572 if (vectorization_factor <= 1)
574 if (dump_enabled_p ())
575 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
576 "not vectorized: unsupported data-type\n");
577 return false;
579 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
581 for (i = 0; i < mask_producers.length (); i++)
583 tree mask_type = NULL;
585 stmt = STMT_VINFO_STMT (mask_producers[i]);
587 if (is_gimple_assign (stmt)
588 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
589 && !VECT_SCALAR_BOOLEAN_TYPE_P
590 (TREE_TYPE (gimple_assign_rhs1 (stmt))))
592 scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
593 mask_type = get_mask_type_for_scalar_type (scalar_type);
595 if (!mask_type)
597 if (dump_enabled_p ())
598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
599 "not vectorized: unsupported mask\n");
600 return false;
603 else
605 tree rhs;
606 ssa_op_iter iter;
607 gimple *def_stmt;
608 enum vect_def_type dt;
610 FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
612 if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
613 &def_stmt, &dt, &vectype))
615 if (dump_enabled_p ())
617 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
618 "not vectorized: can't compute mask type "
619 "for statement, ");
620 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
623 return false;
626 /* No vectype probably means external definition.
 627    Allow it in case there is another operand that
 628    allows us to determine the mask type.  */
629 if (!vectype)
630 continue;
632 if (!mask_type)
633 mask_type = vectype;
634 else if (TYPE_VECTOR_SUBPARTS (mask_type)
635 != TYPE_VECTOR_SUBPARTS (vectype))
637 if (dump_enabled_p ())
639 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
640 "not vectorized: different sized masks "
641 "types in statement, ");
642 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
643 mask_type);
644 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
645 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
646 vectype);
647 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
649 return false;
651 else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
652 != VECTOR_BOOLEAN_TYPE_P (vectype))
654 if (dump_enabled_p ())
656 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
657 "not vectorized: mixed mask and "
658 "nonmask vector types in statement, ");
659 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
660 mask_type);
661 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
662 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
663 vectype);
664 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
666 return false;
 670    /* We may compare a boolean value loaded as a vector of integers.
 671       Fix mask_type in such a case.  */
672 if (mask_type
673 && !VECTOR_BOOLEAN_TYPE_P (mask_type)
674 && gimple_code (stmt) == GIMPLE_ASSIGN
675 && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
676 mask_type = build_same_sized_truth_vector_type (mask_type);
 679    /* A missing mask_type should mean a loop-invariant predicate.
680 This is probably a subject for optimization in
681 if-conversion. */
682 if (!mask_type)
684 if (dump_enabled_p ())
686 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
687 "not vectorized: can't compute mask type "
688 "for statement, ");
689 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
692 return false;
695 STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
698 return true;
702 /* Function vect_is_simple_iv_evolution.
 704    FORNOW: A simple evolution of an induction variable in the loop is
705 considered a polynomial evolution. */
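/* Illustrative note (notation only, not new functionality): for the
   induction variable 'i' in 'for (i = 0; i < n; i++)' the scalar evolution
   in loop 1 is the degree-1 chrec {0, +, 1}_1, so *INIT becomes 0 and *STEP
   becomes 1; an evolution whose step is itself a chrec (degree >= 2) is
   rejected below.  */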
707 static bool
708 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
709 tree * step)
711 tree init_expr;
712 tree step_expr;
713 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
714 basic_block bb;
716 /* When there is no evolution in this loop, the evolution function
717 is not "simple". */
718 if (evolution_part == NULL_TREE)
719 return false;
721 /* When the evolution is a polynomial of degree >= 2
722 the evolution function is not "simple". */
723 if (tree_is_chrec (evolution_part))
724 return false;
726 step_expr = evolution_part;
727 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
729 if (dump_enabled_p ())
731 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
732 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
733 dump_printf (MSG_NOTE, ", init: ");
734 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
735 dump_printf (MSG_NOTE, "\n");
738 *init = init_expr;
739 *step = step_expr;
741 if (TREE_CODE (step_expr) != INTEGER_CST
742 && (TREE_CODE (step_expr) != SSA_NAME
743 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
744 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
745 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
746 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
747 || !flag_associative_math)))
748 && (TREE_CODE (step_expr) != REAL_CST
749 || !flag_associative_math))
751 if (dump_enabled_p ())
752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
753 "step unknown.\n");
754 return false;
757 return true;
760 /* Function vect_analyze_scalar_cycles_1.
762 Examine the cross iteration def-use cycles of scalar variables
763 in LOOP. LOOP_VINFO represents the loop that is now being
764 considered for vectorization (can be LOOP, or an outer-loop
765 enclosing LOOP). */
767 static void
768 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
770 basic_block bb = loop->header;
771 tree init, step;
772 auto_vec<gimple *, 64> worklist;
773 gphi_iterator gsi;
774 bool double_reduc;
776 if (dump_enabled_p ())
777 dump_printf_loc (MSG_NOTE, vect_location,
778 "=== vect_analyze_scalar_cycles ===\n");
780 /* First - identify all inductions. Reduction detection assumes that all the
 781    inductions have been identified; therefore, this order must not be
782 changed. */
783 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
785 gphi *phi = gsi.phi ();
786 tree access_fn = NULL;
787 tree def = PHI_RESULT (phi);
788 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
790 if (dump_enabled_p ())
792 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
793 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
796 /* Skip virtual phi's. The data dependences that are associated with
797 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
798 if (virtual_operand_p (def))
799 continue;
801 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
803 /* Analyze the evolution function. */
804 access_fn = analyze_scalar_evolution (loop, def);
805 if (access_fn)
807 STRIP_NOPS (access_fn);
808 if (dump_enabled_p ())
810 dump_printf_loc (MSG_NOTE, vect_location,
811 "Access function of PHI: ");
812 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
813 dump_printf (MSG_NOTE, "\n");
815 STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
816 = initial_condition_in_loop_num (access_fn, loop->num);
817 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
818 = evolution_part_in_loop_num (access_fn, loop->num);
821 if (!access_fn
822 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
823 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
824 && TREE_CODE (step) != INTEGER_CST))
826 worklist.safe_push (phi);
827 continue;
830 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
831 != NULL_TREE);
832 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
834 if (dump_enabled_p ())
835 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
836 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
840 /* Second - identify all reductions and nested cycles. */
841 while (worklist.length () > 0)
843 gimple *phi = worklist.pop ();
844 tree def = PHI_RESULT (phi);
845 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
846 gimple *reduc_stmt;
848 if (dump_enabled_p ())
850 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
851 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
854 gcc_assert (!virtual_operand_p (def)
855 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
857 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi,
858 &double_reduc, false);
859 if (reduc_stmt)
861 if (double_reduc)
863 if (dump_enabled_p ())
864 dump_printf_loc (MSG_NOTE, vect_location,
865 "Detected double reduction.\n");
867 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
868 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
869 vect_double_reduction_def;
871 else
873 if (loop != LOOP_VINFO_LOOP (loop_vinfo))
875 if (dump_enabled_p ())
876 dump_printf_loc (MSG_NOTE, vect_location,
877 "Detected vectorizable nested cycle.\n");
879 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
880 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
881 vect_nested_cycle;
883 else
885 if (dump_enabled_p ())
886 dump_printf_loc (MSG_NOTE, vect_location,
887 "Detected reduction.\n");
889 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
890 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
891 vect_reduction_def;
892 /* Store the reduction cycles for possible vectorization in
893 loop-aware SLP if it was not detected as reduction
894 chain. */
895 if (! GROUP_FIRST_ELEMENT (vinfo_for_stmt (reduc_stmt)))
896 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
900 else
901 if (dump_enabled_p ())
902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
903 "Unknown def-use cycle pattern.\n");
908 /* Function vect_analyze_scalar_cycles.
910 Examine the cross iteration def-use cycles of scalar variables, by
911 analyzing the loop-header PHIs of scalar variables. Classify each
912 cycle as one of the following: invariant, induction, reduction, unknown.
 913    We do that for the loop represented by LOOP_VINFO, and also for its
 914    inner-loop, if one exists.
915 Examples for scalar cycles:
917 Example1: reduction:
919 loop1:
920 for (i=0; i<N; i++)
921 sum += a[i];
923 Example2: induction:
925 loop2:
926 for (i=0; i<N; i++)
927 a[i] = i; */
929 static void
930 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
932 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
934 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
936 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
937 Reductions in such inner-loop therefore have different properties than
938 the reductions in the nest that gets vectorized:
939 1. When vectorized, they are executed in the same order as in the original
940 scalar loop, so we can't change the order of computation when
941 vectorizing them.
942 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
943 current checks are too strict. */
945 if (loop->inner)
946 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
949 /* Transfer group and reduction information from STMT to its pattern stmt. */
951 static void
952 vect_fixup_reduc_chain (gimple *stmt)
954 gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
955 gimple *stmtp;
956 gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
957 && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
958 GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
961 stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
962 GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
963 stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
964 if (stmt)
965 GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
966 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
968 while (stmt);
969 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
972 /* Fixup scalar cycles that now have their stmts detected as patterns. */
974 static void
975 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
977 gimple *first;
978 unsigned i;
980 FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
981 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
983 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
984 while (next)
986 if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
987 break;
988 next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 990    /* If not all stmts in the chain are patterns, try to handle
991 the chain without patterns. */
992 if (! next)
994 vect_fixup_reduc_chain (first);
995 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
996 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
1001 /* Function vect_get_loop_niters.
1003 Determine how many iterations the loop is executed and place it
1004 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
1005 in NUMBER_OF_ITERATIONSM1. Place the condition under which the
1006 niter information holds in ASSUMPTIONS.
1008 Return the loop exit condition. */
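/* Illustrative note: if the latch of LOOP executes 7 times, then
   NUMBER_OF_ITERATIONSM1 is set to 7 and NUMBER_OF_ITERATIONS, the number
   of header executions, to 7 + 1 = 8.  */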
1011 static gcond *
1012 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1013 tree *number_of_iterations, tree *number_of_iterationsm1)
1015 edge exit = single_exit (loop);
1016 struct tree_niter_desc niter_desc;
1017 tree niter_assumptions, niter, may_be_zero;
1018 gcond *cond = get_loop_exit_condition (loop);
1020 *assumptions = boolean_true_node;
1021 *number_of_iterationsm1 = chrec_dont_know;
1022 *number_of_iterations = chrec_dont_know;
1023 if (dump_enabled_p ())
1024 dump_printf_loc (MSG_NOTE, vect_location,
1025 "=== get_loop_niters ===\n");
1027 if (!exit)
1028 return cond;
1030 niter = chrec_dont_know;
1031 may_be_zero = NULL_TREE;
1032 niter_assumptions = boolean_true_node;
1033 if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1034 || chrec_contains_undetermined (niter_desc.niter))
1035 return cond;
1037 niter_assumptions = niter_desc.assumptions;
1038 may_be_zero = niter_desc.may_be_zero;
1039 niter = niter_desc.niter;
1041 if (may_be_zero && integer_zerop (may_be_zero))
1042 may_be_zero = NULL_TREE;
1044 if (may_be_zero)
1046 if (COMPARISON_CLASS_P (may_be_zero))
1048 /* Try to combine may_be_zero with assumptions, this can simplify
1049 computation of niter expression. */
1050 if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1051 niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1052 niter_assumptions,
1053 fold_build1 (TRUTH_NOT_EXPR,
1054 boolean_type_node,
1055 may_be_zero));
1056 else
1057 niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1058 build_int_cst (TREE_TYPE (niter), 0), niter);
1060 may_be_zero = NULL_TREE;
1062 else if (integer_nonzerop (may_be_zero))
1064 *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1065 *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1066 return cond;
1068 else
1069 return cond;
1072 *assumptions = niter_assumptions;
1073 *number_of_iterationsm1 = niter;
1075 /* We want the number of loop header executions which is the number
1076 of latch executions plus one.
1077 ??? For UINT_MAX latch executions this number overflows to zero
1078 for loops like do { n++; } while (n != 0); */
1079 if (niter && !chrec_contains_undetermined (niter))
1080 niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1081 build_int_cst (TREE_TYPE (niter), 1));
1082 *number_of_iterations = niter;
1084 return cond;
1087 /* Function bb_in_loop_p
1089 Used as predicate for dfs order traversal of the loop bbs. */
1091 static bool
1092 bb_in_loop_p (const_basic_block bb, const void *data)
1094 const struct loop *const loop = (const struct loop *)data;
1095 if (flow_bb_inside_loop_p (loop, bb))
1096 return true;
1097 return false;
1101 /* Create and initialize a new loop_vec_info struct for LOOP_IN, as well as
1102 stmt_vec_info structs for all the stmts in LOOP_IN. */
1104 _loop_vec_info::_loop_vec_info (struct loop *loop_in)
1105 : vec_info (vec_info::loop, init_cost (loop_in)),
1106 loop (loop_in),
1107 bbs (XCNEWVEC (basic_block, loop->num_nodes)),
1108 num_itersm1 (NULL_TREE),
1109 num_iters (NULL_TREE),
1110 num_iters_unchanged (NULL_TREE),
1111 num_iters_assumptions (NULL_TREE),
1112 th (0),
1113 vectorization_factor (0),
1114 max_vectorization_factor (0),
1115 unaligned_dr (NULL),
1116 peeling_for_alignment (0),
1117 ptr_mask (0),
1118 slp_unrolling_factor (1),
1119 single_scalar_iteration_cost (0),
1120 vectorizable (false),
1121 peeling_for_gaps (false),
1122 peeling_for_niter (false),
1123 operands_swapped (false),
1124 no_data_dependencies (false),
1125 has_mask_store (false),
1126 scalar_loop (NULL),
1127 orig_loop_info (NULL)
1129 /* Create/Update stmt_info for all stmts in the loop. */
1130 basic_block *body = get_loop_body (loop);
1131 for (unsigned int i = 0; i < loop->num_nodes; i++)
1133 basic_block bb = body[i];
1134 gimple_stmt_iterator si;
1136 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1138 gimple *phi = gsi_stmt (si);
1139 gimple_set_uid (phi, 0);
1140 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, this));
1143 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1145 gimple *stmt = gsi_stmt (si);
1146 gimple_set_uid (stmt, 0);
1147 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, this));
1150 free (body);
1152 /* CHECKME: We want to visit all BBs before their successors (except for
1153 latch blocks, for which this assertion wouldn't hold). In the simple
 1154    case of the loop forms we allow, a dfs order of the BBs would be the same
1155 as reversed postorder traversal, so we are safe. */
1157 unsigned int nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1158 bbs, loop->num_nodes, loop);
1159 gcc_assert (nbbs == loop->num_nodes);
1163 /* Free all memory used by the _loop_vec_info, as well as all the
1164 stmt_vec_info structs of all the stmts in the loop. */
1166 _loop_vec_info::~_loop_vec_info ()
1168 int nbbs;
1169 gimple_stmt_iterator si;
1170 int j;
1172 nbbs = loop->num_nodes;
1173 for (j = 0; j < nbbs; j++)
1175 basic_block bb = bbs[j];
1176 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1177 free_stmt_vec_info (gsi_stmt (si));
1179 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1181 gimple *stmt = gsi_stmt (si);
1183 /* We may have broken canonical form by moving a constant
1184 into RHS1 of a commutative op. Fix such occurrences. */
1185 if (operands_swapped && is_gimple_assign (stmt))
1187 enum tree_code code = gimple_assign_rhs_code (stmt);
1189 if ((code == PLUS_EXPR
1190 || code == POINTER_PLUS_EXPR
1191 || code == MULT_EXPR)
1192 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1193 swap_ssa_operands (stmt,
1194 gimple_assign_rhs1_ptr (stmt),
1195 gimple_assign_rhs2_ptr (stmt));
1196 else if (code == COND_EXPR
1197 && CONSTANT_CLASS_P (gimple_assign_rhs2 (stmt)))
1199 tree cond_expr = gimple_assign_rhs1 (stmt);
1200 enum tree_code cond_code = TREE_CODE (cond_expr);
1202 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
1204 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr,
1205 0));
1206 cond_code = invert_tree_comparison (cond_code,
1207 honor_nans);
1208 if (cond_code != ERROR_MARK)
1210 TREE_SET_CODE (cond_expr, cond_code);
1211 swap_ssa_operands (stmt,
1212 gimple_assign_rhs2_ptr (stmt),
1213 gimple_assign_rhs3_ptr (stmt));
1219 /* Free stmt_vec_info. */
1220 free_stmt_vec_info (stmt);
1221 gsi_next (&si);
1225 free (bbs);
1227 loop->aux = NULL;
1231 /* Calculate the cost of one scalar iteration of the loop. */
1232 static void
1233 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1235 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1236 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1237 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1238 int innerloop_iters, i;
1240 /* Count statements in scalar loop. Using this as scalar cost for a single
1241 iteration for now.
1243 TODO: Add outer loop support.
1245 TODO: Consider assigning different costs to different scalar
1246 statements. */
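  /* Illustrative arithmetic only, assuming each counted stmt had a unit
     cost: a loop body with 8 such stmts, two of them inside an inner loop,
     would contribute 6 * 1 + 2 * 50 = 106, since inner-loop stmts are
     weighted by the FIXME factor of 50 used below.  */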
1248 /* FORNOW. */
1249 innerloop_iters = 1;
1250 if (loop->inner)
1251 innerloop_iters = 50; /* FIXME */
1253 for (i = 0; i < nbbs; i++)
1255 gimple_stmt_iterator si;
1256 basic_block bb = bbs[i];
1258 if (bb->loop_father == loop->inner)
1259 factor = innerloop_iters;
1260 else
1261 factor = 1;
1263 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1265 gimple *stmt = gsi_stmt (si);
1266 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1268 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1269 continue;
1271 /* Skip stmts that are not vectorized inside the loop. */
1272 if (stmt_info
1273 && !STMT_VINFO_RELEVANT_P (stmt_info)
1274 && (!STMT_VINFO_LIVE_P (stmt_info)
1275 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1276 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1277 continue;
1279 vect_cost_for_stmt kind;
1280 if (STMT_VINFO_DATA_REF (stmt_info))
1282 if (DR_IS_READ (STMT_VINFO_DATA_REF (stmt_info)))
1283 kind = scalar_load;
1284 else
1285 kind = scalar_store;
1287 else
1288 kind = scalar_stmt;
1290 scalar_single_iter_cost
1291 += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1292 factor, kind, stmt_info, 0, vect_prologue);
1295 LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1296 = scalar_single_iter_cost;
1300 /* Function vect_analyze_loop_form_1.
1302 Verify that certain CFG restrictions hold, including:
1303 - the loop has a pre-header
1304 - the loop has a single entry and exit
1305 - the loop exit condition is simple enough
 1306    - the number of iterations can be analyzed, i.e., a countable loop.  The
1307 niter could be analyzed under some assumptions. */
1309 bool
1310 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1311 tree *assumptions, tree *number_of_iterationsm1,
1312 tree *number_of_iterations, gcond **inner_loop_cond)
1314 if (dump_enabled_p ())
1315 dump_printf_loc (MSG_NOTE, vect_location,
1316 "=== vect_analyze_loop_form ===\n");
1318 /* Different restrictions apply when we are considering an inner-most loop,
1319 vs. an outer (nested) loop.
1320 (FORNOW. May want to relax some of these restrictions in the future). */
1322 if (!loop->inner)
1324 /* Inner-most loop. We currently require that the number of BBs is
1325 exactly 2 (the header and latch). Vectorizable inner-most loops
1326 look like this:
1328 (pre-header)
1330 header <--------+
1331 | | |
1332 | +--> latch --+
1334 (exit-bb) */
1336 if (loop->num_nodes != 2)
1338 if (dump_enabled_p ())
1339 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1340 "not vectorized: control flow in loop.\n");
1341 return false;
1344 if (empty_block_p (loop->header))
1346 if (dump_enabled_p ())
1347 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1348 "not vectorized: empty loop.\n");
1349 return false;
1352 else
1354 struct loop *innerloop = loop->inner;
1355 edge entryedge;
1357 /* Nested loop. We currently require that the loop is doubly-nested,
1358 contains a single inner loop, and the number of BBs is exactly 5.
1359 Vectorizable outer-loops look like this:
1361 (pre-header)
1363 header <---+
1365 inner-loop |
1367 tail ------+
1369 (exit-bb)
1371 The inner-loop has the properties expected of inner-most loops
1372 as described above. */
1374 if ((loop->inner)->inner || (loop->inner)->next)
1376 if (dump_enabled_p ())
1377 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1378 "not vectorized: multiple nested loops.\n");
1379 return false;
1382 if (loop->num_nodes != 5)
1384 if (dump_enabled_p ())
1385 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1386 "not vectorized: control flow in loop.\n");
1387 return false;
1390 entryedge = loop_preheader_edge (innerloop);
1391 if (entryedge->src != loop->header
1392 || !single_exit (innerloop)
1393 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1395 if (dump_enabled_p ())
1396 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1397 "not vectorized: unsupported outerloop form.\n");
1398 return false;
1401 /* Analyze the inner-loop. */
1402 tree inner_niterm1, inner_niter, inner_assumptions;
1403 if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1404 &inner_assumptions, &inner_niterm1,
1405 &inner_niter, NULL)
1406 /* Don't support analyzing niter under assumptions for inner
1407 loop. */
1408 || !integer_onep (inner_assumptions))
1410 if (dump_enabled_p ())
1411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1412 "not vectorized: Bad inner loop.\n");
1413 return false;
1416 if (!expr_invariant_in_loop_p (loop, inner_niter))
1418 if (dump_enabled_p ())
1419 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1420 "not vectorized: inner-loop count not"
1421 " invariant.\n");
1422 return false;
1425 if (dump_enabled_p ())
1426 dump_printf_loc (MSG_NOTE, vect_location,
1427 "Considering outer-loop vectorization.\n");
1430 if (!single_exit (loop)
1431 || EDGE_COUNT (loop->header->preds) != 2)
1433 if (dump_enabled_p ())
1435 if (!single_exit (loop))
1436 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1437 "not vectorized: multiple exits.\n");
1438 else if (EDGE_COUNT (loop->header->preds) != 2)
1439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1440 "not vectorized: too many incoming edges.\n");
1442 return false;
 1445    /* We assume that the loop exit condition is at the end of the loop, i.e.,
1446 that the loop is represented as a do-while (with a proper if-guard
1447 before the loop if needed), where the loop header contains all the
1448 executable statements, and the latch is empty. */
1449 if (!empty_block_p (loop->latch)
1450 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1452 if (dump_enabled_p ())
1453 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1454 "not vectorized: latch block not empty.\n");
1455 return false;
1458 /* Make sure the exit is not abnormal. */
1459 edge e = single_exit (loop);
1460 if (e->flags & EDGE_ABNORMAL)
1462 if (dump_enabled_p ())
1463 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1464 "not vectorized: abnormal loop exit edge.\n");
1465 return false;
1468 *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1469 number_of_iterationsm1);
1470 if (!*loop_cond)
1472 if (dump_enabled_p ())
1473 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1474 "not vectorized: complicated exit condition.\n");
1475 return false;
1478 if (integer_zerop (*assumptions)
1479 || !*number_of_iterations
1480 || chrec_contains_undetermined (*number_of_iterations))
1482 if (dump_enabled_p ())
1483 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1484 "not vectorized: number of iterations cannot be "
1485 "computed.\n");
1486 return false;
1489 if (integer_zerop (*number_of_iterations))
1491 if (dump_enabled_p ())
1492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1493 "not vectorized: number of iterations = 0.\n");
1494 return false;
1497 return true;
1500 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form. */
1502 loop_vec_info
1503 vect_analyze_loop_form (struct loop *loop)
1505 tree assumptions, number_of_iterations, number_of_iterationsm1;
1506 gcond *loop_cond, *inner_loop_cond = NULL;
1508 if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1509 &assumptions, &number_of_iterationsm1,
1510 &number_of_iterations, &inner_loop_cond))
1511 return NULL;
1513 loop_vec_info loop_vinfo = new _loop_vec_info (loop);
1514 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1515 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1516 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1517 if (!integer_onep (assumptions))
 1519    /* We consider vectorizing this loop by versioning it under
1520 some assumptions. In order to do this, we need to clear
1521 existing information computed by scev and niter analyzer. */
1522 scev_reset_htab ();
1523 free_numbers_of_iterations_estimates (loop);
 1524    /* Also set a flag for this loop so that the following scev and niter
 1525       analyses are done under the assumptions.  */
1526 loop_constraint_set (loop, LOOP_C_FINITE);
1527 /* Also record the assumptions for versioning. */
1528 LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1531 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1533 if (dump_enabled_p ())
1535 dump_printf_loc (MSG_NOTE, vect_location,
1536 "Symbolic number of iterations is ");
1537 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1538 dump_printf (MSG_NOTE, "\n");
1542 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1543 if (inner_loop_cond)
1544 STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1545 = loop_exit_ctrl_vec_info_type;
1547 gcc_assert (!loop->aux);
1548 loop->aux = loop_vinfo;
1549 return loop_vinfo;
 1554 /* Scan the loop stmts and, depending on whether there are any (non-)SLP
 1555    statements, update the vectorization factor.  */
1557 static void
1558 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1560 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1561 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1562 int nbbs = loop->num_nodes;
1563 unsigned int vectorization_factor;
1564 int i;
1566 if (dump_enabled_p ())
1567 dump_printf_loc (MSG_NOTE, vect_location,
1568 "=== vect_update_vf_for_slp ===\n");
1570 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1571 gcc_assert (vectorization_factor != 0);
1573 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
 1574    the vectorization factor of the loop is the unrolling factor required by
 1575    the SLP instances.  If that unrolling factor is 1, we say that we
 1576    perform pure SLP on the loop; cross-iteration parallelism is not
1577 exploited. */
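  /* Worked example, for illustration only: when the loop also contains
     non-SLP stmts, a loop VF of 4 combined with an SLP unrolling factor of 2
     stays at least_common_multiple (4, 2) = 4, while an unrolling factor of 3
     would raise the VF to 12.  */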
1578 bool only_slp_in_loop = true;
1579 for (i = 0; i < nbbs; i++)
1581 basic_block bb = bbs[i];
1582 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1583 gsi_next (&si))
1585 gimple *stmt = gsi_stmt (si);
1586 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1587 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1588 && STMT_VINFO_RELATED_STMT (stmt_info))
1590 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1591 stmt_info = vinfo_for_stmt (stmt);
1593 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1594 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1595 && !PURE_SLP_STMT (stmt_info))
1596 /* STMT needs both SLP and loop-based vectorization. */
1597 only_slp_in_loop = false;
1601 if (only_slp_in_loop)
1603 dump_printf_loc (MSG_NOTE, vect_location,
1604 "Loop contains only SLP stmts\n");
1605 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1607 else
1609 dump_printf_loc (MSG_NOTE, vect_location,
1610 "Loop contains SLP and non-SLP stmts\n");
1611 vectorization_factor
1612 = least_common_multiple (vectorization_factor,
1613 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1616 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1617 if (dump_enabled_p ())
1618 dump_printf_loc (MSG_NOTE, vect_location,
1619 "Updating vectorization factor to %d\n",
1620 vectorization_factor);
1623 /* Function vect_analyze_loop_operations.
1625 Scan the loop stmts and make sure they are all vectorizable. */
1627 static bool
1628 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1630 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1631 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1632 int nbbs = loop->num_nodes;
1633 int i;
1634 stmt_vec_info stmt_info;
1635 bool need_to_vectorize = false;
1636 bool ok;
1638 if (dump_enabled_p ())
1639 dump_printf_loc (MSG_NOTE, vect_location,
1640 "=== vect_analyze_loop_operations ===\n");
1642 for (i = 0; i < nbbs; i++)
1644 basic_block bb = bbs[i];
1646 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1647 gsi_next (&si))
1649 gphi *phi = si.phi ();
1650 ok = true;
1652 stmt_info = vinfo_for_stmt (phi);
1653 if (dump_enabled_p ())
1655 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1656 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1658 if (virtual_operand_p (gimple_phi_result (phi)))
1659 continue;
1661 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1662 (i.e., a phi in the tail of the outer-loop). */
1663 if (! is_loop_header_bb_p (bb))
1665 /* FORNOW: we currently don't support the case that these phis
1666 are not used in the outerloop (unless it is double reduction,
 1667    i.e., this phi is vect_reduction_def), because this case
 1668    requires us to actually do something here.  */
1669 if (STMT_VINFO_LIVE_P (stmt_info)
1670 && STMT_VINFO_DEF_TYPE (stmt_info)
1671 != vect_double_reduction_def)
1673 if (dump_enabled_p ())
1674 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1675 "Unsupported loop-closed phi in "
1676 "outer-loop.\n");
1677 return false;
1680 /* If PHI is used in the outer loop, we check that its operand
1681 is defined in the inner loop. */
1682 if (STMT_VINFO_RELEVANT_P (stmt_info))
1684 tree phi_op;
1685 gimple *op_def_stmt;
1687 if (gimple_phi_num_args (phi) != 1)
1688 return false;
1690 phi_op = PHI_ARG_DEF (phi, 0);
1691 if (TREE_CODE (phi_op) != SSA_NAME)
1692 return false;
1694 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1695 if (gimple_nop_p (op_def_stmt)
1696 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1697 || !vinfo_for_stmt (op_def_stmt))
1698 return false;
1700 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1701 != vect_used_in_outer
1702 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1703 != vect_used_in_outer_by_reduction)
1704 return false;
1707 continue;
1710 gcc_assert (stmt_info);
1712 if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1713 || STMT_VINFO_LIVE_P (stmt_info))
1714 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1716 /* A scalar-dependence cycle that we don't support. */
1717 if (dump_enabled_p ())
1718 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1719 "not vectorized: scalar dependence cycle.\n");
1720 return false;
1723 if (STMT_VINFO_RELEVANT_P (stmt_info))
1725 need_to_vectorize = true;
1726 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
1727 && ! PURE_SLP_STMT (stmt_info))
1728 ok = vectorizable_induction (phi, NULL, NULL, NULL);
1729 else if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
1730 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
1731 && ! PURE_SLP_STMT (stmt_info))
1732 ok = vectorizable_reduction (phi, NULL, NULL, NULL, NULL);
1735 if (ok && STMT_VINFO_LIVE_P (stmt_info))
1736 ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1738 if (!ok)
1740 if (dump_enabled_p ())
1742 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1743 "not vectorized: relevant phi not "
1744 "supported: ");
1745 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1747 return false;
1751 for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1752 gsi_next (&si))
1754 gimple *stmt = gsi_stmt (si);
1755 if (!gimple_clobber_p (stmt)
1756 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL, NULL))
1757 return false;
1759 } /* bbs */
1761 /* All operations in the loop are either irrelevant (deal with loop
1762 control, or dead), or only used outside the loop and can be moved
1763 out of the loop (e.g. invariants, inductions). The loop can be
1764 optimized away by scalar optimizations. We're better off not
1765 touching this loop. */
1766 if (!need_to_vectorize)
1768 if (dump_enabled_p ())
1769 dump_printf_loc (MSG_NOTE, vect_location,
1770 "All the computation can be taken out of the loop.\n");
1771 if (dump_enabled_p ())
1772 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1773 "not vectorized: redundant loop. no profit to "
1774 "vectorize.\n");
1775 return false;
1778 return true;
1782 /* Function vect_analyze_loop_2.
1784 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1785 for it. The different analyses will record information in the
1786 loop_vec_info struct. */
1787 static bool
1788 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1790 bool ok;
1791 int max_vf = MAX_VECTORIZATION_FACTOR;
1792 int min_vf = 2;
1793 unsigned int n_stmts = 0;
1795 /* The first group of checks is independent of the vector size. */
1796 fatal = true;
1798 /* Find all data references in the loop (which correspond to vdefs/vuses)
1799 and analyze their evolution in the loop. */
1801 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1803 loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1804 if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1808 "not vectorized: loop nest containing two "
1809 "or more consecutive inner loops cannot be "
1810 "vectorized\n");
1811 return false;
1814 for (unsigned i = 0; i < loop->num_nodes; i++)
1815 for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1816 !gsi_end_p (gsi); gsi_next (&gsi))
1818 gimple *stmt = gsi_stmt (gsi);
1819 if (is_gimple_debug (stmt))
1820 continue;
1821 ++n_stmts;
1822 if (!find_data_references_in_stmt (loop, stmt,
1823 &LOOP_VINFO_DATAREFS (loop_vinfo)))
1825 if (is_gimple_call (stmt) && loop->safelen)
1827 tree fndecl = gimple_call_fndecl (stmt), op;
1828 if (fndecl != NULL_TREE)
1830 cgraph_node *node = cgraph_node::get (fndecl);
1831 if (node != NULL && node->simd_clones != NULL)
1833 unsigned int j, n = gimple_call_num_args (stmt);
1834 for (j = 0; j < n; j++)
1836 op = gimple_call_arg (stmt, j);
1837 if (DECL_P (op)
1838 || (REFERENCE_CLASS_P (op)
1839 && get_base_address (op)))
1840 break;
1842 op = gimple_call_lhs (stmt);
1843 /* Ignore #pragma omp declare simd functions
1844 if they don't have data references in the
1845 call stmt itself. */
1846 if (j == n
1847 && !(op
1848 && (DECL_P (op)
1849 || (REFERENCE_CLASS_P (op)
1850 && get_base_address (op)))))
1851 continue;
1855 if (dump_enabled_p ())
1856 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1857 "not vectorized: loop contains function "
1858 "calls or data references that cannot "
1859 "be analyzed\n");
1860 return false;
1864 /* Analyze the data references and also adjust the minimal
1865 vectorization factor according to the loads and stores. */
1867 ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1868 if (!ok)
1870 if (dump_enabled_p ())
1871 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1872 "bad data references.\n");
1873 return false;
1876 /* Classify all cross-iteration scalar data-flow cycles.
1877 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1878 vect_analyze_scalar_cycles (loop_vinfo);
1880 vect_pattern_recog (loop_vinfo);
1882 vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1884 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1885 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1887 ok = vect_analyze_data_ref_accesses (loop_vinfo);
1888 if (!ok)
1890 if (dump_enabled_p ())
1891 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1892 "bad data access.\n");
1893 return false;
1896 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1898 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1899 if (!ok)
1901 if (dump_enabled_p ())
1902 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1903 "unexpected pattern.\n");
1904 return false;
 1907  /* The rest of the analysis below, however, depends on the vector size in some way.  */
1908 fatal = false;
1910 /* Analyze data dependences between the data-refs in the loop
1911 and adjust the maximum vectorization factor according to
1912 the dependences.
1913 FORNOW: fail at the first data dependence that we encounter. */
1915 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1916 if (!ok
1917 || max_vf < min_vf)
1919 if (dump_enabled_p ())
1920 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1921 "bad data dependence.\n");
1922 return false;
1924 LOOP_VINFO_MAX_VECT_FACTOR (loop_vinfo) = max_vf;
1926 ok = vect_determine_vectorization_factor (loop_vinfo);
1927 if (!ok)
1929 if (dump_enabled_p ())
1930 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1931 "can't determine vectorization factor.\n");
1932 return false;
1934 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1936 if (dump_enabled_p ())
1937 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1938 "bad data dependence.\n");
1939 return false;
1942 /* Compute the scalar iteration cost. */
1943 vect_compute_single_scalar_iteration_cost (loop_vinfo);
1945 int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1946 HOST_WIDE_INT estimated_niter;
1947 unsigned th;
1948 int min_scalar_loop_bound;
1950 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1951 ok = vect_analyze_slp (loop_vinfo, n_stmts);
1952 if (!ok)
1953 return false;
1955 /* If there are any SLP instances mark them as pure_slp. */
1956 bool slp = vect_make_slp_decision (loop_vinfo);
1957 if (slp)
1959 /* Find stmts that need to be both vectorized and SLPed. */
1960 vect_detect_hybrid_slp (loop_vinfo);
1962 /* Update the vectorization factor based on the SLP decision. */
1963 vect_update_vf_for_slp (loop_vinfo);
1966 /* This is the point where we can re-start analysis with SLP forced off. */
1967 start_over:
1969 /* Now the vectorization factor is final. */
1970 unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1971 gcc_assert (vectorization_factor != 0);
1973 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1974 dump_printf_loc (MSG_NOTE, vect_location,
1975 "vectorization_factor = %d, niters = "
1976 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1977 LOOP_VINFO_INT_NITERS (loop_vinfo));
1979 HOST_WIDE_INT max_niter
1980 = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1981 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1982 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1983 || (max_niter != -1
1984 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1986 if (dump_enabled_p ())
1987 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1988 "not vectorized: iteration count smaller than "
1989 "vectorization factor.\n");
1990 return false;
1993 /* Analyze the alignment of the data-refs in the loop.
1994 Fail if a data reference is found that cannot be vectorized. */
1996 ok = vect_analyze_data_refs_alignment (loop_vinfo);
1997 if (!ok)
1999 if (dump_enabled_p ())
2000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2001 "bad data alignment.\n");
2002 return false;
2005 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2006 It is important to call pruning after vect_analyze_data_ref_accesses,
2007 since we use grouping information gathered by interleaving analysis. */
2008 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2009 if (!ok)
2010 return false;
2012 /* Do not invoke vect_enhance_data_refs_alignment for epilogue
2013 vectorization. */
2014 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
2016 /* This pass will decide on using loop versioning and/or loop peeling in
2017 order to enhance the alignment of data references in the loop. */
2018 ok = vect_enhance_data_refs_alignment (loop_vinfo);
2019 if (!ok)
2021 if (dump_enabled_p ())
2022 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2023 "bad data alignment.\n");
2024 return false;
2028 if (slp)
2030 /* Analyze operations in the SLP instances. Note this may
2031 remove unsupported SLP instances which makes the above
2032 SLP kind detection invalid. */
2033 unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2034 vect_slp_analyze_operations (loop_vinfo);
2035 if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2036 goto again;
2039 /* Scan all the remaining operations in the loop that are not subject
2040 to SLP and make sure they are vectorizable. */
2041 ok = vect_analyze_loop_operations (loop_vinfo);
2042 if (!ok)
2044 if (dump_enabled_p ())
2045 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2046 "bad operation or unsupported loop bound.\n");
2047 return false;
2050 /* If epilog loop is required because of data accesses with gaps,
2051 one additional iteration needs to be peeled. Check if there are
2052 enough iterations for vectorization. */
2053 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2054 && LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2056 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2057 tree scalar_niters = LOOP_VINFO_NITERSM1 (loop_vinfo);
2059 if (wi::to_widest (scalar_niters) < vf)
2061 if (dump_enabled_p ())
2062 dump_printf_loc (MSG_NOTE, vect_location,
2063 "loop has no enough iterations to support"
2064 " peeling for gaps.\n");
2065 return false;
2069 /* Analyze cost. Decide if worth while to vectorize. */
2070 int min_profitable_estimate, min_profitable_iters;
2071 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2072 &min_profitable_estimate);
2074 if (min_profitable_iters < 0)
2076 if (dump_enabled_p ())
2077 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2078 "not vectorized: vectorization not profitable.\n");
2079 if (dump_enabled_p ())
2080 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2081 "not vectorized: vector version will never be "
2082 "profitable.\n");
2083 goto again;
2086 min_scalar_loop_bound = (PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2087 * vectorization_factor);
2089 /* Use the cost model only if it is more conservative than user specified
2090 threshold. */
2091 th = (unsigned) MAX (min_scalar_loop_bound, min_profitable_iters);
2093 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
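/* Editor's illustration (hypothetical values): with --param
   min-vect-loop-bound=2 and a vectorization factor of 4,
   min_scalar_loop_bound is 2 * 4 = 8; if the cost model computed
   min_profitable_iters = 10, then th = MAX (8, 10) = 10, i.e. the more
   conservative of the user bound and the cost-model bound.  */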
2095 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2096 && LOOP_VINFO_INT_NITERS (loop_vinfo) < th)
2098 if (dump_enabled_p ())
2099 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2100 "not vectorized: vectorization not profitable.\n");
2101 if (dump_enabled_p ())
2102 dump_printf_loc (MSG_NOTE, vect_location,
2103 "not vectorized: iteration count smaller than user "
2104 "specified loop bound parameter or minimum profitable "
2105 "iterations (whichever is more conservative).\n");
2106 goto again;
2109 estimated_niter
2110 = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2111 if (estimated_niter == -1)
2112 estimated_niter = max_niter;
2113 if (estimated_niter != -1
2114 && ((unsigned HOST_WIDE_INT) estimated_niter
2115 < MAX (th, (unsigned) min_profitable_estimate)))
2117 if (dump_enabled_p ())
2118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2119 "not vectorized: estimated iteration count too "
2120 "small.\n");
2121 if (dump_enabled_p ())
2122 dump_printf_loc (MSG_NOTE, vect_location,
2123 "not vectorized: estimated iteration count smaller "
2124 "than specified loop bound parameter or minimum "
2125 "profitable iterations (whichever is more "
2126 "conservative).\n");
2127 goto again;
2130 /* Decide whether we need to create an epilogue loop to handle
2131 remaining scalar iterations. */
2132 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo)
2133 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2134 * LOOP_VINFO_VECT_FACTOR (loop_vinfo));
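/* For illustration only: the integer division rounds the cost-model
   threshold down to a multiple of the vectorization factor, e.g. a
   threshold of 10 with a vectorization factor of 4 yields
   th = (10 / 4) * 4 = 8.  */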
2136 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2137 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2139 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2140 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2141 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2142 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2144 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2145 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2146 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2147 /* In case of versioning, check if the maximum number of
2148 iterations is greater than th. If they are identical,
2149 the epilogue is unnecessary. */
2150 && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2151 || (unsigned HOST_WIDE_INT) max_niter > th)))
2152 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
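/* Worked example with hypothetical numbers: for known niters = 100,
   3 iterations peeled for alignment and a vectorization factor of 4,
   the remaining count 100 - 3 = 97 is odd, so its ctz (0) is smaller
   than log2 (4) = 2 and an epilogue loop is required for the leftover
   scalar iterations.  */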
2154 /* If an epilogue loop is required make sure we can create one. */
2155 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2156 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2158 if (dump_enabled_p ())
2159 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2160 if (!vect_can_advance_ivs_p (loop_vinfo)
2161 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2162 single_exit (LOOP_VINFO_LOOP
2163 (loop_vinfo))))
2165 if (dump_enabled_p ())
2166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2167 "not vectorized: can't create required "
2168 "epilog loop\n");
2169 goto again;
2173 /* During peeling, we need to check if number of loop iterations is
2174 enough for both peeled prolog loop and vector loop. This check
2175 can be merged along with threshold check of loop versioning, so
2176 increase threshold for this case if necessary. */
2177 if (LOOP_REQUIRES_VERSIONING (loop_vinfo)
2178 && (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2179 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)))
2181 unsigned niters_th;
2183 /* Niters for peeled prolog loop. */
2184 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2186 struct data_reference *dr = LOOP_VINFO_UNALIGNED_DR (loop_vinfo);
2187 tree vectype = STMT_VINFO_VECTYPE (vinfo_for_stmt (DR_STMT (dr)));
2189 niters_th = TYPE_VECTOR_SUBPARTS (vectype) - 1;
2191 else
2192 niters_th = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2194 /* Niters for at least one iteration of vectorized loop. */
2195 niters_th += LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2196 /* One additional iteration because of peeling for gap. */
2197 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
2198 niters_th++;
2199 if (LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) < niters_th)
2200 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = niters_th;
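/* Illustration (assumed values): for a 4-element vector type with the
   alignment peel count unknown at compile time, plus peeling for gaps,
   niters_th = (4 - 1) + 4 + 1 = 8, and the versioning threshold is
   raised to at least that many iterations.  */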
2203 gcc_assert (vectorization_factor
2204 == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2206 /* Ok to vectorize! */
2207 return true;
2209 again:
2210 /* Try again with SLP forced off, but if we didn't do any SLP there is
2211 no point in re-trying. */
2212 if (!slp)
2213 return false;
2215 /* If there are reduction chains re-trying will fail anyway. */
2216 if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2217 return false;
2219 /* Likewise if the grouped loads or stores in the SLP cannot be handled
2220 via interleaving or lane instructions. */
2221 slp_instance instance;
2222 slp_tree node;
2223 unsigned i, j;
2224 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2226 stmt_vec_info vinfo;
2227 vinfo = vinfo_for_stmt
2228 (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2229 if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2230 continue;
2231 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2232 unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2233 tree vectype = STMT_VINFO_VECTYPE (vinfo);
2234 if (! vect_store_lanes_supported (vectype, size)
2235 && ! vect_grouped_store_supported (vectype, size))
2236 return false;
2237 FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2239 vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2240 vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2241 bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2242 size = STMT_VINFO_GROUP_SIZE (vinfo);
2243 vectype = STMT_VINFO_VECTYPE (vinfo);
2244 if (! vect_load_lanes_supported (vectype, size)
2245 && ! vect_grouped_load_supported (vectype, single_element_p,
2246 size))
2247 return false;
2251 if (dump_enabled_p ())
2252 dump_printf_loc (MSG_NOTE, vect_location,
2253 "re-trying with SLP disabled\n");
2255 /* Roll back state appropriately. No SLP this time. */
2256 slp = false;
2257 /* Restore the vectorization factor as it was without SLP. */
2258 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2259 /* Free the SLP instances. */
2260 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2261 vect_free_slp_instance (instance);
2262 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2263 /* Reset SLP type to loop_vect on all stmts. */
2264 for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2266 basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2267 for (gimple_stmt_iterator si = gsi_start_phis (bb);
2268 !gsi_end_p (si); gsi_next (&si))
2270 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2271 STMT_SLP_TYPE (stmt_info) = loop_vect;
2273 for (gimple_stmt_iterator si = gsi_start_bb (bb);
2274 !gsi_end_p (si); gsi_next (&si))
2276 stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2277 STMT_SLP_TYPE (stmt_info) = loop_vect;
2278 if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2280 stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2281 STMT_SLP_TYPE (stmt_info) = loop_vect;
2282 for (gimple_stmt_iterator pi
2283 = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2284 !gsi_end_p (pi); gsi_next (&pi))
2286 gimple *pstmt = gsi_stmt (pi);
2287 STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2292 /* Free optimized alias test DDRS. */
2293 LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2294 LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).release ();
2295 /* Reset target cost data. */
2296 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2297 LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2298 = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2299 /* Reset assorted flags. */
2300 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2301 LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2302 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2304 goto start_over;
2307 /* Function vect_analyze_loop.
2309 Apply a set of analyses on LOOP, and create a loop_vec_info struct
2310 for it. The different analyses will record information in the
2311 loop_vec_info struct. If ORIG_LOOP_VINFO is not NULL, the epilogue must
2312 be vectorized. */
2313 loop_vec_info
2314 vect_analyze_loop (struct loop *loop, loop_vec_info orig_loop_vinfo)
2316 loop_vec_info loop_vinfo;
2317 unsigned int vector_sizes;
2319 /* Autodetect first vector size we try. */
2320 current_vector_size = 0;
2321 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2323 if (dump_enabled_p ())
2324 dump_printf_loc (MSG_NOTE, vect_location,
2325 "===== analyze_loop_nest =====\n");
2327 if (loop_outer (loop)
2328 && loop_vec_info_for_loop (loop_outer (loop))
2329 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2331 if (dump_enabled_p ())
2332 dump_printf_loc (MSG_NOTE, vect_location,
2333 "outer-loop already vectorized.\n");
2334 return NULL;
2337 while (1)
2339 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
2340 loop_vinfo = vect_analyze_loop_form (loop);
2341 if (!loop_vinfo)
2343 if (dump_enabled_p ())
2344 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2345 "bad loop form.\n");
2346 return NULL;
2349 bool fatal = false;
2351 if (orig_loop_vinfo)
2352 LOOP_VINFO_ORIG_LOOP_INFO (loop_vinfo) = orig_loop_vinfo;
2354 if (vect_analyze_loop_2 (loop_vinfo, fatal))
2356 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2358 return loop_vinfo;
2361 delete loop_vinfo;
2363 vector_sizes &= ~current_vector_size;
2364 if (fatal
2365 || vector_sizes == 0
2366 || current_vector_size == 0)
2367 return NULL;
2369 /* Try the next biggest vector size. */
2370 current_vector_size = 1 << floor_log2 (vector_sizes);
2371 if (dump_enabled_p ())
2372 dump_printf_loc (MSG_NOTE, vect_location,
2373 "***** Re-trying analysis with "
2374 "vector size %d\n", current_vector_size);
2379 /* Function reduction_code_for_scalar_code
2381 Input:
2382 CODE - tree_code of a reduction operations.
2384 Output:
2385 REDUC_CODE - the corresponding tree-code to be used to reduce the
2386 vector of partial results into a single scalar result, or ERROR_MARK
2387 if the operation is a supported reduction operation, but does not have
2388 such a tree-code.
2390 Return FALSE if CODE currently cannot be vectorized as a reduction. */
2392 static bool
2393 reduction_code_for_scalar_code (enum tree_code code,
2394 enum tree_code *reduc_code)
2396 switch (code)
2398 case MAX_EXPR:
2399 *reduc_code = REDUC_MAX_EXPR;
2400 return true;
2402 case MIN_EXPR:
2403 *reduc_code = REDUC_MIN_EXPR;
2404 return true;
2406 case PLUS_EXPR:
2407 *reduc_code = REDUC_PLUS_EXPR;
2408 return true;
2410 case MULT_EXPR:
2411 case MINUS_EXPR:
2412 case BIT_IOR_EXPR:
2413 case BIT_XOR_EXPR:
2414 case BIT_AND_EXPR:
2415 *reduc_code = ERROR_MARK;
2416 return true;
2418 default:
2419 return false;
2424 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
2425 STMT is printed with a message MSG. */
2427 static void
2428 report_vect_op (dump_flags_t msg_type, gimple *stmt, const char *msg)
2430 dump_printf_loc (msg_type, vect_location, "%s", msg);
2431 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2435 /* Detect SLP reduction of the form:
2437 #a1 = phi <a5, a0>
2438 a2 = operation (a1)
2439 a3 = operation (a2)
2440 a4 = operation (a3)
2441 a5 = operation (a4)
2443 #a = phi <a5>
2445 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2446 FIRST_STMT is the first reduction stmt in the chain
2447 (a2 = operation (a1)).
2449 Return TRUE if a reduction chain was detected. */
2451 static bool
2452 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2453 gimple *first_stmt)
2455 struct loop *loop = (gimple_bb (phi))->loop_father;
2456 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2457 enum tree_code code;
2458 gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2459 stmt_vec_info use_stmt_info, current_stmt_info;
2460 tree lhs;
2461 imm_use_iterator imm_iter;
2462 use_operand_p use_p;
2463 int nloop_uses, size = 0, n_out_of_loop_uses;
2464 bool found = false;
2466 if (loop != vect_loop)
2467 return false;
2469 lhs = PHI_RESULT (phi);
2470 code = gimple_assign_rhs_code (first_stmt);
2471 while (1)
2473 nloop_uses = 0;
2474 n_out_of_loop_uses = 0;
2475 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2477 gimple *use_stmt = USE_STMT (use_p);
2478 if (is_gimple_debug (use_stmt))
2479 continue;
2481 /* Check if we got back to the reduction phi. */
2482 if (use_stmt == phi)
2484 loop_use_stmt = use_stmt;
2485 found = true;
2486 break;
2489 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2491 loop_use_stmt = use_stmt;
2492 nloop_uses++;
2494 else
2495 n_out_of_loop_uses++;
2497 /* There can be either a single use in the loop or two uses in
2498 phi nodes. */
2499 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2500 return false;
2503 if (found)
2504 break;
2506 /* We reached a statement with no loop uses. */
2507 if (nloop_uses == 0)
2508 return false;
2510 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2511 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2512 return false;
2514 if (!is_gimple_assign (loop_use_stmt)
2515 || code != gimple_assign_rhs_code (loop_use_stmt)
2516 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2517 return false;
2519 /* Insert USE_STMT into reduction chain. */
2520 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2521 if (current_stmt)
2523 current_stmt_info = vinfo_for_stmt (current_stmt);
2524 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2525 GROUP_FIRST_ELEMENT (use_stmt_info)
2526 = GROUP_FIRST_ELEMENT (current_stmt_info);
2528 else
2529 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2531 lhs = gimple_assign_lhs (loop_use_stmt);
2532 current_stmt = loop_use_stmt;
2533 size++;
2536 if (!found || loop_use_stmt != phi || size < 2)
2537 return false;
2539 /* Swap the operands, if needed, to make the reduction operand be the second
2540 operand. */
2541 lhs = PHI_RESULT (phi);
2542 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2543 while (next_stmt)
2545 if (gimple_assign_rhs2 (next_stmt) == lhs)
2547 tree op = gimple_assign_rhs1 (next_stmt);
2548 gimple *def_stmt = NULL;
2550 if (TREE_CODE (op) == SSA_NAME)
2551 def_stmt = SSA_NAME_DEF_STMT (op);
2553 /* Check that the other def is either defined in the loop
2554 ("vect_internal_def"), or it's an induction (defined by a
2555 loop-header phi-node). */
2556 if (def_stmt
2557 && gimple_bb (def_stmt)
2558 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2559 && (is_gimple_assign (def_stmt)
2560 || is_gimple_call (def_stmt)
2561 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2562 == vect_induction_def
2563 || (gimple_code (def_stmt) == GIMPLE_PHI
2564 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2565 == vect_internal_def
2566 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2568 lhs = gimple_assign_lhs (next_stmt);
2569 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2570 continue;
2573 return false;
2575 else
2577 tree op = gimple_assign_rhs2 (next_stmt);
2578 gimple *def_stmt = NULL;
2580 if (TREE_CODE (op) == SSA_NAME)
2581 def_stmt = SSA_NAME_DEF_STMT (op);
2583 /* Check that the other def is either defined in the loop
2584 ("vect_internal_def"), or it's an induction (defined by a
2585 loop-header phi-node). */
2586 if (def_stmt
2587 && gimple_bb (def_stmt)
2588 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2589 && (is_gimple_assign (def_stmt)
2590 || is_gimple_call (def_stmt)
2591 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2592 == vect_induction_def
2593 || (gimple_code (def_stmt) == GIMPLE_PHI
2594 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2595 == vect_internal_def
2596 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2598 if (dump_enabled_p ())
2600 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2601 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2604 swap_ssa_operands (next_stmt,
2605 gimple_assign_rhs1_ptr (next_stmt),
2606 gimple_assign_rhs2_ptr (next_stmt));
2607 update_stmt (next_stmt);
2609 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2610 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2612 else
2613 return false;
2616 lhs = gimple_assign_lhs (next_stmt);
2617 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2620 /* Save the chain for further analysis in SLP detection. */
2621 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2622 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2623 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2625 return true;
2629 /* Function vect_is_simple_reduction
2631 (1) Detect a cross-iteration def-use cycle that represents a simple
2632 reduction computation. We look for the following pattern:
2634 loop_header:
2635 a1 = phi < a0, a2 >
2636 a3 = ...
2637 a2 = operation (a3, a1)
2641 a3 = ...
2642 loop_header:
2643 a1 = phi < a0, a2 >
2644 a2 = operation (a3, a1)
2646 such that:
2647 1. operation is commutative and associative and it is safe to
2648 change the order of the computation
2649 2. no uses for a2 in the loop (a2 is used out of the loop)
2650 3. no uses of a1 in the loop besides the reduction operation
2651 4. no uses of a1 outside the loop.
2653 Conditions 1,4 are tested here.
2654 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2656 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2657 nested cycles.
2659 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2660 reductions:
2662 a1 = phi < a0, a2 >
2663 inner loop (def of a3)
2664 a2 = phi < a3 >
2666 (4) Detect condition expressions, i.e.:
2667 for (int i = 0; i < N; i++)
2668 if (a[i] < val)
2669 ret_val = a[i];
2673 static gimple *
2674 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2675 bool *double_reduc,
2676 bool need_wrapping_integral_overflow,
2677 enum vect_reduction_type *v_reduc_type)
2679 struct loop *loop = (gimple_bb (phi))->loop_father;
2680 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2681 gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2682 enum tree_code orig_code, code;
2683 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2684 tree type;
2685 int nloop_uses;
2686 tree name;
2687 imm_use_iterator imm_iter;
2688 use_operand_p use_p;
2689 bool phi_def;
2691 *double_reduc = false;
2692 *v_reduc_type = TREE_CODE_REDUCTION;
2694 tree phi_name = PHI_RESULT (phi);
2695 /* ??? If there are no uses of the PHI result the inner loop reduction
2696 won't be detected as possibly double-reduction by vectorizable_reduction
2697 because that tries to walk the PHI arg from the preheader edge which
2698 can be constant. See PR60382. */
2699 if (has_zero_uses (phi_name))
2700 return NULL;
2701 nloop_uses = 0;
2702 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, phi_name)
2704 gimple *use_stmt = USE_STMT (use_p);
2705 if (is_gimple_debug (use_stmt))
2706 continue;
2708 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2710 if (dump_enabled_p ())
2711 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2712 "intermediate value used outside loop.\n");
2714 return NULL;
2717 nloop_uses++;
2718 if (nloop_uses > 1)
2720 if (dump_enabled_p ())
2721 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2722 "reduction value used in loop.\n");
2723 return NULL;
2726 phi_use_stmt = use_stmt;
2729 edge latch_e = loop_latch_edge (loop);
2730 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2731 if (TREE_CODE (loop_arg) != SSA_NAME)
2733 if (dump_enabled_p ())
2735 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2736 "reduction: not ssa_name: ");
2737 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2738 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2740 return NULL;
2743 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2744 if (is_gimple_assign (def_stmt))
2746 name = gimple_assign_lhs (def_stmt);
2747 phi_def = false;
2749 else if (gimple_code (def_stmt) == GIMPLE_PHI)
2751 name = PHI_RESULT (def_stmt);
2752 phi_def = true;
2754 else
2756 if (dump_enabled_p ())
2758 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2759 "reduction: unhandled reduction operation: ");
2760 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, def_stmt, 0);
2762 return NULL;
2765 if (! flow_bb_inside_loop_p (loop, gimple_bb (def_stmt)))
2766 return NULL;
2768 nloop_uses = 0;
2769 auto_vec<gphi *, 3> lcphis;
2770 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2772 gimple *use_stmt = USE_STMT (use_p);
2773 if (is_gimple_debug (use_stmt))
2774 continue;
2775 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2776 nloop_uses++;
2777 else
2778 /* We can have more than one loop-closed PHI. */
2779 lcphis.safe_push (as_a <gphi *> (use_stmt));
2780 if (nloop_uses > 1)
2782 if (dump_enabled_p ())
2783 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2784 "reduction used in loop.\n");
2785 return NULL;
2789 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2790 defined in the inner loop. */
2791 if (phi_def)
2793 op1 = PHI_ARG_DEF (def_stmt, 0);
2795 if (gimple_phi_num_args (def_stmt) != 1
2796 || TREE_CODE (op1) != SSA_NAME)
2798 if (dump_enabled_p ())
2799 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2800 "unsupported phi node definition.\n");
2802 return NULL;
2805 def1 = SSA_NAME_DEF_STMT (op1);
2806 if (gimple_bb (def1)
2807 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2808 && loop->inner
2809 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2810 && is_gimple_assign (def1)
2811 && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2813 if (dump_enabled_p ())
2814 report_vect_op (MSG_NOTE, def_stmt,
2815 "detected double reduction: ");
2817 *double_reduc = true;
2818 return def_stmt;
2821 return NULL;
2824 /* If we are vectorizing an inner reduction we are executing that
2825 in the original order only in case we are not dealing with a
2826 double reduction. */
2827 bool check_reduction = true;
2828 if (flow_loop_nested_p (vect_loop, loop))
2830 gphi *lcphi;
2831 unsigned i;
2832 check_reduction = false;
2833 FOR_EACH_VEC_ELT (lcphis, i, lcphi)
2834 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, gimple_phi_result (lcphi))
2836 gimple *use_stmt = USE_STMT (use_p);
2837 if (is_gimple_debug (use_stmt))
2838 continue;
2839 if (! flow_bb_inside_loop_p (vect_loop, gimple_bb (use_stmt)))
2840 check_reduction = true;
2844 bool nested_in_vect_loop = flow_loop_nested_p (vect_loop, loop);
2845 code = orig_code = gimple_assign_rhs_code (def_stmt);
2847 /* We can handle "res -= x[i]", which is non-associative by
2848 simply rewriting this into "res += -x[i]". Avoid changing
2849 the gimple instruction for the first simple tests, and only do this
2850 if we're allowed to change code at all. */
2851 if (code == MINUS_EXPR && gimple_assign_rhs2 (def_stmt) != phi_name)
2852 code = PLUS_EXPR;
2854 if (code == COND_EXPR)
2856 if (! nested_in_vect_loop)
2857 *v_reduc_type = COND_REDUCTION;
2859 op3 = gimple_assign_rhs1 (def_stmt);
2860 if (COMPARISON_CLASS_P (op3))
2862 op4 = TREE_OPERAND (op3, 1);
2863 op3 = TREE_OPERAND (op3, 0);
2865 if (op3 == phi_name || op4 == phi_name)
2867 if (dump_enabled_p ())
2868 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2869 "reduction: condition depends on previous"
2870 " iteration: ");
2871 return NULL;
2874 op1 = gimple_assign_rhs2 (def_stmt);
2875 op2 = gimple_assign_rhs3 (def_stmt);
2877 else if (!commutative_tree_code (code) || !associative_tree_code (code))
2879 if (dump_enabled_p ())
2880 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2881 "reduction: not commutative/associative: ");
2882 return NULL;
2884 else if (get_gimple_rhs_class (code) == GIMPLE_BINARY_RHS)
2886 op1 = gimple_assign_rhs1 (def_stmt);
2887 op2 = gimple_assign_rhs2 (def_stmt);
2889 else
2891 if (dump_enabled_p ())
2892 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2893 "reduction: not handled operation: ");
2894 return NULL;
2897 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2899 if (dump_enabled_p ())
2900 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2901 "reduction: both uses not ssa_names: ");
2903 return NULL;
2906 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2907 if ((TREE_CODE (op1) == SSA_NAME
2908 && !types_compatible_p (type,TREE_TYPE (op1)))
2909 || (TREE_CODE (op2) == SSA_NAME
2910 && !types_compatible_p (type, TREE_TYPE (op2)))
2911 || (op3 && TREE_CODE (op3) == SSA_NAME
2912 && !types_compatible_p (type, TREE_TYPE (op3)))
2913 || (op4 && TREE_CODE (op4) == SSA_NAME
2914 && !types_compatible_p (type, TREE_TYPE (op4))))
2916 if (dump_enabled_p ())
2918 dump_printf_loc (MSG_NOTE, vect_location,
2919 "reduction: multiple types: operation type: ");
2920 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2921 dump_printf (MSG_NOTE, ", operands types: ");
2922 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2923 TREE_TYPE (op1));
2924 dump_printf (MSG_NOTE, ",");
2925 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2926 TREE_TYPE (op2));
2927 if (op3)
2929 dump_printf (MSG_NOTE, ",");
2930 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2931 TREE_TYPE (op3));
2934 if (op4)
2936 dump_printf (MSG_NOTE, ",");
2937 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2938 TREE_TYPE (op4));
2940 dump_printf (MSG_NOTE, "\n");
2943 return NULL;
2946 /* Check that it's ok to change the order of the computation.
2947 Generally, when vectorizing a reduction we change the order of the
2948 computation. This may change the behavior of the program in some
2949 cases, so we need to check that this is ok. One exception is when
2950 vectorizing an outer-loop: the inner-loop is executed sequentially,
2951 and therefore vectorizing reductions in the inner-loop during
2952 outer-loop vectorization is safe. */
2954 if (*v_reduc_type != COND_REDUCTION
2955 && check_reduction)
2957 /* CHECKME: check for !flag_finite_math_only too? */
2958 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2960 /* Changing the order of operations changes the semantics. */
2961 if (dump_enabled_p ())
2962 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2963 "reduction: unsafe fp math optimization: ");
2964 return NULL;
2966 else if (INTEGRAL_TYPE_P (type))
2968 if (!operation_no_trapping_overflow (type, code))
2970 /* Changing the order of operations changes the semantics. */
2971 if (dump_enabled_p ())
2972 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2973 "reduction: unsafe int math optimization"
2974 " (overflow traps): ");
2975 return NULL;
2977 if (need_wrapping_integral_overflow
2978 && !TYPE_OVERFLOW_WRAPS (type)
2979 && operation_can_overflow (code))
2981 /* Changing the order of operations changes the semantics. */
2982 if (dump_enabled_p ())
2983 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2984 "reduction: unsafe int math optimization"
2985 " (overflow doesn't wrap): ");
2986 return NULL;
2989 else if (SAT_FIXED_POINT_TYPE_P (type))
2991 /* Changing the order of operations changes the semantics. */
2992 if (dump_enabled_p ())
2993 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2994 "reduction: unsafe fixed-point math optimization: ");
2995 return NULL;
2999 /* Reduction is safe. We're dealing with one of the following:
3000 1) integer arithmetic and no trapv
3001 2) floating point arithmetic, and special flags permit this optimization
3002 3) nested cycle (i.e., outer loop vectorization). */
3003 if (TREE_CODE (op1) == SSA_NAME)
3004 def1 = SSA_NAME_DEF_STMT (op1);
3006 if (TREE_CODE (op2) == SSA_NAME)
3007 def2 = SSA_NAME_DEF_STMT (op2);
3009 if (code != COND_EXPR
3010 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
3012 if (dump_enabled_p ())
3013 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
3014 return NULL;
3017 /* Check that one def is the reduction def, defined by PHI,
3018 the other def is either defined in the loop ("vect_internal_def"),
3019 or it's an induction (defined by a loop-header phi-node). */
3021 if (def2 && def2 == phi
3022 && (code == COND_EXPR
3023 || !def1 || gimple_nop_p (def1)
3024 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
3025 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
3026 && (is_gimple_assign (def1)
3027 || is_gimple_call (def1)
3028 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3029 == vect_induction_def
3030 || (gimple_code (def1) == GIMPLE_PHI
3031 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
3032 == vect_internal_def
3033 && !is_loop_header_bb_p (gimple_bb (def1)))))))
3035 if (dump_enabled_p ())
3036 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3037 return def_stmt;
3040 if (def1 && def1 == phi
3041 && (code == COND_EXPR
3042 || !def2 || gimple_nop_p (def2)
3043 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3044 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3045 && (is_gimple_assign (def2)
3046 || is_gimple_call (def2)
3047 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3048 == vect_induction_def
3049 || (gimple_code (def2) == GIMPLE_PHI
3050 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3051 == vect_internal_def
3052 && !is_loop_header_bb_p (gimple_bb (def2)))))))
3054 if (! nested_in_vect_loop && orig_code != MINUS_EXPR)
3056 /* Check if we can swap operands (just for simplicity - so that
3057 the rest of the code can assume that the reduction variable
3058 is always the last (second) argument). */
3059 if (code == COND_EXPR)
3061 /* Swap cond_expr by inverting the condition. */
3062 tree cond_expr = gimple_assign_rhs1 (def_stmt);
3063 enum tree_code invert_code = ERROR_MARK;
3064 enum tree_code cond_code = TREE_CODE (cond_expr);
3066 if (TREE_CODE_CLASS (cond_code) == tcc_comparison)
3068 bool honor_nans = HONOR_NANS (TREE_OPERAND (cond_expr, 0));
3069 invert_code = invert_tree_comparison (cond_code, honor_nans);
3071 if (invert_code != ERROR_MARK)
3073 TREE_SET_CODE (cond_expr, invert_code);
3074 swap_ssa_operands (def_stmt,
3075 gimple_assign_rhs2_ptr (def_stmt),
3076 gimple_assign_rhs3_ptr (def_stmt));
3078 else
3080 if (dump_enabled_p ())
3081 report_vect_op (MSG_NOTE, def_stmt,
3082 "detected reduction: cannot swap operands "
3083 "for cond_expr");
3084 return NULL;
3087 else
3088 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3089 gimple_assign_rhs2_ptr (def_stmt));
3091 if (dump_enabled_p ())
3092 report_vect_op (MSG_NOTE, def_stmt,
3093 "detected reduction: need to swap operands: ");
3095 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3096 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3098 else
3100 if (dump_enabled_p ())
3101 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3104 return def_stmt;
3107 /* Try to find SLP reduction chain. */
3108 if (! nested_in_vect_loop
3109 && code != COND_EXPR
3110 && orig_code != MINUS_EXPR
3111 && vect_is_slp_reduction (loop_info, phi, def_stmt))
3113 if (dump_enabled_p ())
3114 report_vect_op (MSG_NOTE, def_stmt,
3115 "reduction: detected reduction chain: ");
3117 return def_stmt;
3120 /* Dissolve a group possibly half-built by vect_is_slp_reduction. */
3121 gimple *first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (def_stmt));
3122 while (first)
3124 gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
3125 GROUP_FIRST_ELEMENT (vinfo_for_stmt (first)) = NULL;
3126 GROUP_NEXT_ELEMENT (vinfo_for_stmt (first)) = NULL;
3127 first = next;
3130 /* Look for the expression computing loop_arg from loop PHI result. */
3131 auto_vec<std::pair<ssa_op_iter, use_operand_p> > path;
3132 auto_bitmap visited;
3133 tree lookfor = PHI_RESULT (phi);
3134 ssa_op_iter curri;
3135 use_operand_p curr = op_iter_init_phiuse (&curri, as_a <gphi *>(phi),
3136 SSA_OP_USE);
3137 while (USE_FROM_PTR (curr) != loop_arg)
3138 curr = op_iter_next_use (&curri);
3139 curri.i = curri.numops;
3142 path.safe_push (std::make_pair (curri, curr));
3143 tree use = USE_FROM_PTR (curr);
3144 if (use == lookfor)
3145 break;
3146 gimple *def = SSA_NAME_DEF_STMT (use);
3147 if (gimple_nop_p (def)
3148 || ! flow_bb_inside_loop_p (loop, gimple_bb (def)))
3150 pop:
3153 std::pair<ssa_op_iter, use_operand_p> x = path.pop ();
3154 curri = x.first;
3155 curr = x.second;
3157 curr = op_iter_next_use (&curri);
3158 /* Skip already visited or non-SSA operands (from iterating
3159 over PHI args). */
3160 while (curr != NULL_USE_OPERAND_P
3161 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3162 || ! bitmap_set_bit (visited,
3163 SSA_NAME_VERSION
3164 (USE_FROM_PTR (curr)))));
3166 while (curr == NULL_USE_OPERAND_P && ! path.is_empty ());
3167 if (curr == NULL_USE_OPERAND_P)
3168 break;
3170 else
3172 if (gimple_code (def) == GIMPLE_PHI)
3173 curr = op_iter_init_phiuse (&curri, as_a <gphi *>(def), SSA_OP_USE);
3174 else
3175 curr = op_iter_init_use (&curri, def, SSA_OP_USE);
3176 while (curr != NULL_USE_OPERAND_P
3177 && (TREE_CODE (USE_FROM_PTR (curr)) != SSA_NAME
3178 || ! bitmap_set_bit (visited,
3179 SSA_NAME_VERSION
3180 (USE_FROM_PTR (curr)))))
3181 curr = op_iter_next_use (&curri);
3182 if (curr == NULL_USE_OPERAND_P)
3183 goto pop;
3186 while (1);
3187 if (dump_file && (dump_flags & TDF_DETAILS))
3189 dump_printf_loc (MSG_NOTE, vect_location,
3190 "reduction path: ");
3191 unsigned i;
3192 std::pair<ssa_op_iter, use_operand_p> *x;
3193 FOR_EACH_VEC_ELT (path, i, x)
3195 dump_generic_expr (MSG_NOTE, TDF_SLIM, USE_FROM_PTR (x->second));
3196 dump_printf (MSG_NOTE, " ");
3198 dump_printf (MSG_NOTE, "\n");
3201 /* Check whether the reduction path detected is valid. */
3202 bool fail = path.length () == 0;
3203 bool neg = false;
3204 for (unsigned i = 1; i < path.length (); ++i)
3206 gimple *use_stmt = USE_STMT (path[i].second);
3207 tree op = USE_FROM_PTR (path[i].second);
3208 if (! has_single_use (op)
3209 || ! is_gimple_assign (use_stmt))
3211 fail = true;
3212 break;
3214 if (gimple_assign_rhs_code (use_stmt) != code)
3216 if (code == PLUS_EXPR
3217 && gimple_assign_rhs_code (use_stmt) == MINUS_EXPR)
3219 /* Track whether we negate the reduction value each iteration. */
3220 if (gimple_assign_rhs2 (use_stmt) == op)
3221 neg = ! neg;
3223 else
3225 fail = true;
3226 break;
3230 if (! fail && ! neg)
3231 return def_stmt;
3233 if (dump_enabled_p ())
3235 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3236 "reduction: unknown pattern: ");
3239 return NULL;
3242 /* Wrapper around vect_is_simple_reduction, which will modify code
3243 in-place if it enables detection of more reductions. Arguments
3244 as there. */
3246 gimple *
3247 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3248 bool *double_reduc,
3249 bool need_wrapping_integral_overflow)
3251 enum vect_reduction_type v_reduc_type;
3252 gimple *def = vect_is_simple_reduction (loop_info, phi, double_reduc,
3253 need_wrapping_integral_overflow,
3254 &v_reduc_type);
3255 if (def)
3257 stmt_vec_info reduc_def_info = vinfo_for_stmt (phi);
3258 STMT_VINFO_REDUC_TYPE (reduc_def_info) = v_reduc_type;
3259 STMT_VINFO_REDUC_DEF (reduc_def_info) = def;
3260 reduc_def_info = vinfo_for_stmt (def);
3261 STMT_VINFO_REDUC_DEF (reduc_def_info) = phi;
3263 return def;
3266 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
3268 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3269 int *peel_iters_epilogue,
3270 stmt_vector_for_cost *scalar_cost_vec,
3271 stmt_vector_for_cost *prologue_cost_vec,
3272 stmt_vector_for_cost *epilogue_cost_vec)
3274 int retval = 0;
3275 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3277 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3279 *peel_iters_epilogue = vf/2;
3280 if (dump_enabled_p ())
3281 dump_printf_loc (MSG_NOTE, vect_location,
3282 "cost model: epilogue peel iters set to vf/2 "
3283 "because loop iterations are unknown .\n");
3285 /* If peeled iterations are known but number of scalar loop
3286 iterations are unknown, count a taken branch per peeled loop. */
3287 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3288 NULL, 0, vect_prologue);
3289 retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3290 NULL, 0, vect_epilogue);
3292 else
3294 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3295 peel_iters_prologue = niters < peel_iters_prologue ?
3296 niters : peel_iters_prologue;
3297 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3298 /* If we need to peel for gaps, but no peeling is required, we have to
3299 peel VF iterations. */
3300 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3301 *peel_iters_epilogue = vf;
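/* Hypothetical example: with niters = 100, peel_iters_prologue = 3 and
   vf = 8, the epilogue peels (100 - 3) % 8 = 1 iteration; if peeling
   for gaps were required and that remainder were 0, a full vf = 8
   iterations would be counted instead.  */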
3304 stmt_info_for_cost *si;
3305 int j;
3306 if (peel_iters_prologue)
3307 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3309 stmt_vec_info stmt_info
3310 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3311 retval += record_stmt_cost (prologue_cost_vec,
3312 si->count * peel_iters_prologue,
3313 si->kind, stmt_info, si->misalign,
3314 vect_prologue);
3316 if (*peel_iters_epilogue)
3317 FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3319 stmt_vec_info stmt_info
3320 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3321 retval += record_stmt_cost (epilogue_cost_vec,
3322 si->count * *peel_iters_epilogue,
3323 si->kind, stmt_info, si->misalign,
3324 vect_epilogue);
3327 return retval;
3330 /* Function vect_estimate_min_profitable_iters
3332 Return the number of iterations required for the vector version of the
3333 loop to be profitable relative to the cost of the scalar version of the
3334 loop.
3336 *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3337 of iterations for vectorization. -1 value means loop vectorization
3338 is not profitable. This returned value may be used for dynamic
3339 profitability check.
3341 *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3342 for static check against estimated number of iterations. */
3344 static void
3345 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3346 int *ret_min_profitable_niters,
3347 int *ret_min_profitable_estimate)
3349 int min_profitable_iters;
3350 int min_profitable_estimate;
3351 int peel_iters_prologue;
3352 int peel_iters_epilogue;
3353 unsigned vec_inside_cost = 0;
3354 int vec_outside_cost = 0;
3355 unsigned vec_prologue_cost = 0;
3356 unsigned vec_epilogue_cost = 0;
3357 int scalar_single_iter_cost = 0;
3358 int scalar_outside_cost = 0;
3359 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3360 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3361 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3363 /* Cost model disabled. */
3364 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3366 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3367 *ret_min_profitable_niters = 0;
3368 *ret_min_profitable_estimate = 0;
3369 return;
3372 /* Requires loop versioning tests to handle misalignment. */
3373 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3375 /* FIXME: Make cost depend on complexity of individual check. */
3376 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3377 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3378 vect_prologue);
3379 dump_printf (MSG_NOTE,
3380 "cost model: Adding cost of checks for loop "
3381 "versioning to treat misalignment.\n");
3384 /* Requires loop versioning with alias checks. */
3385 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3387 /* FIXME: Make cost depend on complexity of individual check. */
3388 unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3389 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3390 vect_prologue);
3391 len = LOOP_VINFO_CHECK_UNEQUAL_ADDRS (loop_vinfo).length ();
3392 if (len)
3393 /* Count LEN - 1 ANDs and LEN comparisons. */
3394 (void) add_stmt_cost (target_cost_data, len * 2 - 1, scalar_stmt,
3395 NULL, 0, vect_prologue);
3396 dump_printf (MSG_NOTE,
3397 "cost model: Adding cost of checks for loop "
3398 "versioning aliasing.\n");
3401 /* Requires loop versioning with niter checks. */
3402 if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3404 /* FIXME: Make cost depend on complexity of individual check. */
3405 (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3406 vect_prologue);
3407 dump_printf (MSG_NOTE,
3408 "cost model: Adding cost of checks for loop "
3409 "versioning niters.\n");
3412 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3413 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3414 vect_prologue);
3416 /* Count statements in scalar loop. Using this as scalar cost for a single
3417 iteration for now.
3419 TODO: Add outer loop support.
3421 TODO: Consider assigning different costs to different scalar
3422 statements. */
3424 scalar_single_iter_cost
3425 = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3427 /* Add additional cost for the peeled instructions in prologue and epilogue
3428 loop.
3430 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3431 at compile-time, we assume it's vf/2 (the worst case would be vf-1).
3433 TODO: Build an expression that represents peel_iters for prologue and
3434 epilogue to be used in a run-time test. */
3436 if (npeel < 0)
3438 peel_iters_prologue = vf/2;
3439 dump_printf (MSG_NOTE, "cost model: "
3440 "prologue peel iters set to vf/2.\n");
3442 /* If peeling for alignment is unknown, loop bound of main loop becomes
3443 unknown. */
3444 peel_iters_epilogue = vf/2;
3445 dump_printf (MSG_NOTE, "cost model: "
3446 "epilogue peel iters set to vf/2 because "
3447 "peeling for alignment is unknown.\n");
3449 /* If peeled iterations are unknown, count a taken branch and a not taken
3450 branch per peeled loop. Even if scalar loop iterations are known,
3451 vector iterations are not known since peeled prologue iterations are
3452 not known. Hence guards remain the same. */
3453 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3454 NULL, 0, vect_prologue);
3455 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3456 NULL, 0, vect_prologue);
3457 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3458 NULL, 0, vect_epilogue);
3459 (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3460 NULL, 0, vect_epilogue);
3461 stmt_info_for_cost *si;
3462 int j;
3463 FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3465 struct _stmt_vec_info *stmt_info
3466 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3467 (void) add_stmt_cost (target_cost_data,
3468 si->count * peel_iters_prologue,
3469 si->kind, stmt_info, si->misalign,
3470 vect_prologue);
3471 (void) add_stmt_cost (target_cost_data,
3472 si->count * peel_iters_epilogue,
3473 si->kind, stmt_info, si->misalign,
3474 vect_epilogue);
3477 else
3479 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3480 stmt_info_for_cost *si;
3481 int j;
3482 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3484 prologue_cost_vec.create (2);
3485 epilogue_cost_vec.create (2);
3486 peel_iters_prologue = npeel;
3488 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3489 &peel_iters_epilogue,
3490 &LOOP_VINFO_SCALAR_ITERATION_COST
3491 (loop_vinfo),
3492 &prologue_cost_vec,
3493 &epilogue_cost_vec);
3495 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3497 struct _stmt_vec_info *stmt_info
3498 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3499 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3500 si->misalign, vect_prologue);
3503 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3505 struct _stmt_vec_info *stmt_info
3506 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3507 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3508 si->misalign, vect_epilogue);
3511 prologue_cost_vec.release ();
3512 epilogue_cost_vec.release ();
3515 /* FORNOW: The scalar outside cost is incremented in one of the
3516 following ways:
3518 1. The vectorizer checks for alignment and aliasing and generates
3519 a condition that allows dynamic vectorization. A cost model
3520 check is ANDED with the versioning condition. Hence scalar code
3521 path now has the added cost of the versioning check.
3523 if (cost > th & versioning_check)
3524 jmp to vector code
3526 Hence run-time scalar is incremented by not-taken branch cost.
3528 2. The vectorizer then checks if a prologue is required. If the
3529 cost model check was not done before during versioning, it has to
3530 be done before the prologue check.
3532 if (cost <= th)
3533 prologue = scalar_iters
3534 if (prologue == 0)
3535 jmp to vector code
3536 else
3537 execute prologue
3538 if (prologue == num_iters)
3539 go to exit
3541 Hence the run-time scalar cost is incremented by a taken branch,
3542 plus a not-taken branch, plus a taken branch cost.
3544 3. The vectorizer then checks if an epilogue is required. If the
3545 cost model check was not done before during prologue check, it
3546 has to be done with the epilogue check.
3548 if (prologue == 0)
3549 jmp to vector code
3550 else
3551 execute prologue
3552 if (prologue == num_iters)
3553 go to exit
3554 vector code:
3555 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3556 jmp to epilogue
3558 Hence the run-time scalar cost should be incremented by 2 taken
3559 branches.
3561 TODO: The back end may reorder the BBs differently and reverse
3562 conditions/branch directions. Change the estimates below to
3563 something more reasonable. */
3565 /* If the number of iterations is known and we do not do versioning, we can
3566 decide whether to vectorize at compile time. Hence the scalar version
3567 does not carry cost model guard costs. */
3568 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3569 || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3571 /* Cost model check occurs at versioning. */
3572 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3573 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3574 else
3576 /* Cost model check occurs at prologue generation. */
3577 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3578 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3579 + vect_get_stmt_cost (cond_branch_not_taken);
3580 /* Cost model check occurs at epilogue generation. */
3581 else
3582 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3586 /* Complete the target-specific cost calculations. */
3587 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3588 &vec_inside_cost, &vec_epilogue_cost);
3590 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3592 if (dump_enabled_p ())
3594 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3595 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3596 vec_inside_cost);
3597 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3598 vec_prologue_cost);
3599 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3600 vec_epilogue_cost);
3601 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3602 scalar_single_iter_cost);
3603 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3604 scalar_outside_cost);
3605 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3606 vec_outside_cost);
3607 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3608 peel_iters_prologue);
3609 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3610 peel_iters_epilogue);
3613 /* Calculate number of iterations required to make the vector version
3614 profitable, relative to the loop bodies only. The following condition
3615 must hold true:
3616 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3617 where
3618 SIC = scalar iteration cost, VIC = vector iteration cost,
3619 VOC = vector outside cost, VF = vectorization factor,
3620 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3621 SOC = scalar outside cost for run time cost model check. */
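/* A worked instance with made-up costs: for SIC = 4, VIC = 6, VF = 4,
   VOC = 40, SOC = 8 and no peeling, the inequality becomes
   4 * niters + 8 > 1.5 * niters + 40, i.e. niters > 12.8, so the
   computation below yields min_profitable_iters = 13 (before it is
   raised below to at least VF plus the prologue iterations).  */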
3623 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3625 if (vec_outside_cost <= 0)
3626 min_profitable_iters = 0;
3627 else
3629 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3630 - vec_inside_cost * peel_iters_prologue
3631 - vec_inside_cost * peel_iters_epilogue)
3632 / ((scalar_single_iter_cost * vf)
3633 - vec_inside_cost);
3635 if ((scalar_single_iter_cost * vf * min_profitable_iters)
3636 <= (((int) vec_inside_cost * min_profitable_iters)
3637 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3638 min_profitable_iters++;
3641 /* vector version will never be profitable. */
3642 else
3644 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3645 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3646 "did not happen for a simd loop");
3648 if (dump_enabled_p ())
3649 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3650 "cost model: the vector iteration cost = %d "
3651 "divided by the scalar iteration cost = %d "
3652 "is greater or equal to the vectorization factor = %d"
3653 ".\n",
3654 vec_inside_cost, scalar_single_iter_cost, vf);
3655 *ret_min_profitable_niters = -1;
3656 *ret_min_profitable_estimate = -1;
3657 return;
3660 dump_printf (MSG_NOTE,
3661 " Calculated minimum iters for profitability: %d\n",
3662 min_profitable_iters);
3664 /* We want the vectorized loop to execute at least once. */
3665 if (min_profitable_iters < (vf + peel_iters_prologue))
3666 min_profitable_iters = vf + peel_iters_prologue;
3668 if (dump_enabled_p ())
3669 dump_printf_loc (MSG_NOTE, vect_location,
3670 " Runtime profitability threshold = %d\n",
3671 min_profitable_iters);
3673 *ret_min_profitable_niters = min_profitable_iters;
3675 /* Calculate number of iterations required to make the vector version
3676 profitable, relative to the loop bodies only.
3678 Non-vectorized variant is SIC * niters and it must win over vector
3679 variant on the expected loop trip count. The following condition must hold true:
3680 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
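/* Continuing the made-up numbers from the runtime check above
   (SIC = 4, VIC = 6, VF = 4, VOC = 40, SOC = 8, no peeling), the
   static condition is 4 * niters > 1.5 * niters + 48, and the integer
   computation below gives (40 + 8) * 4 / 10 = 19, which is then
   clamped to be at least min_profitable_iters.  */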
3682 if (vec_outside_cost <= 0)
3683 min_profitable_estimate = 0;
3684 else
3686 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3687 - vec_inside_cost * peel_iters_prologue
3688 - vec_inside_cost * peel_iters_epilogue)
3689 / ((scalar_single_iter_cost * vf)
3690 - vec_inside_cost);
3692 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3693 if (dump_enabled_p ())
3694 dump_printf_loc (MSG_NOTE, vect_location,
3695 " Static estimate profitability threshold = %d\n",
3696 min_profitable_estimate);
3698 *ret_min_profitable_estimate = min_profitable_estimate;
3701 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3702 vector elements (not bits) for a vector with NELT elements. */
3703 static void
3704 calc_vec_perm_mask_for_shift (unsigned int offset, unsigned int nelt,
3705 vec_perm_indices *sel)
3707 unsigned int i;
3709 for (i = 0; i < nelt; i++)
3710 sel->quick_push ((i + offset) & (2 * nelt - 1));
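/* Example for illustration: with nelt = 4 and offset = 1 the selector
   pushed above is {1, 2, 3, 4}, i.e. each output element is taken one
   position further along the (conceptually doubled) input, which
   shifts the vector down by one element.  */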
3713 /* Checks whether the target supports whole-vector shifts for vectors of mode
3714 MODE. This is the case if _either_ the platform handles vec_shr_optab, _or_
3715 it supports vec_perm_const with masks for all necessary shift amounts. */
3716 static bool
3717 have_whole_vector_shift (machine_mode mode)
3719 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3720 return true;
3722 if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3723 return false;
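/* The reduction epilogue only ever shifts by NELT/2, NELT/4, ..., 1,
   so those are the offsets checked below.  */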
3725 unsigned int i, nelt = GET_MODE_NUNITS (mode);
3726 auto_vec_perm_indices sel (nelt);
3728 for (i = nelt/2; i >= 1; i/=2)
3730 sel.truncate (0);
3731 calc_vec_perm_mask_for_shift (i, nelt, &sel);
3732 if (!can_vec_perm_p (mode, false, &sel))
3733 return false;
3735 return true;
3738 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3739 functions. Design better to avoid maintenance issues. */
3741 /* Function vect_model_reduction_cost.
3743 Models cost for a reduction operation, including the vector ops
3744 generated within the strip-mine loop, the initial definition before
3745 the loop, and the epilogue code that must be generated. */
3747 static void
3748 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3749 int ncopies)
3751 int prologue_cost = 0, epilogue_cost = 0;
3752 enum tree_code code;
3753 optab optab;
3754 tree vectype;
3755 gimple *orig_stmt;
3756 machine_mode mode;
3757 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3758 struct loop *loop = NULL;
3759 void *target_cost_data;
3761 if (loop_vinfo)
3763 loop = LOOP_VINFO_LOOP (loop_vinfo);
3764 target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3766 else
3767 target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3769 /* Condition reductions generate two reductions in the loop. */
3770 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3771 ncopies *= 2;
3773 /* Cost of reduction op inside loop. */
3774 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3775 stmt_info, 0, vect_body);
3777 vectype = STMT_VINFO_VECTYPE (stmt_info);
3778 mode = TYPE_MODE (vectype);
3779 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3781 if (!orig_stmt)
3782 orig_stmt = STMT_VINFO_STMT (stmt_info);
3784 code = gimple_assign_rhs_code (orig_stmt);
3786 /* Add in cost for initial definition.
3787 For cond reduction we have four vectors: initial index, step, initial
3788 result of the data reduction, initial value of the index reduction. */
3789 int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3790 == COND_REDUCTION ? 4 : 1;
3791 prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3792 scalar_to_vec, stmt_info, 0,
3793 vect_prologue);
3795 /* Determine cost of epilogue code.
3797 We have a reduction operator that will reduce the vector in one statement.
3798 Also requires scalar extract. */
3800 if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3802 if (reduc_code != ERROR_MARK)
3804 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3806 /* An EQ stmt and a COND_EXPR stmt. */
3807 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3808 vector_stmt, stmt_info, 0,
3809 vect_epilogue);
3810 /* Reduction of the max index and a reduction of the found
3811 values. */
3812 epilogue_cost += add_stmt_cost (target_cost_data, 2,
3813 vec_to_scalar, stmt_info, 0,
3814 vect_epilogue);
3815 /* A broadcast of the max value. */
3816 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3817 scalar_to_vec, stmt_info, 0,
3818 vect_epilogue);
3820 else
3822 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3823 stmt_info, 0, vect_epilogue);
3824 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3825 vec_to_scalar, stmt_info, 0,
3826 vect_epilogue);
3829 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3831 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
3832 /* Extraction of scalar elements. */
3833 epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits,
3834 vec_to_scalar, stmt_info, 0,
3835 vect_epilogue);
3836 /* Scalar max reductions via COND_EXPR / MAX_EXPR. */
3837 epilogue_cost += add_stmt_cost (target_cost_data, 2 * nunits - 3,
3838 scalar_stmt, stmt_info, 0,
3839 vect_epilogue);
3841 else
3843 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3844 tree bitsize =
3845 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3846 int element_bitsize = tree_to_uhwi (bitsize);
3847 int nelements = vec_size_in_bits / element_bitsize;
3849 if (code == COND_EXPR)
3850 code = MAX_EXPR;
3852 optab = optab_for_tree_code (code, vectype, optab_default);
3854 /* We have a whole vector shift available. */
3855 if (optab != unknown_optab
3856 && VECTOR_MODE_P (mode)
3857 && optab_handler (optab, mode) != CODE_FOR_nothing
3858 && have_whole_vector_shift (mode))
3860 /* Final reduction via vector shifts and the reduction operator.
3861 Also requires scalar extract. */
3862 epilogue_cost += add_stmt_cost (target_cost_data,
3863 exact_log2 (nelements) * 2,
3864 vector_stmt, stmt_info, 0,
3865 vect_epilogue);
3866 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3867 vec_to_scalar, stmt_info, 0,
3868 vect_epilogue);
3870 else
3871 /* Use extracts and reduction op for final reduction. For N
3872 elements, we have N extracts and N-1 reduction ops. */
3873 epilogue_cost += add_stmt_cost (target_cost_data,
3874 nelements + nelements - 1,
3875 vector_stmt, stmt_info, 0,
3876 vect_epilogue);
3880 if (dump_enabled_p ())
3881 dump_printf (MSG_NOTE,
3882 "vect_model_reduction_cost: inside_cost = %d, "
3883 "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3884 prologue_cost, epilogue_cost);
3888 /* Function vect_model_induction_cost.
3890 Models cost for induction operations. */
3892 static void
3893 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3895 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3896 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3897 unsigned inside_cost, prologue_cost;
3899 if (PURE_SLP_STMT (stmt_info))
3900 return;
3902 /* loop cost for vec_loop. */
3903 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3904 stmt_info, 0, vect_body);
3906 /* prologue cost for vec_init and vec_step. */
3907 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3908 stmt_info, 0, vect_prologue);
3910 if (dump_enabled_p ())
3911 dump_printf_loc (MSG_NOTE, vect_location,
3912 "vect_model_induction_cost: inside_cost = %d, "
3913 "prologue_cost = %d .\n", inside_cost, prologue_cost);
3918 /* Function get_initial_def_for_reduction
3920 Input:
3921 STMT - a stmt that performs a reduction operation in the loop.
3922 INIT_VAL - the initial value of the reduction variable
3924 Output:
3925 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3926 of the reduction (used for adjusting the epilog - see below).
3927 Return a vector variable, initialized according to the operation that STMT
3928 performs. This vector will be used as the initial value of the
3929 vector of partial results.
3931 Option1 (adjust in epilog): Initialize the vector as follows:
3932 add/bit or/xor: [0,0,...,0,0]
3933 mult/bit and: [1,1,...,1,1]
3934 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3935 and when necessary (e.g. add/mult case) let the caller know
3936 that it needs to adjust the result by init_val.
3938 Option2: Initialize the vector as follows:
3939 add/bit or/xor: [init_val,0,0,...,0]
3940 mult/bit and: [init_val,1,1,...,1]
3941 min/max/cond_expr: [init_val,init_val,...,init_val]
3942 and no adjustments are needed.
3944 For example, for the following code:
3946 s = init_val;
3947 for (i=0;i<n;i++)
3948 s = s + a[i];
3950 STMT is 's = s + a[i]', and the reduction variable is 's'.
3951 For a vector of 4 units, we want to return either [0,0,0,init_val],
3952 or [0,0,0,0] and let the caller know that it needs to adjust
3953 the result at the end by 'init_val'.
3955 FORNOW, we use the 'adjust in epilog' scheme (Option1) when ADJUSTMENT_DEF
3956 is not NULL, because its initialization vector is simpler (same element
3957 in all entries), and Option2 otherwise.
3959 A cost model should help decide between these two schemes. */
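/* Similarly for a product: with MULT_EXPR and init_val = 5, Option1
   returns [1,1,1,1] and reports an adjustment of 5 (the epilog then
   multiplies the final product by init_val), while Option2 returns
   [5,1,1,1] and needs no adjustment.  */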
3961 tree
3962 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3963 tree *adjustment_def)
3965 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3966 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3967 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3968 tree scalar_type = TREE_TYPE (init_val);
3969 tree vectype = get_vectype_for_scalar_type (scalar_type);
3970 int nunits;
3971 enum tree_code code = gimple_assign_rhs_code (stmt);
3972 tree def_for_init;
3973 tree init_def;
3974 int i;
3975 bool nested_in_vect_loop = false;
3976 REAL_VALUE_TYPE real_init_val = dconst0;
3977 int int_init_val = 0;
3978 gimple *def_stmt = NULL;
3979 gimple_seq stmts = NULL;
3981 gcc_assert (vectype);
3982 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3984 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3985 || SCALAR_FLOAT_TYPE_P (scalar_type));
3987 if (nested_in_vect_loop_p (loop, stmt))
3988 nested_in_vect_loop = true;
3989 else
3990 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3992 /* In case of double reduction we only create a vector variable to be put
3993 in the reduction phi node. The actual statement creation is done in
3994 vect_create_epilog_for_reduction. */
3995 if (adjustment_def && nested_in_vect_loop
3996 && TREE_CODE (init_val) == SSA_NAME
3997 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3998 && gimple_code (def_stmt) == GIMPLE_PHI
3999 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4000 && vinfo_for_stmt (def_stmt)
4001 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4002 == vect_double_reduction_def)
4004 *adjustment_def = NULL;
4005 return vect_create_destination_var (init_val, vectype);
4008 /* In case of a nested reduction do not use an adjustment def, as that
4009 case is not handled correctly by the epilogue generation if ncopies
4010 is not one. */
4011 if (adjustment_def && nested_in_vect_loop)
4013 *adjustment_def = NULL;
4014 return vect_get_vec_def_for_operand (init_val, stmt);
4017 switch (code)
4019 case WIDEN_SUM_EXPR:
4020 case DOT_PROD_EXPR:
4021 case SAD_EXPR:
4022 case PLUS_EXPR:
4023 case MINUS_EXPR:
4024 case BIT_IOR_EXPR:
4025 case BIT_XOR_EXPR:
4026 case MULT_EXPR:
4027 case BIT_AND_EXPR:
4029 /* ADJUSTMENT_DEF is NULL when called from
4030 vect_create_epilog_for_reduction to vectorize double reduction. */
4031 if (adjustment_def)
4032 *adjustment_def = init_val;
4034 if (code == MULT_EXPR)
4036 real_init_val = dconst1;
4037 int_init_val = 1;
4040 if (code == BIT_AND_EXPR)
4041 int_init_val = -1;
4043 if (SCALAR_FLOAT_TYPE_P (scalar_type))
4044 def_for_init = build_real (scalar_type, real_init_val);
4045 else
4046 def_for_init = build_int_cst (scalar_type, int_init_val);
4048 if (adjustment_def)
4049 /* Option1: the first element is '0' or '1' as well. */
4050 init_def = gimple_build_vector_from_val (&stmts, vectype,
4051 def_for_init);
4052 else
4054 /* Option2: the first element is INIT_VAL. */
4055 auto_vec<tree, 32> elts (nunits);
4056 elts.quick_push (init_val);
4057 for (i = 1; i < nunits; ++i)
4058 elts.quick_push (def_for_init);
4059 init_def = gimple_build_vector (&stmts, vectype, elts);
4062 break;
4064 case MIN_EXPR:
4065 case MAX_EXPR:
4066 case COND_EXPR:
4068 if (adjustment_def)
4070 *adjustment_def = NULL_TREE;
4071 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4073 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4074 break;
4077 init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4078 init_def = gimple_build_vector_from_val (&stmts, vectype, init_val);
4080 break;
4082 default:
4083 gcc_unreachable ();
4086 if (stmts)
4087 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4088 return init_def;
4091 /* Get at the initial defs for the reduction PHIs in SLP_NODE.
4092 NUMBER_OF_VECTORS is the number of vector defs to create. */
4094 static void
4095 get_initial_defs_for_reduction (slp_tree slp_node,
4096 vec<tree> *vec_oprnds,
4097 unsigned int number_of_vectors,
4098 enum tree_code code, bool reduc_chain)
4100 vec<gimple *> stmts = SLP_TREE_SCALAR_STMTS (slp_node);
4101 gimple *stmt = stmts[0];
4102 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4103 unsigned nunits;
4104 unsigned j, number_of_places_left_in_vector;
4105 tree vector_type, scalar_type;
4106 tree vop;
4107 int group_size = stmts.length ();
4108 unsigned int vec_num, i;
4109 unsigned number_of_copies = 1;
4110 vec<tree> voprnds;
4111 voprnds.create (number_of_vectors);
4112 tree neutral_op = NULL;
4113 struct loop *loop;
4115 vector_type = STMT_VINFO_VECTYPE (stmt_vinfo);
4116 scalar_type = TREE_TYPE (vector_type);
4117 nunits = TYPE_VECTOR_SUBPARTS (vector_type);
4119 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_reduction_def);
4121 loop = (gimple_bb (stmt))->loop_father;
4122 gcc_assert (loop);
4123 edge pe = loop_preheader_edge (loop);
4125 /* op is the reduction operand of the first stmt already. */
4126 /* For additional copies (see the explanation of NUMBER_OF_COPIES below)
4127 we need either neutral operands or the original operands. See
4128 get_initial_def_for_reduction() for details. */
4129 switch (code)
4131 case WIDEN_SUM_EXPR:
4132 case DOT_PROD_EXPR:
4133 case SAD_EXPR:
4134 case PLUS_EXPR:
4135 case MINUS_EXPR:
4136 case BIT_IOR_EXPR:
4137 case BIT_XOR_EXPR:
4138 neutral_op = build_zero_cst (scalar_type);
4139 break;
4141 case MULT_EXPR:
4142 neutral_op = build_one_cst (scalar_type);
4143 break;
4145 case BIT_AND_EXPR:
4146 neutral_op = build_all_ones_cst (scalar_type);
4147 break;
4149 /* For MIN/MAX we don't have an easy neutral operand, but
4150 the initial values can be used fine here. Only for
4151 a reduction chain do we have to force a neutral element. */
4152 case MAX_EXPR:
4153 case MIN_EXPR:
4154 if (! reduc_chain)
4155 neutral_op = NULL;
4156 else
4157 neutral_op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4158 break;
4160 default:
4161 gcc_assert (! reduc_chain);
4162 neutral_op = NULL;
4165 /* NUMBER_OF_COPIES is the number of times we need to use the same values in
4166 created vectors. It is greater than 1 if unrolling is performed.
4168 For example, we have two scalar operands, s1 and s2 (e.g., group of
4169 strided accesses of size two), while NUNITS is four (i.e., four scalars
4170 of this type can be packed in a vector). The output vector will contain
4171 two copies of each scalar operand: {s1, s2, s1, s2}. (NUMBER_OF_COPIES
4172 will be 2).
4174 If GROUP_SIZE > NUNITS, the scalars will be split into several vectors
4175 containing the operands.
4177 For example, NUNITS is four as before, and the group size is 8
4178 (s1, s2, ..., s8). We will create two vectors {s1, s2, s3, s4} and
4179 {s5, s6, s7, s8}. */
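/* In the first example above NUMBER_OF_COPIES is 4 * 1 / 2 = 2, and in
   the second it is 4 * 2 / 8 = 1, matching the formula below.  */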
4181 number_of_copies = nunits * number_of_vectors / group_size;
4183 number_of_places_left_in_vector = nunits;
4184 auto_vec<tree, 32> elts (nunits);
4185 elts.quick_grow (nunits);
4186 for (j = 0; j < number_of_copies; j++)
4188 for (i = group_size - 1; stmts.iterate (i, &stmt); i--)
4190 tree op;
4191 /* Get the def before the loop. In reduction chain we have only
4192 one initial value. */
4193 if ((j != (number_of_copies - 1)
4194 || (reduc_chain && i != 0))
4195 && neutral_op)
4196 op = neutral_op;
4197 else
4198 op = PHI_ARG_DEF_FROM_EDGE (stmt, pe);
4200 /* Create 'vect_ = {op0,op1,...,opn}'. */
4201 number_of_places_left_in_vector--;
4202 elts[number_of_places_left_in_vector] = op;
4204 if (number_of_places_left_in_vector == 0)
4206 gimple_seq ctor_seq = NULL;
4207 tree init = gimple_build_vector (&ctor_seq, vector_type, elts);
4208 if (ctor_seq != NULL)
4209 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4210 voprnds.quick_push (init);
4212 number_of_places_left_in_vector = nunits;
4217 /* Since the vectors are created in the reverse order, we should invert
4218 them. */
4219 vec_num = voprnds.length ();
4220 for (j = vec_num; j != 0; j--)
4222 vop = voprnds[j - 1];
4223 vec_oprnds->quick_push (vop);
4226 voprnds.release ();
4228 /* In case that VF is greater than the unrolling factor needed for the SLP
4229 group of stmts, NUMBER_OF_VECTORS to be created is greater than
4230 NUMBER_OF_SCALARS/NUNITS or NUNITS/NUMBER_OF_SCALARS, and hence we have
4231 to replicate the vectors. */
4232 tree neutral_vec = NULL;
4233 while (number_of_vectors > vec_oprnds->length ())
4235 if (neutral_op)
4237 if (!neutral_vec)
4239 gimple_seq ctor_seq = NULL;
4240 neutral_vec = gimple_build_vector_from_val
4241 (&ctor_seq, vector_type, neutral_op);
4242 if (ctor_seq != NULL)
4243 gsi_insert_seq_on_edge_immediate (pe, ctor_seq);
4245 vec_oprnds->quick_push (neutral_vec);
4247 else
4249 for (i = 0; vec_oprnds->iterate (i, &vop) && i < vec_num; i++)
4250 vec_oprnds->quick_push (vop);
4256 /* Function vect_create_epilog_for_reduction
4258 Create code at the loop-epilog to finalize the result of a reduction
4259 computation.
4261 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4262 reduction statements.
4263 STMT is the scalar reduction stmt that is being vectorized.
4264 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4265 number of elements that we can fit in a vectype (nunits). In this case
4266 we have to generate more than one vector stmt - i.e - we need to "unroll"
4267 the vector stmt by a factor VF/nunits. For more details see documentation
4268 in vectorizable_operation.
4269 REDUC_CODE is the tree-code for the epilog reduction.
4270 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4271 computation.
4272 REDUC_INDEX is the index of the operand in the right hand side of the
4273 statement that is defined by REDUCTION_PHI.
4274 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4275 SLP_NODE is an SLP node containing a group of reduction statements. The
4276 first one in this group is STMT.
4278 This function:
4279 1. Creates the reduction def-use cycles: sets the arguments for
4280 REDUCTION_PHIS:
4281 The loop-entry argument is the vectorized initial-value of the reduction.
4282 The loop-latch argument is taken from VECT_DEFS - the vector of partial
4283 sums.
4284 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4285 by applying the operation specified by REDUC_CODE if available, or by
4286 other means (whole-vector shifts or a scalar loop).
4287 The function also creates a new phi node at the loop exit to preserve
4288 loop-closed form, as illustrated below.
4290 The flow at the entry to this function:
4292 loop:
4293 vec_def = phi <null, null> # REDUCTION_PHI
4294 VECT_DEF = vector_stmt # vectorized form of STMT
4295 s_loop = scalar_stmt # (scalar) STMT
4296 loop_exit:
4297 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4298 use <s_out0>
4299 use <s_out0>
4301 The above is transformed by this function into:
4303 loop:
4304 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4305 VECT_DEF = vector_stmt # vectorized form of STMT
4306 s_loop = scalar_stmt # (scalar) STMT
4307 loop_exit:
4308 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4309 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4310 v_out2 = reduce <v_out1>
4311 s_out3 = extract_field <v_out2, 0>
4312 s_out4 = adjust_result <s_out3>
4313 use <s_out4>
4314 use <s_out4>
4317 static void
4318 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4319 gimple *reduc_def_stmt,
4320 int ncopies, enum tree_code reduc_code,
4321 vec<gimple *> reduction_phis,
4322 bool double_reduc,
4323 slp_tree slp_node,
4324 slp_instance slp_node_instance)
4326 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4327 stmt_vec_info prev_phi_info;
4328 tree vectype;
4329 machine_mode mode;
4330 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4331 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4332 basic_block exit_bb;
4333 tree scalar_dest;
4334 tree scalar_type;
4335 gimple *new_phi = NULL, *phi;
4336 gimple_stmt_iterator exit_gsi;
4337 tree vec_dest;
4338 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4339 gimple *epilog_stmt = NULL;
4340 enum tree_code code = gimple_assign_rhs_code (stmt);
4341 gimple *exit_phi;
4342 tree bitsize;
4343 tree adjustment_def = NULL;
4344 tree vec_initial_def = NULL;
4345 tree expr, def, initial_def = NULL;
4346 tree orig_name, scalar_result;
4347 imm_use_iterator imm_iter, phi_imm_iter;
4348 use_operand_p use_p, phi_use_p;
4349 gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4350 bool nested_in_vect_loop = false;
4351 auto_vec<gimple *> new_phis;
4352 auto_vec<gimple *> inner_phis;
4353 enum vect_def_type dt = vect_unknown_def_type;
4354 int j, i;
4355 auto_vec<tree> scalar_results;
4356 unsigned int group_size = 1, k, ratio;
4357 auto_vec<tree> vec_initial_defs;
4358 auto_vec<gimple *> phis;
4359 bool slp_reduc = false;
4360 tree new_phi_result;
4361 gimple *inner_phi = NULL;
4362 tree induction_index = NULL_TREE;
4364 if (slp_node)
4365 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4367 if (nested_in_vect_loop_p (loop, stmt))
4369 outer_loop = loop;
4370 loop = loop->inner;
4371 nested_in_vect_loop = true;
4372 gcc_assert (!slp_node);
4375 vectype = STMT_VINFO_VECTYPE (stmt_info);
4376 gcc_assert (vectype);
4377 mode = TYPE_MODE (vectype);
4379 /* 1. Create the reduction def-use cycle:
4380 Set the arguments of REDUCTION_PHIS, i.e., transform
4382 loop:
4383 vec_def = phi <null, null> # REDUCTION_PHI
4384 VECT_DEF = vector_stmt # vectorized form of STMT
4387 into:
4389 loop:
4390 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
4391 VECT_DEF = vector_stmt # vectorized form of STMT
4394 (in case of SLP, do it for all the phis). */
4396 /* Get the loop-entry arguments. */
4397 enum vect_def_type initial_def_dt = vect_unknown_def_type;
4398 if (slp_node)
4400 unsigned vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
4401 vec_initial_defs.reserve (vec_num);
4402 get_initial_defs_for_reduction (slp_node_instance->reduc_phis,
4403 &vec_initial_defs, vec_num, code,
4404 GROUP_FIRST_ELEMENT (stmt_info));
4406 else
4408 /* Get at the scalar def before the loop, that defines the initial value
4409 of the reduction variable. */
4410 gimple *def_stmt;
4411 initial_def = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
4412 loop_preheader_edge (loop));
4413 vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4414 vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4415 &adjustment_def);
4416 vec_initial_defs.create (1);
4417 vec_initial_defs.quick_push (vec_initial_def);
4420 /* Set phi nodes arguments. */
4421 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4423 tree vec_init_def = vec_initial_defs[i];
4424 tree def = vect_defs[i];
4425 for (j = 0; j < ncopies; j++)
4427 if (j != 0)
4429 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4430 if (nested_in_vect_loop)
4431 vec_init_def
4432 = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4433 vec_init_def);
4436 /* Set the loop-entry arg of the reduction-phi. */
4438 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4439 == INTEGER_INDUC_COND_REDUCTION)
4441 /* Initialise the reduction phi to zero. This prevents non-zero
4442 initial values interfering with the reduction op. */
4443 gcc_assert (ncopies == 1);
4444 gcc_assert (i == 0);
4446 tree vec_init_def_type = TREE_TYPE (vec_init_def);
4447 tree zero_vec = build_zero_cst (vec_init_def_type);
4449 add_phi_arg (as_a <gphi *> (phi), zero_vec,
4450 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4452 else
4453 add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4454 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4456 /* Set the loop-latch arg for the reduction-phi. */
4457 if (j > 0)
4458 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4460 add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4461 UNKNOWN_LOCATION);
4463 if (dump_enabled_p ())
4465 dump_printf_loc (MSG_NOTE, vect_location,
4466 "transform reduction: created def-use cycle: ");
4467 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4468 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4473 /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
4474 which is updated with the current index of the loop for every match of
4475 the original loop's cond_expr (VEC_STMT). This results in a vector
4476 containing the last time the condition passed for that vector lane.
4477 The first match will be a 1 to allow 0 to be used for non-matching
4478 indexes. If there are no matches at all then the vector will be all
4479 zeroes. */
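/* For example, with four lanes and two vector iterations the index
   vector created below takes the values {1,2,3,4} and then {5,6,7,8};
   a lane whose condition matched only in the first vector iteration
   keeps a value from {1,2,3,4}, one that matched again in the second
   keeps a value from {5,6,7,8}, and a lane that never matched stays
   at 0.  */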
4480 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4482 tree indx_before_incr, indx_after_incr;
4483 int nunits_out = TYPE_VECTOR_SUBPARTS (vectype);
4484 int k;
4486 gimple *vec_stmt = STMT_VINFO_VEC_STMT (stmt_info);
4487 gcc_assert (gimple_assign_rhs_code (vec_stmt) == VEC_COND_EXPR);
4489 int scalar_precision
4490 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (TREE_TYPE (vectype)));
4491 tree cr_index_scalar_type = make_unsigned_type (scalar_precision);
4492 tree cr_index_vector_type = build_vector_type
4493 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype));
4495 /* First we create a simple vector induction variable which starts
4496 with the values {1,2,3,...} (SERIES_VECT) and increments by the
4497 vector size (STEP). */
4499 /* Create a {1,2,3,...} vector. */
4500 auto_vec<tree, 32> vtemp (nunits_out);
4501 for (k = 0; k < nunits_out; ++k)
4502 vtemp.quick_push (build_int_cst (cr_index_scalar_type, k + 1));
4503 tree series_vect = build_vector (cr_index_vector_type, vtemp);
4505 /* Create a vector of the step value. */
4506 tree step = build_int_cst (cr_index_scalar_type, nunits_out);
4507 tree vec_step = build_vector_from_val (cr_index_vector_type, step);
4509 /* Create an induction variable. */
4510 gimple_stmt_iterator incr_gsi;
4511 bool insert_after;
4512 standard_iv_increment_position (loop, &incr_gsi, &insert_after);
4513 create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
4514 insert_after, &indx_before_incr, &indx_after_incr);
4516 /* Next create a new phi node vector (NEW_PHI_TREE) which starts
4517 filled with zeros (VEC_ZERO). */
4519 /* Create a vector of 0s. */
4520 tree zero = build_zero_cst (cr_index_scalar_type);
4521 tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
4523 /* Create a vector phi node. */
4524 tree new_phi_tree = make_ssa_name (cr_index_vector_type);
4525 new_phi = create_phi_node (new_phi_tree, loop->header);
4526 set_vinfo_for_stmt (new_phi,
4527 new_stmt_vec_info (new_phi, loop_vinfo));
4528 add_phi_arg (as_a <gphi *> (new_phi), vec_zero,
4529 loop_preheader_edge (loop), UNKNOWN_LOCATION);
4531 /* Now take the condition from the loop's original cond_expr
4532 (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
4533 every match uses values from the induction variable
4534 (INDEX_BEFORE_INCR) otherwise uses values from the phi node
4535 (NEW_PHI_TREE).
4536 Finally, we update the phi (NEW_PHI_TREE) to take the value of
4537 the new cond_expr (INDEX_COND_EXPR). */
4539 /* Duplicate the condition from vec_stmt. */
4540 tree ccompare = unshare_expr (gimple_assign_rhs1 (vec_stmt));
4542 /* Create a conditional, where the condition is taken from vec_stmt
4543 (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
4544 else is the phi (NEW_PHI_TREE). */
4545 tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
4546 ccompare, indx_before_incr,
4547 new_phi_tree);
4548 induction_index = make_ssa_name (cr_index_vector_type);
4549 gimple *index_condition = gimple_build_assign (induction_index,
4550 index_cond_expr);
4551 gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
4552 stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
4553 loop_vinfo);
4554 STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
4555 set_vinfo_for_stmt (index_condition, index_vec_info);
4557 /* Update the phi with the vec cond. */
4558 add_phi_arg (as_a <gphi *> (new_phi), induction_index,
4559 loop_latch_edge (loop), UNKNOWN_LOCATION);
4562 /* 2. Create epilog code.
4563 The reduction epilog code operates across the elements of the vector
4564 of partial results computed by the vectorized loop.
4565 The reduction epilog code consists of:
4567 step 1: compute the scalar result in a vector (v_out2)
4568 step 2: extract the scalar result (s_out3) from the vector (v_out2)
4569 step 3: adjust the scalar result (s_out3) if needed.
4571 Step 1 can be accomplished using one of the following three schemes:
4572 (scheme 1) using reduc_code, if available.
4573 (scheme 2) using whole-vector shifts, if available.
4574 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4575 combined.
4577 The overall epilog code looks like this:
4579 s_out0 = phi <s_loop> # original EXIT_PHI
4580 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4581 v_out2 = reduce <v_out1> # step 1
4582 s_out3 = extract_field <v_out2, 0> # step 2
4583 s_out4 = adjust_result <s_out3> # step 3
4585 (step 3 is optional, and steps 1 and 2 may be combined).
4586 Lastly, the uses of s_out0 are replaced by s_out4. */
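/* In the code below, scheme 1 is used when REDUC_CODE is available and
   this is not an SLP reduction; otherwise scheme 2 is used if the target
   has a usable whole-vector shift (again only for non-SLP reductions);
   scheme 3 handles the remaining cases, including SLP, where all
   elements are extracted individually anyway.  */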
4589 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4590 v_out1 = phi <VECT_DEF>
4591 Store them in NEW_PHIS. */
4593 exit_bb = single_exit (loop)->dest;
4594 prev_phi_info = NULL;
4595 new_phis.create (vect_defs.length ());
4596 FOR_EACH_VEC_ELT (vect_defs, i, def)
4598 for (j = 0; j < ncopies; j++)
4600 tree new_def = copy_ssa_name (def);
4601 phi = create_phi_node (new_def, exit_bb);
4602 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4603 if (j == 0)
4604 new_phis.quick_push (phi);
4605 else
4607 def = vect_get_vec_def_for_stmt_copy (dt, def);
4608 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4611 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4612 prev_phi_info = vinfo_for_stmt (phi);
4616 /* The epilogue is created for the outer-loop, i.e., for the loop being
4617 vectorized. Create exit phis for the outer loop. */
4618 if (double_reduc)
4620 loop = outer_loop;
4621 exit_bb = single_exit (loop)->dest;
4622 inner_phis.create (vect_defs.length ());
4623 FOR_EACH_VEC_ELT (new_phis, i, phi)
4625 tree new_result = copy_ssa_name (PHI_RESULT (phi));
4626 gphi *outer_phi = create_phi_node (new_result, exit_bb);
4627 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4628 PHI_RESULT (phi));
4629 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4630 loop_vinfo));
4631 inner_phis.quick_push (phi);
4632 new_phis[i] = outer_phi;
4633 prev_phi_info = vinfo_for_stmt (outer_phi);
4634 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4636 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4637 new_result = copy_ssa_name (PHI_RESULT (phi));
4638 outer_phi = create_phi_node (new_result, exit_bb);
4639 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4640 PHI_RESULT (phi));
4641 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4642 loop_vinfo));
4643 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4644 prev_phi_info = vinfo_for_stmt (outer_phi);
4649 exit_gsi = gsi_after_labels (exit_bb);
4651 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4652 (i.e. when reduc_code is not available) and in the final adjustment
4653 code (if needed). Also get the original scalar reduction variable as
4654 defined in the loop. In case STMT is a "pattern-stmt" (i.e. - it
4655 represents a reduction pattern), the tree-code and scalar-def are
4656 taken from the original stmt that the pattern-stmt (STMT) replaces.
4657 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4658 are taken from STMT. */
4660 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4661 if (!orig_stmt)
4663 /* Regular reduction */
4664 orig_stmt = stmt;
4666 else
4668 /* Reduction pattern */
4669 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4670 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4671 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4674 code = gimple_assign_rhs_code (orig_stmt);
4675 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4676 partial results are added and not subtracted. */
4677 if (code == MINUS_EXPR)
4678 code = PLUS_EXPR;
4680 scalar_dest = gimple_assign_lhs (orig_stmt);
4681 scalar_type = TREE_TYPE (scalar_dest);
4682 scalar_results.create (group_size);
4683 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4684 bitsize = TYPE_SIZE (scalar_type);
4686 /* In case this is a reduction in an inner-loop while vectorizing an outer
4687 loop - we don't need to extract a single scalar result at the end of the
4688 inner-loop (unless it is double reduction, i.e., the use of reduction is
4689 outside the outer-loop). The final vector of partial results will be used
4690 in the vectorized outer-loop, or reduced to a scalar result at the end of
4691 the outer-loop. */
4692 if (nested_in_vect_loop && !double_reduc)
4693 goto vect_finalize_reduction;
4695 /* SLP reduction without reduction chain, e.g.,
4696 # a1 = phi <a2, a0>
4697 # b1 = phi <b2, b0>
4698 a2 = operation (a1)
4699 b2 = operation (b1) */
4700 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4702 /* In case of reduction chain, e.g.,
4703 # a1 = phi <a3, a0>
4704 a2 = operation (a1)
4705 a3 = operation (a2),
4707 we may end up with more than one vector result. Here we reduce them to
4708 one vector. */
4709 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4711 tree first_vect = PHI_RESULT (new_phis[0]);
4712 gassign *new_vec_stmt = NULL;
4713 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4714 for (k = 1; k < new_phis.length (); k++)
4716 gimple *next_phi = new_phis[k];
4717 tree second_vect = PHI_RESULT (next_phi);
4718 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4719 new_vec_stmt = gimple_build_assign (tem, code,
4720 first_vect, second_vect);
4721 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4722 first_vect = tem;
4725 new_phi_result = first_vect;
4726 if (new_vec_stmt)
4728 new_phis.truncate (0);
4729 new_phis.safe_push (new_vec_stmt);
4732 /* Likewise if we couldn't use a single def-use cycle. */
4733 else if (ncopies > 1)
4735 gcc_assert (new_phis.length () == 1);
4736 tree first_vect = PHI_RESULT (new_phis[0]);
4737 gassign *new_vec_stmt = NULL;
4738 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4739 gimple *next_phi = new_phis[0];
4740 for (int k = 1; k < ncopies; ++k)
4742 next_phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (next_phi));
4743 tree second_vect = PHI_RESULT (next_phi);
4744 tree tem = make_ssa_name (vec_dest, new_vec_stmt);
4745 new_vec_stmt = gimple_build_assign (tem, code,
4746 first_vect, second_vect);
4747 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4748 first_vect = tem;
4750 new_phi_result = first_vect;
4751 new_phis.truncate (0);
4752 new_phis.safe_push (new_vec_stmt);
4754 else
4755 new_phi_result = PHI_RESULT (new_phis[0]);
4757 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4758 && reduc_code != ERROR_MARK)
4760 /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4761 various data values where the condition matched and another vector
4762 (INDUCTION_INDEX) containing all the indexes of those matches. We
4763 need to extract the last matching index (which will be the index with
4764 highest value) and use this to index into the data vector.
4765 For the case where there were no matches, the data vector will contain
4766 all default values and the index vector will be all zeros. */
4768 /* Get various versions of the type of the vector of indexes. */
4769 tree index_vec_type = TREE_TYPE (induction_index);
4770 gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4771 tree index_scalar_type = TREE_TYPE (index_vec_type);
4772 tree index_vec_cmp_type = build_same_sized_truth_vector_type
4773 (index_vec_type);
4775 /* Get an unsigned integer version of the type of the data vector. */
4776 int scalar_precision
4777 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
4778 tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4779 tree vectype_unsigned = build_vector_type
4780 (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4782 /* First we need to create a vector (ZERO_VEC) of zeros and another
4783 vector (MAX_INDEX_VEC) filled with the last matching index, which we
4784 can create using a MAX reduction and then expanding.
4785 In the case where the loop never made any matches, the max index will
4786 be zero. */
4788 /* Vector of {0, 0, 0,...}. */
4789 tree zero_vec = make_ssa_name (vectype);
4790 tree zero_vec_rhs = build_zero_cst (vectype);
4791 gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4792 gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4794 /* Find maximum value from the vector of found indexes. */
4795 tree max_index = make_ssa_name (index_scalar_type);
4796 gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4797 induction_index);
4798 gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4800 /* Vector of {max_index, max_index, max_index,...}. */
4801 tree max_index_vec = make_ssa_name (index_vec_type);
4802 tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4803 max_index);
4804 gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4805 max_index_vec_rhs);
4806 gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4808 /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4809 with the vector (INDUCTION_INDEX) of found indexes, choosing values
4810 from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4811 otherwise. Only one value should match, resulting in a vector
4812 (VEC_COND) with one data value and the rest zeros.
4813 In the case where the loop never made any matches, every index will
4814 match, resulting in a vector with all data values (which will all be
4815 the default value). */
4817 /* Compare the max index vector to the vector of found indexes to find
4818 the position of the max value. */
4819 tree vec_compare = make_ssa_name (index_vec_cmp_type);
4820 gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4821 induction_index,
4822 max_index_vec);
4823 gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4825 /* Use the compare to choose either values from the data vector or
4826 zero. */
4827 tree vec_cond = make_ssa_name (vectype);
4828 gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4829 vec_compare, new_phi_result,
4830 zero_vec);
4831 gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4833 /* Finally we need to extract the data value from the vector (VEC_COND)
4834 into a scalar (MATCHED_DATA_REDUC). Logically we want to do an OR
4835 reduction, but because this doesn't exist, we can use a MAX reduction
4836 instead. The data value might be signed or a float so we need to cast
4837 it first.
4838 In the case where the loop never made any matches, the data values are
4839 all identical, and so will reduce down correctly. */
4841 /* Make the matched data values unsigned. */
4842 tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4843 tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4844 vec_cond);
4845 gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4846 VIEW_CONVERT_EXPR,
4847 vec_cond_cast_rhs);
4848 gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4850 /* Reduce down to a scalar value. */
4851 tree data_reduc = make_ssa_name (scalar_type_unsigned);
4852 optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4853 optab_default);
4854 gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4855 != CODE_FOR_nothing);
4856 gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4857 REDUC_MAX_EXPR,
4858 vec_cond_cast);
4859 gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4861 /* Convert the reduced value back to the result type and set as the
4862 result. */
4863 gimple_seq stmts = NULL;
4864 new_temp = gimple_build (&stmts, VIEW_CONVERT_EXPR, scalar_type,
4865 data_reduc);
4866 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4867 scalar_results.safe_push (new_temp);
4869 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
4870 && reduc_code == ERROR_MARK)
4872 /* Condition reduction without a supported REDUC_MAX_EXPR. Generate
4873 idx = 0;
4874 idx_val = induction_index[0];
4875 val = data_reduc[0];
4876 for (idx = 0, val = init, i = 0; i < nelts; ++i)
4877 if (induction_index[i] > idx_val)
4878 val = data_reduc[i], idx_val = induction_index[i];
4879 return val; */
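/* For instance, with INDUCTION_INDEX = {0, 6, 2, 0} and NEW_PHI_RESULT
   holding {d0, d1, d2, d3}, the unrolled compare-and-select sequence
   below yields d1, the data value from the lane with the highest
   index, i.e. the lane whose condition matched last.  */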
4881 tree data_eltype = TREE_TYPE (TREE_TYPE (new_phi_result));
4882 tree idx_eltype = TREE_TYPE (TREE_TYPE (induction_index));
4883 unsigned HOST_WIDE_INT el_size = tree_to_uhwi (TYPE_SIZE (idx_eltype));
4884 unsigned HOST_WIDE_INT v_size
4885 = el_size * TYPE_VECTOR_SUBPARTS (TREE_TYPE (induction_index));
4886 tree idx_val = NULL_TREE, val = NULL_TREE;
4887 for (unsigned HOST_WIDE_INT off = 0; off < v_size; off += el_size)
4889 tree old_idx_val = idx_val;
4890 tree old_val = val;
4891 idx_val = make_ssa_name (idx_eltype);
4892 epilog_stmt = gimple_build_assign (idx_val, BIT_FIELD_REF,
4893 build3 (BIT_FIELD_REF, idx_eltype,
4894 induction_index,
4895 bitsize_int (el_size),
4896 bitsize_int (off)));
4897 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4898 val = make_ssa_name (data_eltype);
4899 epilog_stmt = gimple_build_assign (val, BIT_FIELD_REF,
4900 build3 (BIT_FIELD_REF,
4901 data_eltype,
4902 new_phi_result,
4903 bitsize_int (el_size),
4904 bitsize_int (off)));
4905 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4906 if (off != 0)
4908 tree new_idx_val = idx_val;
4909 tree new_val = val;
4910 if (off != v_size - el_size)
4912 new_idx_val = make_ssa_name (idx_eltype);
4913 epilog_stmt = gimple_build_assign (new_idx_val,
4914 MAX_EXPR, idx_val,
4915 old_idx_val);
4916 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4918 new_val = make_ssa_name (data_eltype);
4919 epilog_stmt = gimple_build_assign (new_val,
4920 COND_EXPR,
4921 build2 (GT_EXPR,
4922 boolean_type_node,
4923 idx_val,
4924 old_idx_val),
4925 val, old_val);
4926 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4927 idx_val = new_idx_val;
4928 val = new_val;
4931 /* Convert the reduced value back to the result type and set as the
4932 result. */
4933 gimple_seq stmts = NULL;
4934 val = gimple_convert (&stmts, scalar_type, val);
4935 gsi_insert_seq_before (&exit_gsi, stmts, GSI_SAME_STMT);
4936 scalar_results.safe_push (val);
4939 /* 2.3 Create the reduction code, using one of the three schemes described
4940 above. In SLP we simply need to extract all the elements from the
4941 vector (without reducing them), so we use scalar shifts. */
4942 else if (reduc_code != ERROR_MARK && !slp_reduc)
4944 tree tmp;
4945 tree vec_elem_type;
4947 /* Case 1: Create:
4948 v_out2 = reduc_expr <v_out1> */
4950 if (dump_enabled_p ())
4951 dump_printf_loc (MSG_NOTE, vect_location,
4952 "Reduce using direct vector reduction.\n");
4954 vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4955 if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4957 tree tmp_dest =
4958 vect_create_destination_var (scalar_dest, vec_elem_type);
4959 tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4960 epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4961 new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4962 gimple_assign_set_lhs (epilog_stmt, new_temp);
4963 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4965 tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4967 else
4968 tmp = build1 (reduc_code, scalar_type, new_phi_result);
4970 epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4971 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4972 gimple_assign_set_lhs (epilog_stmt, new_temp);
4973 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4975 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4976 == INTEGER_INDUC_COND_REDUCTION)
4978 /* Earlier we set the initial value to be zero. Check the result
4979 and if it is zero then replace with the original initial
4980 value. */
4981 tree zero = build_zero_cst (scalar_type);
4982 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4984 tmp = make_ssa_name (new_scalar_dest);
4985 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4986 initial_def, new_temp);
4987 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4988 new_temp = tmp;
4991 scalar_results.safe_push (new_temp);
4993 else
4995 bool reduce_with_shift = have_whole_vector_shift (mode);
4996 int element_bitsize = tree_to_uhwi (bitsize);
4997 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4998 tree vec_temp;
5000 /* COND reductions all do the final reduction with MAX_EXPR. */
5001 if (code == COND_EXPR)
5002 code = MAX_EXPR;
5004 /* Regardless of whether we have a whole vector shift, if we're
5005 emulating the operation via tree-vect-generic, we don't want
5006 to use it. Only the first round of the reduction is likely
5007 to still be profitable via emulation. */
5008 /* ??? It might be better to emit a reduction tree code here, so that
5009 tree-vect-generic can expand the first round via bit tricks. */
5010 if (!VECTOR_MODE_P (mode))
5011 reduce_with_shift = false;
5012 else
5014 optab optab = optab_for_tree_code (code, vectype, optab_default);
5015 if (optab_handler (optab, mode) == CODE_FOR_nothing)
5016 reduce_with_shift = false;
5019 if (reduce_with_shift && !slp_reduc)
5021 int nelements = vec_size_in_bits / element_bitsize;
5022 auto_vec_perm_indices sel (nelements);
5024 int elt_offset;
5026 tree zero_vec = build_zero_cst (vectype);
5027 /* Case 2: Create:
5028 for (offset = nelements/2; offset >= 1; offset/=2)
5030 Create: va' = vec_shift <va, offset>
5031 Create: va = vop <va, va'>
5032 } */
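/* For instance, reducing {a,b,c,d} with a PLUS reduction: shifting by
   two gives {c,d,0,0} and adding yields {a+c, b+d, ...}; shifting that
   by one and adding again leaves the full sum a+b+c+d in the element
   that is extracted just below.  */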
5034 tree rhs;
5036 if (dump_enabled_p ())
5037 dump_printf_loc (MSG_NOTE, vect_location,
5038 "Reduce using vector shifts\n");
5040 vec_dest = vect_create_destination_var (scalar_dest, vectype);
5041 new_temp = new_phi_result;
5042 for (elt_offset = nelements / 2;
5043 elt_offset >= 1;
5044 elt_offset /= 2)
5046 sel.truncate (0);
5047 calc_vec_perm_mask_for_shift (elt_offset, nelements, &sel);
5048 tree mask = vect_gen_perm_mask_any (vectype, sel);
5049 epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
5050 new_temp, zero_vec, mask);
5051 new_name = make_ssa_name (vec_dest, epilog_stmt);
5052 gimple_assign_set_lhs (epilog_stmt, new_name);
5053 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5055 epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
5056 new_temp);
5057 new_temp = make_ssa_name (vec_dest, epilog_stmt);
5058 gimple_assign_set_lhs (epilog_stmt, new_temp);
5059 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5062 /* 2.4 Extract the final scalar result. Create:
5063 s_out3 = extract_field <v_out2, bitpos> */
5065 if (dump_enabled_p ())
5066 dump_printf_loc (MSG_NOTE, vect_location,
5067 "extract scalar result\n");
5069 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
5070 bitsize, bitsize_zero_node);
5071 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5072 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5073 gimple_assign_set_lhs (epilog_stmt, new_temp);
5074 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5075 scalar_results.safe_push (new_temp);
5077 else
5079 /* Case 3: Create:
5080 s = extract_field <v_out2, 0>
5081 for (offset = element_size;
5082 offset < vector_size;
5083 offset += element_size;)
5085 Create: s' = extract_field <v_out2, offset>
5086 Create: s = op <s, s'> // For non SLP cases
5087 } */
5089 if (dump_enabled_p ())
5090 dump_printf_loc (MSG_NOTE, vect_location,
5091 "Reduce using scalar code.\n");
5093 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
5094 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
5096 int bit_offset;
5097 if (gimple_code (new_phi) == GIMPLE_PHI)
5098 vec_temp = PHI_RESULT (new_phi);
5099 else
5100 vec_temp = gimple_assign_lhs (new_phi);
5101 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
5102 bitsize_zero_node);
5103 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5104 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5105 gimple_assign_set_lhs (epilog_stmt, new_temp);
5106 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5108 /* In SLP we don't need to apply reduction operation, so we just
5109 collect s' values in SCALAR_RESULTS. */
5110 if (slp_reduc)
5111 scalar_results.safe_push (new_temp);
5113 for (bit_offset = element_bitsize;
5114 bit_offset < vec_size_in_bits;
5115 bit_offset += element_bitsize)
5117 tree bitpos = bitsize_int (bit_offset);
5118 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
5119 bitsize, bitpos);
5121 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
5122 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
5123 gimple_assign_set_lhs (epilog_stmt, new_name);
5124 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5126 if (slp_reduc)
5128 /* In SLP we don't need to apply reduction operation, so
5129 we just collect s' values in SCALAR_RESULTS. */
5130 new_temp = new_name;
5131 scalar_results.safe_push (new_name);
5133 else
5135 epilog_stmt = gimple_build_assign (new_scalar_dest, code,
5136 new_name, new_temp);
5137 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
5138 gimple_assign_set_lhs (epilog_stmt, new_temp);
5139 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5144 /* The only case where we need to reduce scalar results in SLP is
5145 unrolling. If the size of SCALAR_RESULTS is greater than
5146 GROUP_SIZE, we reduce them by combining elements modulo
5147 GROUP_SIZE. */
5148 if (slp_reduc)
5150 tree res, first_res, new_res;
5151 gimple *new_stmt;
5153 /* Reduce multiple scalar results in case of SLP unrolling. */
5154 for (j = group_size; scalar_results.iterate (j, &res);
5155 j++)
5157 first_res = scalar_results[j % group_size];
5158 new_stmt = gimple_build_assign (new_scalar_dest, code,
5159 first_res, res);
5160 new_res = make_ssa_name (new_scalar_dest, new_stmt);
5161 gimple_assign_set_lhs (new_stmt, new_res);
5162 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
5163 scalar_results[j % group_size] = new_res;
5166 else
5167 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
5168 scalar_results.safe_push (new_temp);
5171 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5172 == INTEGER_INDUC_COND_REDUCTION)
5174 /* Earlier we set the initial value to be zero. Check the result
5175 and if it is zero then replace with the original initial
5176 value. */
5177 tree zero = build_zero_cst (scalar_type);
5178 tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
5180 tree tmp = make_ssa_name (new_scalar_dest);
5181 epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
5182 initial_def, new_temp);
5183 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5184 scalar_results[0] = tmp;
5188 vect_finalize_reduction:
5190 if (double_reduc)
5191 loop = loop->inner;
5193 /* 2.5 Adjust the final result by the initial value of the reduction
5194 variable. (When such adjustment is not needed, then
5195 'adjustment_def' is zero). For example, if code is PLUS we create:
5196 new_temp = loop_exit_def + adjustment_def */
5198 if (adjustment_def)
5200 gcc_assert (!slp_reduc);
5201 if (nested_in_vect_loop)
5203 new_phi = new_phis[0];
5204 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5205 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5206 new_dest = vect_create_destination_var (scalar_dest, vectype);
5208 else
5210 new_temp = scalar_results[0];
5211 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5212 expr = build2 (code, scalar_type, new_temp, adjustment_def);
5213 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5216 epilog_stmt = gimple_build_assign (new_dest, expr);
5217 new_temp = make_ssa_name (new_dest, epilog_stmt);
5218 gimple_assign_set_lhs (epilog_stmt, new_temp);
5219 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5220 if (nested_in_vect_loop)
5222 set_vinfo_for_stmt (epilog_stmt,
5223 new_stmt_vec_info (epilog_stmt, loop_vinfo));
5224 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5225 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5227 if (!double_reduc)
5228 scalar_results.quick_push (new_temp);
5229 else
5230 scalar_results[0] = new_temp;
5232 else
5233 scalar_results[0] = new_temp;
5235 new_phis[0] = epilog_stmt;
5238 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
5239 phis with new adjusted scalar results, i.e., replace use <s_out0>
5240 with use <s_out4>.
5242 Transform:
5243 loop_exit:
5244 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5245 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5246 v_out2 = reduce <v_out1>
5247 s_out3 = extract_field <v_out2, 0>
5248 s_out4 = adjust_result <s_out3>
5249 use <s_out0>
5250 use <s_out0>
5252 into:
5254 loop_exit:
5255 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
5256 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
5257 v_out2 = reduce <v_out1>
5258 s_out3 = extract_field <v_out2, 0>
5259 s_out4 = adjust_result <s_out3>
5260 use <s_out4>
5261 use <s_out4> */
5264 /* In SLP reduction chain we reduce vector results into one vector if
5265 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
5266 the last stmt in the reduction chain, since we are looking for the loop
5267 exit phi node. */
5268 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5270 gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5271 /* Handle reduction patterns. */
5272 if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5273 dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5275 scalar_dest = gimple_assign_lhs (dest_stmt);
5276 group_size = 1;
5279 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5280 case that GROUP_SIZE is greater than vectorization factor). Therefore, we
5281 need to match SCALAR_RESULTS with corresponding statements. The first
5282 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5283 the first vector stmt, etc.
5284 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
5285 if (group_size > new_phis.length ())
5287 ratio = group_size / new_phis.length ();
5288 gcc_assert (!(group_size % new_phis.length ()));
5290 else
5291 ratio = 1;
5293 for (k = 0; k < group_size; k++)
5295 if (k % ratio == 0)
5297 epilog_stmt = new_phis[k / ratio];
5298 reduction_phi = reduction_phis[k / ratio];
5299 if (double_reduc)
5300 inner_phi = inner_phis[k / ratio];
5303 if (slp_reduc)
5305 gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5307 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5308 /* SLP statements can't participate in patterns. */
5309 gcc_assert (!orig_stmt);
5310 scalar_dest = gimple_assign_lhs (current_stmt);
5313 phis.create (3);
5314 /* Find the loop-closed-use at the loop exit of the original scalar
5315 result. (The reduction result is expected to have two immediate uses -
5316 one at the latch block, and one at the loop exit). */
5317 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5318 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5319 && !is_gimple_debug (USE_STMT (use_p)))
5320 phis.safe_push (USE_STMT (use_p));
5322 /* While we expect to have found an exit_phi because of loop-closed-ssa
5323 form we can end up without one if the scalar cycle is dead. */
5325 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5327 if (outer_loop)
5329 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5330 gphi *vect_phi;
5332 /* FORNOW. We do not currently support the case where an inner-loop
5333 reduction is used only outside the outer-loop (and not in the
5334 outer-loop itself), unless it is a double reduction. */
5335 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5336 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5337 || double_reduc);
5339 if (double_reduc)
5340 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5341 else
5342 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5343 if (!double_reduc
5344 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5345 != vect_double_reduction_def)
5346 continue;
5348 /* Handle double reduction:
5350 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
5351 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5352 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
5353 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
5355 At that point the regular reduction (stmt2 and stmt3) is
5356 already vectorized, as well as the exit phi node, stmt4.
5357 Here we vectorize the phi node of double reduction, stmt1, and
5358 update all relevant statements. */
5360 /* Go through all the uses of s2 to find double reduction phi
5361 node, i.e., stmt1 above. */
5362 orig_name = PHI_RESULT (exit_phi);
5363 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5365 stmt_vec_info use_stmt_vinfo;
5366 stmt_vec_info new_phi_vinfo;
5367 tree vect_phi_init, preheader_arg, vect_phi_res;
5368 basic_block bb = gimple_bb (use_stmt);
5369 gimple *use;
5371 /* Check that USE_STMT is really double reduction phi
5372 node. */
5373 if (gimple_code (use_stmt) != GIMPLE_PHI
5374 || gimple_phi_num_args (use_stmt) != 2
5375 || bb->loop_father != outer_loop)
5376 continue;
5377 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5378 if (!use_stmt_vinfo
5379 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5380 != vect_double_reduction_def)
5381 continue;
5383 /* Create vector phi node for double reduction:
5384 vs1 = phi <vs0, vs2>
5385 vs1 was created previously in this function by a call to
5386 vect_get_vec_def_for_operand and is stored in
5387 vec_initial_def;
5388 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5389 vs0 is created here. */
5391 /* Create vector phi node. */
5392 vect_phi = create_phi_node (vec_initial_def, bb);
5393 new_phi_vinfo = new_stmt_vec_info (vect_phi,
5394 loop_vec_info_for_loop (outer_loop));
5395 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5397 /* Create vs0 - initial def of the double reduction phi. */
5398 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5399 loop_preheader_edge (outer_loop));
5400 vect_phi_init = get_initial_def_for_reduction
5401 (stmt, preheader_arg, NULL);
5403 /* Update phi node arguments with vs0 and vs2. */
5404 add_phi_arg (vect_phi, vect_phi_init,
5405 loop_preheader_edge (outer_loop),
5406 UNKNOWN_LOCATION);
5407 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5408 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5409 if (dump_enabled_p ())
5411 dump_printf_loc (MSG_NOTE, vect_location,
5412 "created double reduction phi node: ");
5413 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5416 vect_phi_res = PHI_RESULT (vect_phi);
5418 /* Replace the use, i.e., set the correct vs1 in the regular
5419 reduction phi node. FORNOW, NCOPIES is always 1, so the
5420 loop is redundant. */
5421 use = reduction_phi;
5422 for (j = 0; j < ncopies; j++)
5424 edge pr_edge = loop_preheader_edge (loop);
5425 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5426 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5432 phis.release ();
5433 if (nested_in_vect_loop)
5435 if (double_reduc)
5436 loop = outer_loop;
5437 else
5438 continue;
5441 phis.create (3);
5442 /* Find the loop-closed-use at the loop exit of the original scalar
5443 result. (The reduction result is expected to have two immediate uses,
5444 one at the latch block, and one at the loop exit). For double
5445 reductions we are looking for exit phis of the outer loop. */
5446 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5448 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5450 if (!is_gimple_debug (USE_STMT (use_p)))
5451 phis.safe_push (USE_STMT (use_p));
5453 else
5455 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5457 tree phi_res = PHI_RESULT (USE_STMT (use_p));
5459 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5461 if (!flow_bb_inside_loop_p (loop,
5462 gimple_bb (USE_STMT (phi_use_p)))
5463 && !is_gimple_debug (USE_STMT (phi_use_p)))
5464 phis.safe_push (USE_STMT (phi_use_p));
5470 FOR_EACH_VEC_ELT (phis, i, exit_phi)
5472 /* Replace the uses: */
5473 orig_name = PHI_RESULT (exit_phi);
5474 scalar_result = scalar_results[k];
5475 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5476 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5477 SET_USE (use_p, scalar_result);
5480 phis.release ();
5485 /* Function is_nonwrapping_integer_induction.
5487 Check if STMT (which is part of loop LOOP) is an induction that both
5488 increments and does not cause overflow. */
5490 static bool
5491 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5493 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5494 tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5495 tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5496 tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5497 widest_int ni, max_loop_value, lhs_max;
5498 bool overflow = false;
5500 /* Make sure the loop is integer based. */
5501 if (TREE_CODE (base) != INTEGER_CST
5502 || TREE_CODE (step) != INTEGER_CST)
5503 return false;
5505 /* Check that the induction increments. */
5506 if (tree_int_cst_sgn (step) == -1)
5507 return false;
5509 /* Check that the max size of the loop will not wrap. */
5511 if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5512 return true;
5514 if (! max_stmt_executions (loop, &ni))
5515 return false;
5517 max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5518 &overflow);
5519 if (overflow)
5520 return false;
5522 max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5523 TYPE_SIGN (lhs_type), &overflow);
5524 if (overflow)
5525 return false;
5527 return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5528 <= TYPE_PRECISION (lhs_type));
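/* A worked example of the check above (values chosen only for illustration):
   for an 8-bit unsigned induction with base 16 and step 10, in a loop whose
   statements execute at most 20 times, max_loop_value is
   16 + 10 * 20 = 216, which needs 8 bits, so the induction is known not to
   wrap.  With at most 25 executions the value 266 would need 9 bits and the
   function returns false.  */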
5531 /* Function vectorizable_reduction.
5533 Check if STMT performs a reduction operation that can be vectorized.
5534 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5535 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5536 Return FALSE if not a vectorizable STMT, TRUE otherwise.
5538 This function also handles reduction idioms (patterns) that have been
5539 recognized in advance during vect_pattern_recog. In this case, STMT may be
5540 of this form:
5541 X = pattern_expr (arg0, arg1, ..., X)
5542 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5543 sequence that had been detected and replaced by the pattern-stmt (STMT).
5545 This function also handles reduction of condition expressions, for example:
5546 for (int i = 0; i < N; i++)
5547 if (a[i] < value)
5548 last = a[i];
5549 This is handled by vectorizing the loop and creating an additional vector
5550 containing the loop indexes for which "a[i] < value" was true. In the
5551 function epilogue this is reduced to a single max value and then used to
5552 index into the vector of results.
5554 In some cases of reduction patterns, the type of the reduction variable X is
5555 different than the type of the other arguments of STMT.
5556 In such cases, the vectype that is used when transforming STMT into a vector
5557 stmt is different than the vectype that is used to determine the
5558 vectorization factor, because it consists of a different number of elements
5559 than the actual number of elements that are being operated upon in parallel.
5561 For example, consider an accumulation of shorts into an int accumulator.
5562 On some targets it's possible to vectorize this pattern operating on 8
5563 shorts at a time (hence, the vectype for purposes of determining the
5564 vectorization factor should be V8HI); on the other hand, the vectype that
5565 is used to create the vector form is actually V4SI (the type of the result).
5567 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5568 indicates what is the actual level of parallelism (V8HI in the example), so
5569 that the right vectorization factor would be derived. This vectype
5570 corresponds to the type of arguments to the reduction stmt, and should *NOT*
5571 be used to create the vectorized stmt. The right vectype for the vectorized
5572 stmt is obtained from the type of the result X:
5573 get_vectype_for_scalar_type (TREE_TYPE (X))
5575 This means that, contrary to "regular" reductions (or "regular" stmts in
5576 general), the following equation:
5577 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5578 does *NOT* necessarily hold for reduction patterns. */
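/* A concrete instance of the distinction above (the vector modes V8HI and
   V4SI are just an assumption for a 128-bit vector target):

     short s[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += s[i];

   is recognized as a widening summation, int_acc = widen_sum <s[i], int_acc>.
   Here STMT_VINFO_VECTYPE is V8HI (eight shorts are consumed per vector
   iteration, which determines the vectorization factor), while the vectorized
   stmt itself is created with get_vectype_for_scalar_type (int) == V4SI.  */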
5580 bool
5581 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5582 gimple **vec_stmt, slp_tree slp_node,
5583 slp_instance slp_node_instance)
5585 tree vec_dest;
5586 tree scalar_dest;
5587 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5588 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5589 tree vectype_in = NULL_TREE;
5590 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5591 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5592 enum tree_code code, orig_code, epilog_reduc_code;
5593 machine_mode vec_mode;
5594 int op_type;
5595 optab optab, reduc_optab;
5596 tree new_temp = NULL_TREE;
5597 gimple *def_stmt;
5598 enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5599 tree scalar_type;
5600 bool is_simple_use;
5601 gimple *orig_stmt;
5602 stmt_vec_info orig_stmt_info = NULL;
5603 int i;
5604 int ncopies;
5605 int epilog_copies;
5606 stmt_vec_info prev_stmt_info, prev_phi_info;
5607 bool single_defuse_cycle = false;
5608 gimple *new_stmt = NULL;
5609 int j;
5610 tree ops[3];
5611 enum vect_def_type dts[3];
5612 bool nested_cycle = false, found_nested_cycle_def = false;
5613 bool double_reduc = false;
5614 basic_block def_bb;
5615 struct loop * def_stmt_loop, *outer_loop = NULL;
5616 tree def_arg;
5617 gimple *def_arg_stmt;
5618 auto_vec<tree> vec_oprnds0;
5619 auto_vec<tree> vec_oprnds1;
5620 auto_vec<tree> vec_oprnds2;
5621 auto_vec<tree> vect_defs;
5622 auto_vec<gimple *> phis;
5623 int vec_num;
5624 tree def0, tem;
5625 bool first_p = true;
5626 tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5627 tree cond_reduc_val = NULL_TREE;
5629 /* Make sure it was already recognized as a reduction computation. */
5630 if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5631 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5632 return false;
5634 if (nested_in_vect_loop_p (loop, stmt))
5636 outer_loop = loop;
5637 loop = loop->inner;
5638 nested_cycle = true;
5641 /* In case of reduction chain we switch to the first stmt in the chain, but
5642 we don't update STMT_INFO, since only the last stmt is marked as reduction
5643 and has reduction properties. */
5644 if (GROUP_FIRST_ELEMENT (stmt_info)
5645 && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5647 stmt = GROUP_FIRST_ELEMENT (stmt_info);
5648 first_p = false;
5651 if (gimple_code (stmt) == GIMPLE_PHI)
5653 /* Analysis is fully done on the reduction stmt invocation. */
5654 if (! vec_stmt)
5656 if (slp_node)
5657 slp_node_instance->reduc_phis = slp_node;
5659 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5660 return true;
5663 gimple *reduc_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5664 if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (reduc_stmt)))
5665 reduc_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (reduc_stmt));
5667 gcc_assert (is_gimple_assign (reduc_stmt));
5668 for (unsigned k = 1; k < gimple_num_ops (reduc_stmt); ++k)
5670 tree op = gimple_op (reduc_stmt, k);
5671 if (op == gimple_phi_result (stmt))
5672 continue;
5673 if (k == 1
5674 && gimple_assign_rhs_code (reduc_stmt) == COND_EXPR)
5675 continue;
5676 tem = get_vectype_for_scalar_type (TREE_TYPE (op));
5677 if (! vectype_in
5678 || TYPE_VECTOR_SUBPARTS (tem) < TYPE_VECTOR_SUBPARTS (vectype_in))
5679 vectype_in = tem;
5680 break;
5682 gcc_assert (vectype_in);
5684 if (slp_node)
5685 ncopies = 1;
5686 else
5687 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5689 use_operand_p use_p;
5690 gimple *use_stmt;
5691 if (ncopies > 1
5692 && (STMT_VINFO_RELEVANT (vinfo_for_stmt (reduc_stmt))
5693 <= vect_used_only_live)
5694 && single_imm_use (gimple_phi_result (stmt), &use_p, &use_stmt)
5695 && (use_stmt == reduc_stmt
5696 || (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt))
5697 == reduc_stmt)))
5698 single_defuse_cycle = true;
5700 /* Create the destination vector */
5701 scalar_dest = gimple_assign_lhs (reduc_stmt);
5702 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5704 if (slp_node)
5705 /* The size vect_schedule_slp_instance computes is off for us. */
5706 vec_num = ((LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5707 * SLP_TREE_SCALAR_STMTS (slp_node).length ())
5708 / TYPE_VECTOR_SUBPARTS (vectype_in));
5709 else
5710 vec_num = 1;
5712 /* Generate the reduction PHIs upfront. */
5713 prev_phi_info = NULL;
5714 for (j = 0; j < ncopies; j++)
5716 if (j == 0 || !single_defuse_cycle)
5718 for (i = 0; i < vec_num; i++)
5720 /* Create the reduction-phi that defines the reduction
5721 operand. */
5722 gimple *new_phi = create_phi_node (vec_dest, loop->header);
5723 set_vinfo_for_stmt (new_phi,
5724 new_stmt_vec_info (new_phi, loop_vinfo));
5726 if (slp_node)
5727 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_phi);
5728 else
5730 if (j == 0)
5731 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_phi;
5732 else
5733 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5734 prev_phi_info = vinfo_for_stmt (new_phi);
5740 return true;
5743 /* 1. Is vectorizable reduction? */
5744 /* Not supportable if the reduction variable is used in the loop, unless
5745 it's a reduction chain. */
5746 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5747 && !GROUP_FIRST_ELEMENT (stmt_info))
5748 return false;
5750 /* Reductions that are not used even in an enclosing outer-loop
5751 are expected to be "live" (used out of the loop). */
5752 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5753 && !STMT_VINFO_LIVE_P (stmt_info))
5754 return false;
5756 /* 2. Has this been recognized as a reduction pattern?
5758 Check if STMT represents a pattern that has been recognized
5759 in earlier analysis stages. For stmts that represent a pattern,
5760 the STMT_VINFO_RELATED_STMT field records the last stmt in
5761 the original sequence that constitutes the pattern. */
5763 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5764 if (orig_stmt)
5766 orig_stmt_info = vinfo_for_stmt (orig_stmt);
5767 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5768 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5771 /* 3. Check the operands of the operation. The first operands are defined
5772 inside the loop body. The last operand is the reduction variable,
5773 which is defined by the loop-header-phi. */
5775 gcc_assert (is_gimple_assign (stmt));
5777 /* Flatten RHS. */
5778 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5780 case GIMPLE_BINARY_RHS:
5781 code = gimple_assign_rhs_code (stmt);
5782 op_type = TREE_CODE_LENGTH (code);
5783 gcc_assert (op_type == binary_op);
5784 ops[0] = gimple_assign_rhs1 (stmt);
5785 ops[1] = gimple_assign_rhs2 (stmt);
5786 break;
5788 case GIMPLE_TERNARY_RHS:
5789 code = gimple_assign_rhs_code (stmt);
5790 op_type = TREE_CODE_LENGTH (code);
5791 gcc_assert (op_type == ternary_op);
5792 ops[0] = gimple_assign_rhs1 (stmt);
5793 ops[1] = gimple_assign_rhs2 (stmt);
5794 ops[2] = gimple_assign_rhs3 (stmt);
5795 break;
5797 case GIMPLE_UNARY_RHS:
5798 return false;
5800 default:
5801 gcc_unreachable ();
5804 if (code == COND_EXPR && slp_node)
5805 return false;
5807 scalar_dest = gimple_assign_lhs (stmt);
5808 scalar_type = TREE_TYPE (scalar_dest);
5809 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5810 && !SCALAR_FLOAT_TYPE_P (scalar_type))
5811 return false;
5813 /* Do not try to vectorize bit-precision reductions. */
5814 if (!type_has_mode_precision_p (scalar_type))
5815 return false;
5817 /* All uses but the last are expected to be defined in the loop.
5818 The last use is the reduction variable. In case of nested cycle this
5819 assumption is not true: we use reduc_index to record the index of the
5820 reduction variable. */
5821 gimple *reduc_def_stmt = NULL;
5822 int reduc_index = -1;
5823 for (i = 0; i < op_type; i++)
5825 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
5826 if (i == 0 && code == COND_EXPR)
5827 continue;
5829 is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5830 &def_stmt, &dts[i], &tem);
5831 dt = dts[i];
5832 gcc_assert (is_simple_use);
5833 if (dt == vect_reduction_def)
5835 reduc_def_stmt = def_stmt;
5836 reduc_index = i;
5837 continue;
5839 else if (tem)
5841 /* To properly compute ncopies we are interested in the widest
5842 input type in case we're looking at a widening accumulation. */
5843 if (!vectype_in
5844 || TYPE_VECTOR_SUBPARTS (vectype_in) > TYPE_VECTOR_SUBPARTS (tem))
5845 vectype_in = tem;
5848 if (dt != vect_internal_def
5849 && dt != vect_external_def
5850 && dt != vect_constant_def
5851 && dt != vect_induction_def
5852 && !(dt == vect_nested_cycle && nested_cycle))
5853 return false;
5855 if (dt == vect_nested_cycle)
5857 found_nested_cycle_def = true;
5858 reduc_def_stmt = def_stmt;
5859 reduc_index = i;
5862 if (i == 1 && code == COND_EXPR)
5864 /* Record how value of COND_EXPR is defined. */
5865 if (dt == vect_constant_def)
5867 cond_reduc_dt = dt;
5868 cond_reduc_val = ops[i];
5870 if (dt == vect_induction_def && def_stmt != NULL
5871 && is_nonwrapping_integer_induction (def_stmt, loop))
5872 cond_reduc_dt = dt;
5876 if (!vectype_in)
5877 vectype_in = vectype_out;
5879 /* When vectorizing a reduction chain w/o SLP the reduction PHI is not
5880 directly used in stmt. */
5881 if (reduc_index == -1)
5883 if (orig_stmt)
5884 reduc_def_stmt = STMT_VINFO_REDUC_DEF (orig_stmt_info);
5885 else
5886 reduc_def_stmt = STMT_VINFO_REDUC_DEF (stmt_info);
5889 if (! reduc_def_stmt || gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5890 return false;
5892 if (!(reduc_index == -1
5893 || dts[reduc_index] == vect_reduction_def
5894 || dts[reduc_index] == vect_nested_cycle
5895 || ((dts[reduc_index] == vect_internal_def
5896 || dts[reduc_index] == vect_external_def
5897 || dts[reduc_index] == vect_constant_def
5898 || dts[reduc_index] == vect_induction_def)
5899 && nested_cycle && found_nested_cycle_def)))
5901 /* For pattern recognized stmts, orig_stmt might be a reduction,
5902 but some helper statements for the pattern might not, or
5903 might be COND_EXPRs with reduction uses in the condition. */
5904 gcc_assert (orig_stmt);
5905 return false;
5908 stmt_vec_info reduc_def_info = vinfo_for_stmt (reduc_def_stmt);
5909 enum vect_reduction_type v_reduc_type
5910 = STMT_VINFO_REDUC_TYPE (reduc_def_info);
5911 gimple *tmp = STMT_VINFO_REDUC_DEF (reduc_def_info);
5913 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5914 /* If we have a condition reduction, see if we can simplify it further. */
5915 if (v_reduc_type == COND_REDUCTION)
5917 if (cond_reduc_dt == vect_induction_def)
5919 if (dump_enabled_p ())
5920 dump_printf_loc (MSG_NOTE, vect_location,
5921 "condition expression based on "
5922 "integer induction.\n");
5923 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5924 = INTEGER_INDUC_COND_REDUCTION;
5927 /* Loop peeling modifies the initial value of the reduction PHI, which
5928 makes the reduction stmt that is transformed differ from the original
5929 stmt that was analyzed. We therefore record the reduction code for a
5930 CONST_COND_REDUCTION type reduction at the analysis stage, so that it
5931 can be used directly at the transform stage. */
5932 if (STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MAX_EXPR
5933 || STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info) == MIN_EXPR)
5935 /* Also set the reduction type to CONST_COND_REDUCTION. */
5936 gcc_assert (cond_reduc_dt == vect_constant_def);
5937 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = CONST_COND_REDUCTION;
5939 else if (cond_reduc_dt == vect_constant_def)
5941 enum vect_def_type cond_initial_dt;
5942 gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5943 tree cond_initial_val
5944 = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
5946 gcc_assert (cond_reduc_val != NULL_TREE);
5947 vect_is_simple_use (cond_initial_val, loop_vinfo,
5948 &def_stmt, &cond_initial_dt);
5949 if (cond_initial_dt == vect_constant_def
5950 && types_compatible_p (TREE_TYPE (cond_initial_val),
5951 TREE_TYPE (cond_reduc_val)))
5953 tree e = fold_binary (LE_EXPR, boolean_type_node,
5954 cond_initial_val, cond_reduc_val);
5955 if (e && (integer_onep (e) || integer_zerop (e)))
5957 if (dump_enabled_p ())
5958 dump_printf_loc (MSG_NOTE, vect_location,
5959 "condition expression based on "
5960 "compile time constant.\n");
5961 /* Record reduction code at analysis stage. */
5962 STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info)
5963 = integer_onep (e) ? MAX_EXPR : MIN_EXPR;
5964 STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5965 = CONST_COND_REDUCTION;
5971 if (orig_stmt)
5972 gcc_assert (tmp == orig_stmt
5973 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5974 else
5975 /* We changed STMT to be the first stmt in reduction chain, hence we
5976 check that in this case the first element in the chain is STMT. */
5977 gcc_assert (stmt == tmp
5978 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5980 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5981 return false;
5983 if (slp_node)
5984 ncopies = 1;
5985 else
5986 ncopies = vect_get_num_copies (loop_vinfo, vectype_in);
5988 gcc_assert (ncopies >= 1);
5990 vec_mode = TYPE_MODE (vectype_in);
5992 if (code == COND_EXPR)
5994 /* Only call during the analysis stage, otherwise we'll lose
5995 STMT_VINFO_TYPE. */
5996 if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5997 ops[reduc_index], 0, NULL))
5999 if (dump_enabled_p ())
6000 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6001 "unsupported condition in reduction\n");
6002 return false;
6005 else
6007 /* 4. Supportable by target? */
6009 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
6010 || code == LROTATE_EXPR || code == RROTATE_EXPR)
6012 /* Shifts and rotates are only supported by vectorizable_shift,
6013 not vectorizable_reduction. */
6014 if (dump_enabled_p ())
6015 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6016 "unsupported shift or rotation.\n");
6017 return false;
6020 /* 4.1. check support for the operation in the loop */
6021 optab = optab_for_tree_code (code, vectype_in, optab_default);
6022 if (!optab)
6024 if (dump_enabled_p ())
6025 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6026 "no optab.\n");
6028 return false;
6031 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
6033 if (dump_enabled_p ())
6034 dump_printf (MSG_NOTE, "op not supported by target.\n");
6036 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
6037 || !vect_worthwhile_without_simd_p (loop_vinfo, code))
6038 return false;
6040 if (dump_enabled_p ())
6041 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
6044 /* Worthwhile without SIMD support? */
6045 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
6046 && !vect_worthwhile_without_simd_p (loop_vinfo, code))
6048 if (dump_enabled_p ())
6049 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6050 "not worthwhile without SIMD support.\n");
6052 return false;
6056 /* 4.2. Check support for the epilog operation.
6058 If STMT represents a reduction pattern, then the type of the
6059 reduction variable may be different than the type of the rest
6060 of the arguments. For example, consider the case of accumulation
6061 of shorts into an int accumulator; The original code:
6062 S1: int_a = (int) short_a;
6063 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
6065 was replaced with:
6066 STMT: int_acc = widen_sum <short_a, int_acc>
6068 This means that:
6069 1. The tree-code that is used to create the vector operation in the
6070 epilog code (that reduces the partial results) is not the
6071 tree-code of STMT, but is rather the tree-code of the original
6072 stmt from the pattern that STMT is replacing. I.e, in the example
6073 above we want to use 'widen_sum' in the loop, but 'plus' in the
6074 epilog.
6075 2. The type (mode) we use to check available target support
6076 for the vector operation to be created in the *epilog*, is
6077 determined by the type of the reduction variable (in the example
6078 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
6079 However the type (mode) we use to check available target support
6080 for the vector operation to be created *inside the loop*, is
6081 determined by the type of the other arguments to STMT (in the
6082 example we'd check this: optab_handler (widen_sum_optab,
6083 vect_short_mode)).
6085 This is contrary to "regular" reductions, in which the types of all
6086 the arguments are the same as the type of the reduction variable.
6087 For "regular" reductions we can therefore use the same vector type
6088 (and also the same tree-code) when generating the epilog code and
6089 when generating the code inside the loop. */
6091 if (orig_stmt)
6093 /* This is a reduction pattern: get the vectype from the type of the
6094 reduction variable, and get the tree-code from orig_stmt. */
6095 gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6096 == TREE_CODE_REDUCTION);
6097 orig_code = gimple_assign_rhs_code (orig_stmt);
6098 gcc_assert (vectype_out);
6099 vec_mode = TYPE_MODE (vectype_out);
6101 else
6103 /* Regular reduction: the same vectype and tree-code that are used for
6104 the vector code inside the loop can also be used for the epilog code. */
6105 orig_code = code;
6107 if (code == MINUS_EXPR)
6108 orig_code = PLUS_EXPR;
6110 /* For simple condition reductions, replace with the actual expression
6111 we want to base our reduction around. */
6112 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
6114 orig_code = STMT_VINFO_VEC_CONST_COND_REDUC_CODE (stmt_info);
6115 gcc_assert (orig_code == MAX_EXPR || orig_code == MIN_EXPR);
6117 else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
6118 == INTEGER_INDUC_COND_REDUCTION)
6119 orig_code = MAX_EXPR;
6122 if (nested_cycle)
6124 def_bb = gimple_bb (reduc_def_stmt);
6125 def_stmt_loop = def_bb->loop_father;
6126 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
6127 loop_preheader_edge (def_stmt_loop));
6128 if (TREE_CODE (def_arg) == SSA_NAME
6129 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
6130 && gimple_code (def_arg_stmt) == GIMPLE_PHI
6131 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
6132 && vinfo_for_stmt (def_arg_stmt)
6133 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
6134 == vect_double_reduction_def)
6135 double_reduc = true;
6138 epilog_reduc_code = ERROR_MARK;
6140 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
6142 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
6144 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
6145 optab_default);
6146 if (!reduc_optab)
6148 if (dump_enabled_p ())
6149 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6150 "no optab for reduction.\n");
6152 epilog_reduc_code = ERROR_MARK;
6154 else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
6156 if (dump_enabled_p ())
6157 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6158 "reduc op not supported by target.\n");
6160 epilog_reduc_code = ERROR_MARK;
6163 else
6165 if (!nested_cycle || double_reduc)
6167 if (dump_enabled_p ())
6168 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6169 "no reduc code for scalar code.\n");
6171 return false;
6175 else
6177 int scalar_precision
6178 = GET_MODE_PRECISION (SCALAR_TYPE_MODE (scalar_type));
6179 cr_index_scalar_type = make_unsigned_type (scalar_precision);
6180 cr_index_vector_type = build_vector_type
6181 (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
6183 optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
6184 optab_default);
6185 if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
6186 != CODE_FOR_nothing)
6187 epilog_reduc_code = REDUC_MAX_EXPR;
6190 if ((double_reduc
6191 || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
6192 && ncopies > 1)
6194 if (dump_enabled_p ())
6195 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6196 "multiple types in double reduction or condition "
6197 "reduction.\n");
6198 return false;
6201 /* In case of widening multiplication by a constant, we update the type
6202 of the constant to be the type of the other operand. We check that the
6203 constant fits the type in the pattern recognition pass. */
6204 if (code == DOT_PROD_EXPR
6205 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
6207 if (TREE_CODE (ops[0]) == INTEGER_CST)
6208 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
6209 else if (TREE_CODE (ops[1]) == INTEGER_CST)
6210 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
6211 else
6213 if (dump_enabled_p ())
6214 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6215 "invalid types in dot-prod\n");
6217 return false;
6221 if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6223 widest_int ni;
6225 if (! max_loop_iterations (loop, &ni))
6227 if (dump_enabled_p ())
6228 dump_printf_loc (MSG_NOTE, vect_location,
6229 "loop count not known, cannot create cond "
6230 "reduction.\n");
6231 return false;
6233 /* Convert backedges to iterations. */
6234 ni += 1;
6236 /* The additional index will have the same type as the condition. Check
6237 that the loop iteration count fits into this type less one (because
6238 the zero slot is used up for when there are no matches). */
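/* As an illustration (an 8-bit case chosen for simplicity): when the
   reduction values are "unsigned char", the index vector also uses an
   8-bit unsigned type whose maximum value is 255.  Since index 0 is
   reserved for "no match", a loop known to iterate at most 254 times
   passes the check below, while one that may iterate 255 times or more
   is rejected.  */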
6239 tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
6240 if (wi::geu_p (ni, wi::to_widest (max_index)))
6242 if (dump_enabled_p ())
6243 dump_printf_loc (MSG_NOTE, vect_location,
6244 "loop size is greater than data size.\n");
6245 return false;
6249 /* In case the vectorization factor (VF) is bigger than the number
6250 of elements that we can fit in a vectype (nunits), we have to generate
6251 more than one vector stmt - i.e - we need to "unroll" the
6252 vector stmt by a factor VF/nunits. For more details see documentation
6253 in vectorizable_operation. */
6255 /* If the reduction is used in an outer loop we need to generate
6256 VF intermediate results, like so (e.g. for ncopies=2):
6257 r0 = phi (init, r0)
6258 r1 = phi (init, r1)
6259 r0 = x0 + r0;
6260 r1 = x1 + r1;
6261 (i.e. we generate VF results in 2 registers).
6262 In this case we have a separate def-use cycle for each copy, and therefore
6263 for each copy we get the vector def for the reduction variable from the
6264 respective phi node created for this copy.
6266 Otherwise (the reduction is unused in the loop nest), we can combine
6267 together intermediate results, like so (e.g. for ncopies=2):
6268 r = phi (init, r)
6269 r = x0 + r;
6270 r = x1 + r;
6271 (i.e. we generate VF/2 results in a single register).
6272 In this case for each copy we get the vector def for the reduction variable
6273 from the vectorized reduction operation generated in the previous iteration.
6275 This only works when we see both the reduction PHI and its only consumer
6276 in vectorizable_reduction and there are no intermediate stmts
6277 participating. */
6278 use_operand_p use_p;
6279 gimple *use_stmt;
6280 if (ncopies > 1
6281 && (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6282 && single_imm_use (gimple_phi_result (reduc_def_stmt), &use_p, &use_stmt)
6283 && (use_stmt == stmt
6284 || STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use_stmt)) == stmt))
6286 single_defuse_cycle = true;
6287 epilog_copies = 1;
6289 else
6290 epilog_copies = ncopies;
6292 /* If the reduction stmt is one of the patterns that have lane
6293 reduction embedded we cannot handle the case of ! single_defuse_cycle. */
6294 if ((ncopies > 1
6295 && ! single_defuse_cycle)
6296 && (code == DOT_PROD_EXPR
6297 || code == WIDEN_SUM_EXPR
6298 || code == SAD_EXPR))
6300 if (dump_enabled_p ())
6301 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6302 "multi def-use cycle not possible for lane-reducing "
6303 "reduction operation\n");
6304 return false;
6307 if (!vec_stmt) /* transformation not required. */
6309 if (first_p)
6310 vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies);
6311 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
6312 return true;
6315 /* Transform. */
6317 if (dump_enabled_p ())
6318 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
6320 /* FORNOW: Multiple types are not supported for condition. */
6321 if (code == COND_EXPR)
6322 gcc_assert (ncopies == 1);
6324 /* Create the destination vector */
6325 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6327 prev_stmt_info = NULL;
6328 prev_phi_info = NULL;
6329 if (slp_node)
6330 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6331 else
6333 vec_num = 1;
6334 vec_oprnds0.create (1);
6335 vec_oprnds1.create (1);
6336 if (op_type == ternary_op)
6337 vec_oprnds2.create (1);
6340 phis.create (vec_num);
6341 vect_defs.create (vec_num);
6342 if (!slp_node)
6343 vect_defs.quick_push (NULL_TREE);
6345 if (slp_node)
6346 phis.splice (SLP_TREE_VEC_STMTS (slp_node_instance->reduc_phis));
6347 else
6348 phis.quick_push (STMT_VINFO_VEC_STMT (vinfo_for_stmt (reduc_def_stmt)));
6350 for (j = 0; j < ncopies; j++)
6352 if (code == COND_EXPR)
6354 gcc_assert (!slp_node);
6355 vectorizable_condition (stmt, gsi, vec_stmt,
6356 PHI_RESULT (phis[0]),
6357 reduc_index, NULL);
6358 /* Multiple types are not supported for condition. */
6359 break;
6362 /* Handle uses. */
6363 if (j == 0)
6365 if (slp_node)
6367 /* Get vec defs for all the operands except the reduction index,
6368 ensuring the ordering of the ops in the vector is kept. */
6369 auto_vec<tree, 3> slp_ops;
6370 auto_vec<vec<tree>, 3> vec_defs;
6372 slp_ops.quick_push (ops[0]);
6373 slp_ops.quick_push (ops[1]);
6374 if (op_type == ternary_op)
6375 slp_ops.quick_push (ops[2]);
6377 vect_get_slp_defs (slp_ops, slp_node, &vec_defs);
6379 vec_oprnds0.safe_splice (vec_defs[0]);
6380 vec_defs[0].release ();
6381 vec_oprnds1.safe_splice (vec_defs[1]);
6382 vec_defs[1].release ();
6383 if (op_type == ternary_op)
6385 vec_oprnds2.safe_splice (vec_defs[2]);
6386 vec_defs[2].release ();
6389 else
6391 vec_oprnds0.quick_push
6392 (vect_get_vec_def_for_operand (ops[0], stmt));
6393 vec_oprnds1.quick_push
6394 (vect_get_vec_def_for_operand (ops[1], stmt));
6395 if (op_type == ternary_op)
6396 vec_oprnds2.quick_push
6397 (vect_get_vec_def_for_operand (ops[2], stmt));
6400 else
6402 if (!slp_node)
6404 gcc_assert (reduc_index != -1 || ! single_defuse_cycle);
6406 if (single_defuse_cycle && reduc_index == 0)
6407 vec_oprnds0[0] = gimple_assign_lhs (new_stmt);
6408 else
6409 vec_oprnds0[0]
6410 = vect_get_vec_def_for_stmt_copy (dts[0], vec_oprnds0[0]);
6411 if (single_defuse_cycle && reduc_index == 1)
6412 vec_oprnds1[0] = gimple_assign_lhs (new_stmt);
6413 else
6414 vec_oprnds1[0]
6415 = vect_get_vec_def_for_stmt_copy (dts[1], vec_oprnds1[0]);
6416 if (op_type == ternary_op)
6418 if (single_defuse_cycle && reduc_index == 2)
6419 vec_oprnds2[0] = gimple_assign_lhs (new_stmt);
6420 else
6421 vec_oprnds2[0]
6422 = vect_get_vec_def_for_stmt_copy (dts[2], vec_oprnds2[0]);
6427 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6429 tree vop[3] = { def0, vec_oprnds1[i], NULL_TREE };
6430 if (op_type == ternary_op)
6431 vop[2] = vec_oprnds2[i];
6433 new_temp = make_ssa_name (vec_dest, new_stmt);
6434 new_stmt = gimple_build_assign (new_temp, code,
6435 vop[0], vop[1], vop[2]);
6436 vect_finish_stmt_generation (stmt, new_stmt, gsi);
6438 if (slp_node)
6440 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6441 vect_defs.quick_push (new_temp);
6443 else
6444 vect_defs[0] = new_temp;
6447 if (slp_node)
6448 continue;
6450 if (j == 0)
6451 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6452 else
6453 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6455 prev_stmt_info = vinfo_for_stmt (new_stmt);
6458 /* Finalize the reduction-phi (set its arguments) and create the
6459 epilog reduction code. */
6460 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6461 vect_defs[0] = gimple_assign_lhs (*vec_stmt);
6463 vect_create_epilog_for_reduction (vect_defs, stmt, reduc_def_stmt,
6464 epilog_copies,
6465 epilog_reduc_code, phis,
6466 double_reduc, slp_node, slp_node_instance);
6468 return true;
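/* Putting the above together, a plain sum reduction

     int sum = 0;
     for (i = 0; i < N; i++)
       sum += a[i];

   is transformed roughly as if rewritten (a sketch only, assuming a
   four-lane vector and ignoring prologue/epilogue loops and alignment
   handling):

     typedef int __attribute__((mode(V4SI))) v4si;
     v4si vsum = { 0, 0, 0, 0 };
     for (i = 0; i < N/4; i++)
       vsum += ((v4si *) a)[i];
     sum = vsum[0] + vsum[1] + vsum[2] + vsum[3];

   where the final cross-lane sum corresponds to the epilog code emitted by
   vect_create_epilog_for_reduction after the loop.  */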
6471 /* Function vect_min_worthwhile_factor.
6473 For a loop where we could vectorize the operation indicated by CODE,
6474 return the minimum vectorization factor that makes it worthwhile
6475 to use generic vectors. */
6477 vect_min_worthwhile_factor (enum tree_code code)
6479 switch (code)
6481 case PLUS_EXPR:
6482 case MINUS_EXPR:
6483 case NEGATE_EXPR:
6484 return 4;
6486 case BIT_AND_EXPR:
6487 case BIT_IOR_EXPR:
6488 case BIT_XOR_EXPR:
6489 case BIT_NOT_EXPR:
6490 return 2;
6492 default:
6493 return INT_MAX;
6497 /* Return true if VINFO indicates we are doing loop vectorization and if
6498 it is worth decomposing CODE operations into scalar operations for
6499 that loop's vectorization factor. */
6501 bool
6502 vect_worthwhile_without_simd_p (vec_info *vinfo, tree_code code)
6504 loop_vec_info loop_vinfo = dyn_cast <loop_vec_info> (vinfo);
6505 return (loop_vinfo
6506 && (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
6507 >= vect_min_worthwhile_factor (code)));
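/* For example, with a vectorization factor of 4, decomposing PLUS_EXPR,
   MINUS_EXPR or NEGATE_EXPR into scalar operations is still considered
   worthwhile (minimum factor 4), whereas with a factor of 2 only the
   bitwise operations above qualify.  */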
6510 /* Function vectorizable_induction
6512 Check if PHI performs an induction computation that can be vectorized.
6513 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6514 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6515 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
6517 bool
6518 vectorizable_induction (gimple *phi,
6519 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6520 gimple **vec_stmt, slp_tree slp_node)
6522 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6523 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6524 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6525 unsigned ncopies;
6526 bool nested_in_vect_loop = false;
6527 struct loop *iv_loop;
6528 tree vec_def;
6529 edge pe = loop_preheader_edge (loop);
6530 basic_block new_bb;
6531 tree new_vec, vec_init, vec_step, t;
6532 tree new_name;
6533 gimple *new_stmt;
6534 gphi *induction_phi;
6535 tree induc_def, vec_dest;
6536 tree init_expr, step_expr;
6537 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6538 unsigned i;
6539 tree expr;
6540 gimple_seq stmts;
6541 imm_use_iterator imm_iter;
6542 use_operand_p use_p;
6543 gimple *exit_phi;
6544 edge latch_e;
6545 tree loop_arg;
6546 gimple_stmt_iterator si;
6547 basic_block bb = gimple_bb (phi);
6549 if (gimple_code (phi) != GIMPLE_PHI)
6550 return false;
6552 if (!STMT_VINFO_RELEVANT_P (stmt_info))
6553 return false;
6555 /* Make sure it was recognized as induction computation. */
6556 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
6557 return false;
6559 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6560 unsigned nunits = TYPE_VECTOR_SUBPARTS (vectype);
6562 if (slp_node)
6563 ncopies = 1;
6564 else
6565 ncopies = vect_get_num_copies (loop_vinfo, vectype);
6566 gcc_assert (ncopies >= 1);
6568 /* FORNOW. These restrictions should be relaxed. */
6569 if (nested_in_vect_loop_p (loop, phi))
6571 imm_use_iterator imm_iter;
6572 use_operand_p use_p;
6573 gimple *exit_phi;
6574 edge latch_e;
6575 tree loop_arg;
6577 if (ncopies > 1)
6579 if (dump_enabled_p ())
6580 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6581 "multiple types in nested loop.\n");
6582 return false;
6585 /* FORNOW: outer loop induction with SLP not supported. */
6586 if (STMT_SLP_TYPE (stmt_info))
6587 return false;
6589 exit_phi = NULL;
6590 latch_e = loop_latch_edge (loop->inner);
6591 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6592 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6594 gimple *use_stmt = USE_STMT (use_p);
6595 if (is_gimple_debug (use_stmt))
6596 continue;
6598 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6600 exit_phi = use_stmt;
6601 break;
6604 if (exit_phi)
6606 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
6607 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6608 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6610 if (dump_enabled_p ())
6611 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6612 "inner-loop induction only used outside "
6613 "of the outer vectorized loop.\n");
6614 return false;
6618 nested_in_vect_loop = true;
6619 iv_loop = loop->inner;
6621 else
6622 iv_loop = loop;
6623 gcc_assert (iv_loop == (gimple_bb (phi))->loop_father);
6625 if (!vec_stmt) /* transformation not required. */
6627 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6628 if (dump_enabled_p ())
6629 dump_printf_loc (MSG_NOTE, vect_location,
6630 "=== vectorizable_induction ===\n");
6631 vect_model_induction_cost (stmt_info, ncopies);
6632 return true;
6635 /* Transform. */
6637 /* Compute a vector variable, initialized with the first VF values of
6638 the induction variable. E.g., for an iv with IV_PHI='X' and
6639 evolution S, for a vector of 4 units, we want to compute:
6640 [X, X + S, X + 2*S, X + 3*S]. */
6642 if (dump_enabled_p ())
6643 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6645 latch_e = loop_latch_edge (iv_loop);
6646 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6648 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_info);
6649 gcc_assert (step_expr != NULL_TREE);
6651 pe = loop_preheader_edge (iv_loop);
6652 init_expr = PHI_ARG_DEF_FROM_EDGE (phi,
6653 loop_preheader_edge (iv_loop));
6655 /* Convert the step to the desired type. */
6656 stmts = NULL;
6657 step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
6658 if (stmts)
6660 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6661 gcc_assert (!new_bb);
6664 /* Find the first insertion point in the BB. */
6665 si = gsi_after_labels (bb);
6667 /* For SLP induction we have to generate several IVs as for example
6668 with group size 3 we need [i, i, i, i + S] [i + S, i + S, i + 2*S, i + 2*S]
6669 [i + 2*S, i + 3*S, i + 3*S, i + 3*S]. The step is the same uniform
6670 [VF*S, VF*S, VF*S, VF*S] for all. */
6671 if (slp_node)
6673 /* Convert the init to the desired type. */
6674 stmts = NULL;
6675 init_expr = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6676 if (stmts)
6678 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6679 gcc_assert (!new_bb);
6682 /* Generate [VF*S, VF*S, ... ]. */
6683 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6685 expr = build_int_cst (integer_type_node, vf);
6686 expr = fold_convert (TREE_TYPE (step_expr), expr);
6688 else
6689 expr = build_int_cst (TREE_TYPE (step_expr), vf);
6690 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6691 expr, step_expr);
6692 if (! CONSTANT_CLASS_P (new_name))
6693 new_name = vect_init_vector (phi, new_name,
6694 TREE_TYPE (step_expr), NULL);
6695 new_vec = build_vector_from_val (vectype, new_name);
6696 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6698 /* Now generate the IVs. */
6699 unsigned group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6700 unsigned nvects = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6701 unsigned elts = nunits * nvects;
6702 unsigned nivs = least_common_multiple (group_size, nunits) / nunits;
6703 gcc_assert (elts % group_size == 0);
6704 tree elt = init_expr;
6705 unsigned ivn;
6706 for (ivn = 0; ivn < nivs; ++ivn)
6708 auto_vec<tree, 32> elts (nunits);
6709 stmts = NULL;
6710 for (unsigned eltn = 0; eltn < nunits; ++eltn)
6712 if (ivn*nunits + eltn >= group_size
6713 && (ivn*nunits + eltn) % group_size == 0)
6714 elt = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (elt),
6715 elt, step_expr);
6716 elts.quick_push (elt);
6718 vec_init = gimple_build_vector (&stmts, vectype, elts);
6719 if (stmts)
6721 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6722 gcc_assert (!new_bb);
6725 /* Create the induction-phi that defines the induction-operand. */
6726 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6727 induction_phi = create_phi_node (vec_dest, iv_loop->header);
6728 set_vinfo_for_stmt (induction_phi,
6729 new_stmt_vec_info (induction_phi, loop_vinfo));
6730 induc_def = PHI_RESULT (induction_phi);
6732 /* Create the iv update inside the loop */
6733 vec_def = make_ssa_name (vec_dest);
6734 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6735 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6736 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6738 /* Set the arguments of the phi node: */
6739 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6740 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6741 UNKNOWN_LOCATION);
6743 SLP_TREE_VEC_STMTS (slp_node).quick_push (induction_phi);
6746 /* Re-use IVs when we can. */
6747 if (ivn < nvects)
6749 unsigned vfp
6750 = least_common_multiple (group_size, nunits) / group_size;
6751 /* Generate [VF'*S, VF'*S, ... ]. */
6752 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6754 expr = build_int_cst (integer_type_node, vfp);
6755 expr = fold_convert (TREE_TYPE (step_expr), expr);
6757 else
6758 expr = build_int_cst (TREE_TYPE (step_expr), vfp);
6759 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
6760 expr, step_expr);
6761 if (! CONSTANT_CLASS_P (new_name))
6762 new_name = vect_init_vector (phi, new_name,
6763 TREE_TYPE (step_expr), NULL);
6764 new_vec = build_vector_from_val (vectype, new_name);
6765 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6766 for (; ivn < nvects; ++ivn)
6768 gimple *iv = SLP_TREE_VEC_STMTS (slp_node)[ivn - nivs];
6769 tree def;
6770 if (gimple_code (iv) == GIMPLE_PHI)
6771 def = gimple_phi_result (iv);
6772 else
6773 def = gimple_assign_lhs (iv);
6774 new_stmt = gimple_build_assign (make_ssa_name (vectype),
6775 PLUS_EXPR,
6776 def, vec_step);
6777 if (gimple_code (iv) == GIMPLE_PHI)
6778 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6779 else
6781 gimple_stmt_iterator tgsi = gsi_for_stmt (iv);
6782 gsi_insert_after (&tgsi, new_stmt, GSI_CONTINUE_LINKING);
6784 set_vinfo_for_stmt (new_stmt,
6785 new_stmt_vec_info (new_stmt, loop_vinfo));
6786 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6790 return true;
6793 /* Create the vector that holds the initial_value of the induction. */
6794 if (nested_in_vect_loop)
6796 /* iv_loop is nested in the loop to be vectorized. init_expr had already
6797 been created during vectorization of previous stmts. We obtain it
6798 from the STMT_VINFO_VEC_STMT of the defining stmt. */
6799 vec_init = vect_get_vec_def_for_operand (init_expr, phi);
6800 /* If the initial value is not of proper type, convert it. */
6801 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
6803 new_stmt
6804 = gimple_build_assign (vect_get_new_ssa_name (vectype,
6805 vect_simple_var,
6806 "vec_iv_"),
6807 VIEW_CONVERT_EXPR,
6808 build1 (VIEW_CONVERT_EXPR, vectype,
6809 vec_init));
6810 vec_init = gimple_assign_lhs (new_stmt);
6811 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
6812 new_stmt);
6813 gcc_assert (!new_bb);
6814 set_vinfo_for_stmt (new_stmt,
6815 new_stmt_vec_info (new_stmt, loop_vinfo));
6818 else
6820 /* iv_loop is the loop to be vectorized. Create:
6821 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
6822 stmts = NULL;
6823 new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
6825 auto_vec<tree, 32> elts (nunits);
6826 elts.quick_push (new_name);
6827 for (i = 1; i < nunits; i++)
6829 /* Create: new_name_i = new_name + step_expr */
6830 new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
6831 new_name, step_expr);
6832 elts.quick_push (new_name);
6834 /* Create a vector from [new_name_0, new_name_1, ...,
6835 new_name_nunits-1] */
6836 vec_init = gimple_build_vector (&stmts, vectype, elts);
6837 if (stmts)
6839 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
6840 gcc_assert (!new_bb);
6845 /* Create the vector that holds the step of the induction. */
6846 if (nested_in_vect_loop)
6847 /* iv_loop is nested in the loop to be vectorized. Generate:
6848 vec_step = [S, S, S, S] */
6849 new_name = step_expr;
6850 else
6852 /* iv_loop is the loop to be vectorized. Generate:
6853 vec_step = [VF*S, VF*S, VF*S, VF*S] */
6854 gimple_seq seq = NULL;
6855 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6857 expr = build_int_cst (integer_type_node, vf);
6858 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6860 else
6861 expr = build_int_cst (TREE_TYPE (step_expr), vf);
6862 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6863 expr, step_expr);
6864 if (seq)
6866 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6867 gcc_assert (!new_bb);
6871 t = unshare_expr (new_name);
6872 gcc_assert (CONSTANT_CLASS_P (new_name)
6873 || TREE_CODE (new_name) == SSA_NAME);
6874 new_vec = build_vector_from_val (vectype, t);
6875 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6878 /* Create the following def-use cycle:
6879 loop prolog:
6880 vec_init = ...
6881 vec_step = ...
6882 loop:
6883 vec_iv = PHI <vec_init, vec_loop>
6885 STMT
6887 vec_loop = vec_iv + vec_step; */
6889 /* Create the induction-phi that defines the induction-operand. */
6890 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
6891 induction_phi = create_phi_node (vec_dest, iv_loop->header);
6892 set_vinfo_for_stmt (induction_phi,
6893 new_stmt_vec_info (induction_phi, loop_vinfo));
6894 induc_def = PHI_RESULT (induction_phi);
6896 /* Create the iv update inside the loop */
6897 vec_def = make_ssa_name (vec_dest);
6898 new_stmt = gimple_build_assign (vec_def, PLUS_EXPR, induc_def, vec_step);
6899 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6900 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
6902 /* Set the arguments of the phi node: */
6903 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
6904 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
6905 UNKNOWN_LOCATION);
6907 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = induction_phi;
6909 /* In case the vectorization factor (VF) is bigger than the number
6910 of elements that we can fit in a vectype (nunits), we have to generate
6911 more than one vector stmt - i.e - we need to "unroll" the
6912 vector stmt by a factor VF/nunits. For more details see documentation
6913 in vectorizable_operation. */
6915 if (ncopies > 1)
6917 gimple_seq seq = NULL;
6918 stmt_vec_info prev_stmt_vinfo;
6919 /* FORNOW. This restriction should be relaxed. */
6920 gcc_assert (!nested_in_vect_loop);
6922 /* Create the vector that holds the step of the induction. */
6923 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
6925 expr = build_int_cst (integer_type_node, nunits);
6926 expr = gimple_build (&seq, FLOAT_EXPR, TREE_TYPE (step_expr), expr);
6928 else
6929 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
6930 new_name = gimple_build (&seq, MULT_EXPR, TREE_TYPE (step_expr),
6931 expr, step_expr);
6932 if (seq)
6934 new_bb = gsi_insert_seq_on_edge_immediate (pe, seq);
6935 gcc_assert (!new_bb);
6938 t = unshare_expr (new_name);
6939 gcc_assert (CONSTANT_CLASS_P (new_name)
6940 || TREE_CODE (new_name) == SSA_NAME);
6941 new_vec = build_vector_from_val (vectype, t);
6942 vec_step = vect_init_vector (phi, new_vec, vectype, NULL);
6944 vec_def = induc_def;
6945 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
6946 for (i = 1; i < ncopies; i++)
6948 /* vec_i = vec_prev + vec_step */
6949 new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
6950 vec_def, vec_step);
6951 vec_def = make_ssa_name (vec_dest, new_stmt);
6952 gimple_assign_set_lhs (new_stmt, vec_def);
6954 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
6955 set_vinfo_for_stmt (new_stmt,
6956 new_stmt_vec_info (new_stmt, loop_vinfo));
6957 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
6958 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
6962 if (nested_in_vect_loop)
6964 /* Find the loop-closed exit-phi of the induction, and record
6965 the final vector of induction results: */
6966 exit_phi = NULL;
6967 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6969 gimple *use_stmt = USE_STMT (use_p);
6970 if (is_gimple_debug (use_stmt))
6971 continue;
6973 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
6975 exit_phi = use_stmt;
6976 break;
6979 if (exit_phi)
6981 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
6982 /* FORNOW. Currently not supporting the case that an inner-loop induction
6983 is not used in the outer-loop (i.e. only outside the outer-loop). */
6984 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
6985 && !STMT_VINFO_LIVE_P (stmt_vinfo));
6987 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
6988 if (dump_enabled_p ())
6990 dump_printf_loc (MSG_NOTE, vect_location,
6991 "vector of inductions after inner-loop:");
6992 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
6998 if (dump_enabled_p ())
7000 dump_printf_loc (MSG_NOTE, vect_location,
7001 "transform induction: created def-use cycle: ");
7002 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
7003 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7004 SSA_NAME_DEF_STMT (vec_def), 0);
7007 return true;
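/* Illustration with concrete numbers (assuming VF == 4 and a simple
   counter IV with initial value 0 and step 1):

     loop prolog:
       vec_init = { 0, 1, 2, 3 }
       vec_step = { 4, 4, 4, 4 }
     loop:
       vec_iv = PHI <vec_init, vec_loop>
       ...
       vec_loop = vec_iv + vec_step;

   so in vector iteration k the IV vector holds { 4k, 4k+1, 4k+2, 4k+3 }.  */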
7010 /* Function vectorizable_live_operation.
7012 STMT computes a value that is used outside the loop. Check if
7013 it can be supported. */
7015 bool
7016 vectorizable_live_operation (gimple *stmt,
7017 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
7018 slp_tree slp_node, int slp_index,
7019 gimple **vec_stmt)
7021 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
7022 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
7023 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7024 imm_use_iterator imm_iter;
7025 tree lhs, lhs_type, bitsize, vec_bitsize;
7026 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
7027 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
7028 int ncopies;
7029 gimple *use_stmt;
7030 auto_vec<tree> vec_oprnds;
7032 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
7034 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
7035 return false;
7037 /* FORNOW. CHECKME. */
7038 if (nested_in_vect_loop_p (loop, stmt))
7039 return false;
7041 /* If STMT is not relevant and it is a simple assignment and its inputs are
7042 invariant then it can remain in place, unvectorized. The original last
7043 scalar value that it computes will be used. */
7044 if (!STMT_VINFO_RELEVANT_P (stmt_info))
7046 gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
7047 if (dump_enabled_p ())
7048 dump_printf_loc (MSG_NOTE, vect_location,
7049 "statement is simple and uses invariant. Leaving in "
7050 "place.\n");
7051 return true;
7054 if (slp_node)
7055 ncopies = 1;
7056 else
7057 ncopies = vect_get_num_copies (loop_vinfo, vectype);
7059 if (!vec_stmt)
7060 /* No transformation required. */
7061 return true;
7063 /* If stmt has a related stmt, then use that for getting the lhs. */
7064 if (is_pattern_stmt_p (stmt_info))
7065 stmt = STMT_VINFO_RELATED_STMT (stmt_info);
7067 lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
7068 : gimple_get_lhs (stmt);
7069 lhs_type = TREE_TYPE (lhs);
7071 bitsize = (VECTOR_BOOLEAN_TYPE_P (vectype)
7072 ? bitsize_int (TYPE_PRECISION (TREE_TYPE (vectype)))
7073 : TYPE_SIZE (TREE_TYPE (vectype)));
7074 vec_bitsize = TYPE_SIZE (vectype);
7076 /* Get the vectorized lhs of STMT and the lane to use (counted in bits). */
7077 tree vec_lhs, bitstart;
7078 if (slp_node)
7080 gcc_assert (slp_index >= 0);
7082 int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
7083 int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
7085 /* Get the last occurrence of the scalar index from the concatenation of
7086 all the slp vectors. Calculate which slp vector it is and the index
7087 within. */
7088 int pos = (num_vec * nunits) - num_scalar + slp_index;
7089 int vec_entry = pos / nunits;
7090 int vec_index = pos % nunits;
7092 /* Get the correct slp vectorized stmt. */
7093 vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
7095 /* Get entry to use. */
7096 bitstart = bitsize_int (vec_index);
7097 bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
7099 else
7101 enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
7102 vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
7104 /* For multiple copies, get the last copy. */
7105 for (int i = 1; i < ncopies; ++i)
7106 vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
7107 vec_lhs);
7109 /* Get the last lane in the vector. */
7110 bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
7113 /* Create a new vectorized stmt for the uses of STMT and insert outside the
7114 loop. */
7115 gimple_seq stmts = NULL;
7116 tree bftype = TREE_TYPE (vectype);
7117 if (VECTOR_BOOLEAN_TYPE_P (vectype))
7118 bftype = build_nonstandard_integer_type (tree_to_uhwi (bitsize), 1);
7119 tree new_tree = build3 (BIT_FIELD_REF, bftype, vec_lhs, bitsize, bitstart);
7120 new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
7121 true, NULL_TREE);
7122 if (stmts)
7123 gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
7125 /* Replace use of lhs with newly computed result. If the use stmt is a
7126 single arg PHI, just replace all uses of the PHI result. This is needed
7127 because the lcssa PHI defining lhs may come before the newly inserted stmt. */
7128 use_operand_p use_p;
7129 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
7130 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
7131 && !is_gimple_debug (use_stmt))
7133 if (gimple_code (use_stmt) == GIMPLE_PHI
7134 && gimple_phi_num_args (use_stmt) == 1)
7136 replace_uses_by (gimple_phi_result (use_stmt), new_tree);
7138 else
7140 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
7141 SET_USE (use_p, new_tree);
7143 update_stmt (use_stmt);
7146 return true;
7149 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
7151 static void
7152 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
7154 ssa_op_iter op_iter;
7155 imm_use_iterator imm_iter;
7156 def_operand_p def_p;
7157 gimple *ustmt;
7159 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
7161 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
7163 basic_block bb;
7165 if (!is_gimple_debug (ustmt))
7166 continue;
7168 bb = gimple_bb (ustmt);
7170 if (!flow_bb_inside_loop_p (loop, bb))
7172 if (gimple_debug_bind_p (ustmt))
7174 if (dump_enabled_p ())
7175 dump_printf_loc (MSG_NOTE, vect_location,
7176 "killing debug use\n");
7178 gimple_debug_bind_reset_value (ustmt);
7179 update_stmt (ustmt);
7181 else
7182 gcc_unreachable ();
7188 /* Given loop represented by LOOP_VINFO, return true if computation of
7189 LOOP_VINFO_NITERS (= LOOP_VINFO_NITERSM1 + 1) doesn't overflow, false
7190 otherwise. */
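/* For example (illustrative): if NITERS has a 32-bit unsigned type and the
   latch can execute UINT_MAX times, NITERSM1 == 0xffffffff and
   NITERS == NITERSM1 + 1 wraps around to 0; the checks below detect this
   either from the constant values or from the recorded upper bound on the
   number of loop iterations.  */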
7192 static bool
7193 loop_niters_no_overflow (loop_vec_info loop_vinfo)
7195 /* Constant case. */
7196 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7198 tree cst_niters = LOOP_VINFO_NITERS (loop_vinfo);
7199 tree cst_nitersm1 = LOOP_VINFO_NITERSM1 (loop_vinfo);
7201 gcc_assert (TREE_CODE (cst_niters) == INTEGER_CST);
7202 gcc_assert (TREE_CODE (cst_nitersm1) == INTEGER_CST);
7203 if (wi::to_widest (cst_nitersm1) < wi::to_widest (cst_niters))
7204 return true;
7207 widest_int max;
7208 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7209 /* Check the upper bound of loop niters. */
7210 if (get_max_loop_iterations (loop, &max))
7212 tree type = TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo));
7213 signop sgn = TYPE_SIGN (type);
7214 widest_int type_max = widest_int::from (wi::max_value (type), sgn);
7215 if (max < type_max)
7216 return true;
7218 return false;
7221 /* Scale the profiling counters of LOOP, which has been vectorized by
7222 factor VF, according to the new iteration estimate. */
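/* For example (illustrative numbers): if the scalar loop was estimated to
   iterate about 40 times and VF == 4, new_est_niter is roughly 10; the
   body counts are scaled so that the header executes new_est_niter + 1
   times per entry, the exit edge gets probability 1 / (new_est_niter + 1)
   and the latch edge the complementary probability.  */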
7224 static void
7225 scale_profile_for_vect_loop (struct loop *loop, unsigned vf)
7227 edge preheader = loop_preheader_edge (loop);
7228 /* Reduce loop iterations by the vectorization factor. */
7229 gcov_type new_est_niter = niter_for_unrolled_loop (loop, vf);
7230 profile_count freq_h = loop->header->count, freq_e = preheader->count ();
7232 if (freq_h.nonzero_p ())
7234 profile_probability p;
7236 /* Avoid dropping loop body profile counter to 0 because of zero count
7237 in loop's preheader. */
7238 if (!(freq_e == profile_count::zero ()))
7239 freq_e = freq_e.force_nonzero ();
7240 p = freq_e.apply_scale (new_est_niter + 1, 1).probability_in (freq_h);
7241 scale_loop_frequencies (loop, p);
7244 edge exit_e = single_exit (loop);
7245 exit_e->probability = profile_probability::always ()
7246 .apply_scale (1, new_est_niter + 1);
7248 edge exit_l = single_pred_edge (loop->latch);
7249 profile_probability prob = exit_l->probability;
7250 exit_l->probability = exit_e->probability.invert ();
7251 if (prob.initialized_p () && exit_l->probability.initialized_p ())
7252 scale_bbs_frequencies (&loop->latch, 1, exit_l->probability / prob);
7255 /* Function vect_transform_loop.
7257 The analysis phase has determined that the loop is vectorizable.
7258 Vectorize the loop - create vectorized stmts to replace the scalar
7259 stmts in the loop, and update the loop exit condition.
7260 Returns the scalar epilogue loop, if any. */
7262 struct loop *
7263 vect_transform_loop (loop_vec_info loop_vinfo)
7265 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
7266 struct loop *epilogue = NULL;
7267 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
7268 int nbbs = loop->num_nodes;
7269 int i;
7270 tree niters_vector = NULL;
7271 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
7272 bool grouped_store;
7273 bool slp_scheduled = false;
7274 gimple *stmt, *pattern_stmt;
7275 gimple_seq pattern_def_seq = NULL;
7276 gimple_stmt_iterator pattern_def_si = gsi_none ();
7277 bool transform_pattern_stmt = false;
7278 bool check_profitability = false;
7279 int th;
7281 if (dump_enabled_p ())
7282 dump_printf_loc (MSG_NOTE, vect_location, "=== vect_transform_loop ===\n");
7284 /* Use the more conservative vectorization threshold.  If the number
7285 of iterations is constant, assume the cost check has been performed
7286 by our caller.  If the threshold makes all loops that run at least
7287 the vectorization factor number of times profitable, the runtime
7288 check is pointless, too. */
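/* For instance (illustrative numbers): with a vectorization factor of 4
   and a threshold of 3, every loop that enters the vector code runs at
   least 4 iterations, so a runtime profitability check could never fail
   and is skipped; with a threshold of 100 and an unknown iteration count
   the check is kept and, if the loop is versioned below, is folded into
   the versioning condition.  */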
7289 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
7290 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo)
7291 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7293 if (dump_enabled_p ())
7294 dump_printf_loc (MSG_NOTE, vect_location,
7295 "Profitability threshold is %d loop iterations.\n",
7296 th);
7297 check_profitability = true;
7300 /* Make sure there exists a single-predecessor exit bb. Do this before
7301 versioning. */
7302 edge e = single_exit (loop);
7303 if (! single_pred_p (e->dest))
7305 split_loop_exit_edge (e);
7306 if (dump_enabled_p ())
7307 dump_printf (MSG_NOTE, "split exit edge\n");
7310 /* Version the loop first, if required, so the profitability check
7311 comes first. */
7313 if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
7315 vect_loop_versioning (loop_vinfo, th, check_profitability);
7316 check_profitability = false;
7319 /* Make sure there exists a single-predecessor exit bb also on the
7320 scalar loop copy.  Do this after versioning but before peeling
7321 so the CFG structure is fine for both the scalar and the if-converted
7322 loop, and so slpeel_duplicate_current_defs_from_edges sees matched
7323 loop-closed PHI nodes on the exit. */
7324 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7326 e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
7327 if (! single_pred_p (e->dest))
7329 split_loop_exit_edge (e);
7330 if (dump_enabled_p ())
7331 dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
7335 tree niters = vect_build_loop_niters (loop_vinfo);
7336 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = niters;
7337 tree nitersm1 = unshare_expr (LOOP_VINFO_NITERSM1 (loop_vinfo));
7338 bool niters_no_overflow = loop_niters_no_overflow (loop_vinfo);
7339 epilogue = vect_do_peeling (loop_vinfo, niters, nitersm1, &niters_vector, th,
7340 check_profitability, niters_no_overflow);
7341 if (niters_vector == NULL_TREE)
7343 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
7344 niters_vector
7345 = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
7346 LOOP_VINFO_INT_NITERS (loop_vinfo) / vf);
7347 else
7348 vect_gen_vector_loop_niters (loop_vinfo, niters, &niters_vector,
7349 niters_no_overflow);
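/* For example (illustrative): with 23 known scalar iterations and vf == 4,
   niters_vector is 23 / 4 == 5, so the vector loop runs 5 times and the
   remaining 3 iterations are handled by the epilogue loop created by
   vect_do_peeling above.  */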
7352 /* 1) Make sure the loop header has exactly two entries
7353 2) Make sure we have a preheader basic block. */
7355 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
7357 split_edge (loop_preheader_edge (loop));
7359 /* FORNOW: the vectorizer supports only loops whose body consists
7360 of one basic block (header + empty latch).  When the vectorizer
7361 supports more involved loop forms, the order in which the BBs are
7362 traversed needs to be reconsidered. */
7364 for (i = 0; i < nbbs; i++)
7366 basic_block bb = bbs[i];
7367 stmt_vec_info stmt_info;
7369 for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
7370 gsi_next (&si))
7372 gphi *phi = si.phi ();
7373 if (dump_enabled_p ())
7375 dump_printf_loc (MSG_NOTE, vect_location,
7376 "------>vectorizing phi: ");
7377 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
7379 stmt_info = vinfo_for_stmt (phi);
7380 if (!stmt_info)
7381 continue;
7383 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7384 vect_loop_kill_debug_uses (loop, phi);
7386 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7387 && !STMT_VINFO_LIVE_P (stmt_info))
7388 continue;
7390 if (STMT_VINFO_VECTYPE (stmt_info)
7391 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
7392 != (unsigned HOST_WIDE_INT) vf)
7393 && dump_enabled_p ())
7394 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7396 if ((STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def
7397 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def
7398 || STMT_VINFO_DEF_TYPE (stmt_info) == vect_nested_cycle)
7399 && ! PURE_SLP_STMT (stmt_info))
7401 if (dump_enabled_p ())
7402 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
7403 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
7407 pattern_stmt = NULL;
7408 for (gimple_stmt_iterator si = gsi_start_bb (bb);
7409 !gsi_end_p (si) || transform_pattern_stmt;)
7411 bool is_store;
7413 if (transform_pattern_stmt)
7414 stmt = pattern_stmt;
7415 else
7417 stmt = gsi_stmt (si);
7418 /* During vectorization remove existing clobber stmts. */
7419 if (gimple_clobber_p (stmt))
7421 unlink_stmt_vdef (stmt);
7422 gsi_remove (&si, true);
7423 release_defs (stmt);
7424 continue;
7428 if (dump_enabled_p ())
7430 dump_printf_loc (MSG_NOTE, vect_location,
7431 "------>vectorizing statement: ");
7432 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
7435 stmt_info = vinfo_for_stmt (stmt);
7437 /* vector stmts created in the outer-loop during vectorization of
7438 stmts in an inner-loop may not have a stmt_info, and do not
7439 need to be vectorized. */
7440 if (!stmt_info)
7442 gsi_next (&si);
7443 continue;
7446 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
7447 vect_loop_kill_debug_uses (loop, stmt);
7449 if (!STMT_VINFO_RELEVANT_P (stmt_info)
7450 && !STMT_VINFO_LIVE_P (stmt_info))
7452 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7453 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7454 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7455 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7457 stmt = pattern_stmt;
7458 stmt_info = vinfo_for_stmt (stmt);
7460 else
7462 gsi_next (&si);
7463 continue;
7466 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
7467 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
7468 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
7469 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
7470 transform_pattern_stmt = true;
7472 /* If pattern statement has def stmts, vectorize them too. */
7473 if (is_pattern_stmt_p (stmt_info))
7475 if (pattern_def_seq == NULL)
7477 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
7478 pattern_def_si = gsi_start (pattern_def_seq);
7480 else if (!gsi_end_p (pattern_def_si))
7481 gsi_next (&pattern_def_si);
7482 if (pattern_def_seq != NULL)
7484 gimple *pattern_def_stmt = NULL;
7485 stmt_vec_info pattern_def_stmt_info = NULL;
7487 while (!gsi_end_p (pattern_def_si))
7489 pattern_def_stmt = gsi_stmt (pattern_def_si);
7490 pattern_def_stmt_info
7491 = vinfo_for_stmt (pattern_def_stmt);
7492 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
7493 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
7494 break;
7495 gsi_next (&pattern_def_si);
7498 if (!gsi_end_p (pattern_def_si))
7500 if (dump_enabled_p ())
7502 dump_printf_loc (MSG_NOTE, vect_location,
7503 "==> vectorizing pattern def "
7504 "stmt: ");
7505 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
7506 pattern_def_stmt, 0);
7509 stmt = pattern_def_stmt;
7510 stmt_info = pattern_def_stmt_info;
7512 else
7514 pattern_def_si = gsi_none ();
7515 transform_pattern_stmt = false;
7518 else
7519 transform_pattern_stmt = false;
7522 if (STMT_VINFO_VECTYPE (stmt_info))
7524 unsigned int nunits
7525 = (unsigned int)
7526 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7527 if (!STMT_SLP_TYPE (stmt_info)
7528 && nunits != (unsigned int) vf
7529 && dump_enabled_p ())
7530 /* For SLP, VF is set according to the unrolling factor and not
7531 to the vector size, hence this diagnostic is not valid for SLP. */
7532 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7535 /* SLP. Schedule all the SLP instances when the first SLP stmt is
7536 reached. */
7537 if (STMT_SLP_TYPE (stmt_info))
7539 if (!slp_scheduled)
7541 slp_scheduled = true;
7543 if (dump_enabled_p ())
7544 dump_printf_loc (MSG_NOTE, vect_location,
7545 "=== scheduling SLP instances ===\n");
7547 vect_schedule_slp (loop_vinfo);
7550 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
7551 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7553 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7555 pattern_def_seq = NULL;
7556 gsi_next (&si);
7558 continue;
7562 /* -------- vectorize statement ------------ */
7563 if (dump_enabled_p ())
7564 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7566 grouped_store = false;
7567 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7568 if (is_store)
7570 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7572 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7573 interleaving chain was completed - free all the stores in
7574 the chain. */
7575 gsi_next (&si);
7576 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7578 else
7580 /* Free the attached stmt_vec_info and remove the stmt. */
7581 gimple *store = gsi_stmt (si);
7582 free_stmt_vec_info (store);
7583 unlink_stmt_vdef (store);
7584 gsi_remove (&si, true);
7585 release_defs (store);
7588 /* Stores can only appear at the end of pattern statements. */
7589 gcc_assert (!transform_pattern_stmt);
7590 pattern_def_seq = NULL;
7592 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7594 pattern_def_seq = NULL;
7595 gsi_next (&si);
7597 } /* stmts in BB */
7598 } /* BBs in loop */
7600 slpeel_make_loop_iterate_ntimes (loop, niters_vector);
7602 scale_profile_for_vect_loop (loop, vf);
7604 /* The minimum number of iterations performed by the epilogue. This
7605 is 1 when peeling for gaps because we always need a final scalar
7606 iteration. */
7607 int min_epilogue_iters = LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) ? 1 : 0;
7608 /* +1 to convert latch counts to loop iteration counts,
7609 -min_epilogue_iters to remove iterations that cannot be performed
7610 by the vector code. */
7611 int bias = 1 - min_epilogue_iters;
7612 /* In these calculations the "- 1" converts loop iteration counts
7613 back to latch counts. */
7614 if (loop->any_upper_bound)
7615 loop->nb_iterations_upper_bound
7616 = wi::udiv_floor (loop->nb_iterations_upper_bound + bias, vf) - 1;
7617 if (loop->any_likely_upper_bound)
7618 loop->nb_iterations_likely_upper_bound
7619 = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + bias, vf) - 1;
7620 if (loop->any_estimate)
7621 loop->nb_iterations_estimate
7622 = wi::udiv_floor (loop->nb_iterations_estimate + bias, vf) - 1;
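/* Worked example with made-up numbers: with vf == 4, no peeling for gaps
   (so bias == 1) and an upper bound of 102 latch iterations (103 loop
   iterations), the vector loop executes at most floor (103 / 4) == 25
   iterations, so its latch bound becomes 25 - 1 == 24.  */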
7624 if (dump_enabled_p ())
7626 if (!LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7628 dump_printf_loc (MSG_NOTE, vect_location,
7629 "LOOP VECTORIZED\n");
7630 if (loop->inner)
7631 dump_printf_loc (MSG_NOTE, vect_location,
7632 "OUTER LOOP VECTORIZED\n");
7633 dump_printf (MSG_NOTE, "\n");
7635 else
7636 dump_printf_loc (MSG_NOTE, vect_location,
7637 "LOOP EPILOGUE VECTORIZED (VS=%d)\n",
7638 current_vector_size);
7641 /* Free SLP instances here because otherwise stmt reference counting
7642 won't work. */
7643 slp_instance instance;
7644 FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7645 vect_free_slp_instance (instance);
7646 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7647 /* Clear the safelen field since its value is invalid after vectorization:
7648 the vectorized loop can have loop-carried dependencies. */
7649 loop->safelen = 0;
7651 /* Don't vectorize the epilogue of an epilogue loop. */
7652 if (LOOP_VINFO_EPILOGUE_P (loop_vinfo))
7653 epilogue = NULL;
7655 if (epilogue)
7657 unsigned int vector_sizes
7658 = targetm.vectorize.autovectorize_vector_sizes ();
7659 vector_sizes &= current_vector_size - 1;
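/* The supported vector sizes are powers of two, so current_vector_size - 1
   acts as a mask keeping only the strictly smaller sizes; those are the
   candidate sizes for vectorizing the epilogue.  */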
7661 if (!PARAM_VALUE (PARAM_VECT_EPILOGUES_NOMASK))
7662 epilogue = NULL;
7663 else if (!vector_sizes)
7664 epilogue = NULL;
7665 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
7666 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) >= 0)
7668 int smallest_vec_size = 1 << ctz_hwi (vector_sizes);
7669 int ratio = current_vector_size / smallest_vec_size;
7670 int eiters = LOOP_VINFO_INT_NITERS (loop_vinfo)
7671 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
7672 eiters = eiters % vf;
7674 epilogue->nb_iterations_upper_bound = eiters - 1;
7676 if (eiters < vf / ratio)
7677 epilogue = NULL;
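/* Illustrative example with made-up numbers: with current_vector_size == 32,
   16-byte vectors also supported, vf == 8 and eiters == 5 leftover scalar
   iterations, the epilogue bound is set to 4; since ratio == 2 and
   5 >= vf / ratio == 4, the epilogue is kept and can later be vectorized
   with the smaller vector size.  */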
7681 if (epilogue)
7683 epilogue->force_vectorize = loop->force_vectorize;
7684 epilogue->safelen = loop->safelen;
7685 epilogue->dont_vectorize = false;
7687 /* We may need to if-convert the epilogue to be able to vectorize it. */
7688 if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
7689 tree_if_conversion (epilogue);
7692 return epilogue;
7695 /* The code below tries to perform a simple optimization - reverting
7696 if-conversion for masked stores: if the mask of a store is zero, do not
7697 perform the store and, if possible, skip the stored-value producers too.
7698 For example,
7699 for (i=0; i<n; i++)
7700 if (c[i])
7702 p1[i] += 1;
7703 p2[i] = p3[i] + 2;
7705 this transformation will produce the following semi-hammock:
7707 if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7709 vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7710 vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7711 MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7712 vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7713 vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7714 MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7718 void
7719 optimize_mask_stores (struct loop *loop)
7721 basic_block *bbs = get_loop_body (loop);
7722 unsigned nbbs = loop->num_nodes;
7723 unsigned i;
7724 basic_block bb;
7725 struct loop *bb_loop;
7726 gimple_stmt_iterator gsi;
7727 gimple *stmt;
7728 auto_vec<gimple *> worklist;
7730 vect_location = find_loop_location (loop);
7731 /* Pick up all masked stores in loop if any. */
7732 for (i = 0; i < nbbs; i++)
7734 bb = bbs[i];
7735 for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7736 gsi_next (&gsi))
7738 stmt = gsi_stmt (gsi);
7739 if (gimple_call_internal_p (stmt, IFN_MASK_STORE))
7740 worklist.safe_push (stmt);
7744 free (bbs);
7745 if (worklist.is_empty ())
7746 return;
7748 /* Loop has masked stores. */
7749 while (!worklist.is_empty ())
7751 gimple *last, *last_store;
7752 edge e, efalse;
7753 tree mask;
7754 basic_block store_bb, join_bb;
7755 gimple_stmt_iterator gsi_to;
7756 tree vdef, new_vdef;
7757 gphi *phi;
7758 tree vectype;
7759 tree zero;
7761 last = worklist.pop ();
7762 mask = gimple_call_arg (last, 2);
7763 bb = gimple_bb (last);
7764 /* Create then_bb and the if-then structure in the CFG; then_bb belongs
7765 to the same loop as if_bb.  It can differ from LOOP when a two-level
7766 loop nest is vectorized and the mask_store belongs to the inner
7767 loop. */
7768 e = split_block (bb, last);
7769 bb_loop = bb->loop_father;
7770 gcc_assert (loop == bb_loop || flow_loop_nested_p (loop, bb_loop));
7771 join_bb = e->dest;
7772 store_bb = create_empty_bb (bb);
7773 add_bb_to_loop (store_bb, bb_loop);
7774 e->flags = EDGE_TRUE_VALUE;
7775 efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7776 /* Mark the edge into STORE_BB as unlikely. */
7777 efalse->probability = profile_probability::unlikely ();
7778 store_bb->count = efalse->count ();
7779 make_single_succ_edge (store_bb, join_bb, EDGE_FALLTHRU);
7780 if (dom_info_available_p (CDI_DOMINATORS))
7781 set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7782 if (dump_enabled_p ())
7783 dump_printf_loc (MSG_NOTE, vect_location,
7784 "Create new block %d to sink mask stores.",
7785 store_bb->index);
7786 /* Create vector comparison with boolean result. */
7787 vectype = TREE_TYPE (mask);
7788 zero = build_zero_cst (vectype);
7789 stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7790 gsi = gsi_last_bb (bb);
7791 gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7792 /* Create a new PHI node for the vdef of the last masked store:
7793 .MEM_2 = VDEF <.MEM_1>
7794 will be converted to
7795 .MEM_3 = VDEF <.MEM_1>
7796 and a new PHI node will be created in the join bb
7797 .MEM_2 = PHI <.MEM_1, .MEM_3>
7799 vdef = gimple_vdef (last);
7800 new_vdef = make_ssa_name (gimple_vop (cfun), last);
7801 gimple_set_vdef (last, new_vdef);
7802 phi = create_phi_node (vdef, join_bb);
7803 add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7805 /* Put all masked stores with the same mask to STORE_BB if possible. */
7806 while (true)
7808 gimple_stmt_iterator gsi_from;
7809 gimple *stmt1 = NULL;
7811 /* Move masked store to STORE_BB. */
7812 last_store = last;
7813 gsi = gsi_for_stmt (last);
7814 gsi_from = gsi;
7815 /* Shift GSI to the previous stmt for further traversal. */
7816 gsi_prev (&gsi);
7817 gsi_to = gsi_start_bb (store_bb);
7818 gsi_move_before (&gsi_from, &gsi_to);
7819 /* Reset GSI_TO to the start of the now non-empty STORE_BB. */
7820 gsi_to = gsi_start_bb (store_bb);
7821 if (dump_enabled_p ())
7823 dump_printf_loc (MSG_NOTE, vect_location,
7824 "Move stmt to created bb\n");
7825 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7827 /* Move all stored value producers if possible. */
7828 while (!gsi_end_p (gsi))
7830 tree lhs;
7831 imm_use_iterator imm_iter;
7832 use_operand_p use_p;
7833 bool res;
7835 /* Skip debug statements. */
7836 if (is_gimple_debug (gsi_stmt (gsi)))
7838 gsi_prev (&gsi);
7839 continue;
7841 stmt1 = gsi_stmt (gsi);
7842 /* Do not consider statements writing to memory or having a
7843 volatile operand. */
7844 if (gimple_vdef (stmt1)
7845 || gimple_has_volatile_ops (stmt1))
7846 break;
7847 gsi_from = gsi;
7848 gsi_prev (&gsi);
7849 lhs = gimple_get_lhs (stmt1);
7850 if (!lhs)
7851 break;
7853 /* LHS of vectorized stmt must be SSA_NAME. */
7854 if (TREE_CODE (lhs) != SSA_NAME)
7855 break;
7857 if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7859 /* Remove dead scalar statement. */
7860 if (has_zero_uses (lhs))
7862 gsi_remove (&gsi_from, true);
7863 continue;
7867 /* Check that LHS does not have uses outside of STORE_BB. */
7868 res = true;
7869 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7871 gimple *use_stmt;
7872 use_stmt = USE_STMT (use_p);
7873 if (is_gimple_debug (use_stmt))
7874 continue;
7875 if (gimple_bb (use_stmt) != store_bb)
7877 res = false;
7878 break;
7881 if (!res)
7882 break;
7884 if (gimple_vuse (stmt1)
7885 && gimple_vuse (stmt1) != gimple_vuse (last_store))
7886 break;
7888 /* Can move STMT1 to STORE_BB. */
7889 if (dump_enabled_p ())
7891 dump_printf_loc (MSG_NOTE, vect_location,
7892 "Move stmt to created bb\n");
7893 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7895 gsi_move_before (&gsi_from, &gsi_to);
7896 /* Shift GSI_TO for further insertion. */
7897 gsi_prev (&gsi_to);
7899 /* Put other masked stores with the same mask to STORE_BB. */
7900 if (worklist.is_empty ()
7901 || gimple_call_arg (worklist.last (), 2) != mask
7902 || worklist.last () != stmt1)
7903 break;
7904 last = worklist.pop ();
7906 add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);