gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2016 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52
  53 /* Loop Vectorization Pass.
  54
  55    This pass tries to vectorize loops.
  56
  57    For example, the vectorizer transforms the following simple loop:
  58
  59         short a[N]; short b[N]; short c[N]; int i;
  60
  61         for (i=0; i<N; i++){
  62           a[i] = b[i] + c[i];
  63         }
  64
  65    as if it was manually vectorized by rewriting the source code into:
  66
  67         typedef int __attribute__((mode(V8HI))) v8hi;
  68         short a[N];  short b[N]; short c[N];   int i;
  69         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  70         v8hi va, vb, vc;
  71
  72         for (i=0; i<N/8; i++){
  73           vb = pb[i];
  74           vc = pc[i];
  75           va = vb + vc;
  76           pa[i] = va;
  77         }
  78
  79         The main entry to this pass is vectorize_loops(), in which
  80    the vectorizer applies a set of analyses on a given set of loops,
  81    followed by the actual vectorization transformation for the loops that
  82    had successfully passed the analysis phase.
  83         Throughout this pass we make a distinction between two types of
  84    data: scalars (which are represented by SSA_NAMES), and memory references
  85    ("data-refs").  These two types of data require different handling both
  86    during analysis and transformation. The types of data-refs that the
  87    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  88    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  89    accesses are required to have a simple (consecutive) access pattern.
  90
  91    Analysis phase:
  92    ===============
  93         The driver for the analysis phase is vect_analyze_loop().
  94    It applies a set of analyses, some of which rely on the scalar evolution
  95    analyzer (scev) developed by Sebastian Pop.
  96
  97         During the analysis phase the vectorizer records some information
  98    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  99    loop, as well as general information about the loop as a whole, which is
 100    recorded in a "loop_vec_info" struct attached to each loop.
 101
 102    Transformation phase:
 103    =====================
 104         The loop transformation phase scans all the stmts in the loop, and
 105    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 106    the loop that needs to be vectorized.  It inserts the vector code sequence
 107    just before the scalar stmt S, and records a pointer to the vector code
 108    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 109    attached to S).  This pointer will be used for the vectorization of following
 110    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 111    otherwise, we rely on dead code elimination for removing it.
 112
 113         For example, say stmt S1 was vectorized into stmt VS1:
 114
 115    VS1: vb = px[i];
 116    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 117    S2:  a = b;
 118
 119    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 120    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 121    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 122    resulting sequence would be:
 123
 124    VS1: vb = px[i];
 125    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 126    VS2: va = vb;
 127    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 128
 129         Operands that are not SSA_NAMEs, are data-refs that appear in
 130    load/store operations (like 'x[i]' in S1), and are handled differently.
 131
 132    Target modeling:
 133    =================
 134         Currently the only target specific information that is used is the
 135    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 136    Targets that can support different sizes of vectors, for now will need
 137    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 138    flexibility will be added in the future.
 139
 140         Since we only vectorize operations which vector form can be
 141    expressed using existing tree codes, to verify that an operation is
 142    supported, the vectorizer checks the relevant optab at the relevant
 143    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 144    the value found is CODE_FOR_nothing, then there's no target support, and
 145    we can't vectorize the stmt.
 146
 147    For additional information on this project see:
 148    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 149 */
 150
 151 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 152
 153 /* Function vect_determine_vectorization_factor
 154
 155    Determine the vectorization factor (VF).  VF is the number of data elements
 156    that are operated upon in parallel in a single iteration of the vectorized
 157    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 158    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 159    elements can fit in a single vector register.
 160
 161    We currently support vectorization of loops in which all types operated upon
 162    are of the same size.  Therefore this function currently sets VF according to
 163    the size of the types operated upon, and fails if there are multiple sizes
 164    in the loop.
 165
 166    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 167    original loop:
 168         for (i=0; i<N; i++){
 169           a[i] = b[i] + c[i];
 170         }
 171
 172    vectorized loop:
 173         for (i=0; i<N; i+=VF){
 174           a[i:VF] = b[i:VF] + c[i:VF];
 175         }
 176 */
 177
 178 static bool
 179 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 180 {
 181   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 182   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 183   unsigned nbbs = loop->num_nodes;
 184   unsigned int vectorization_factor = 0;
 185   tree scalar_type;
 186   gphi *phi;
 187   tree vectype;
 188   unsigned int nunits;
 189   stmt_vec_info stmt_info;
 190   unsigned i;
 191   HOST_WIDE_INT dummy;
 192   gimple *stmt, *pattern_stmt = NULL;
 193   gimple_seq pattern_def_seq = NULL;
 194   gimple_stmt_iterator pattern_def_si = gsi_none ();
 195   bool analyze_pattern_stmt = false;
 196   bool bool_result;
 197   auto_vec<stmt_vec_info> mask_producers;
 198
 199   if (dump_enabled_p ())
 200     dump_printf_loc (MSG_NOTE, vect_location,
 201                      "=== vect_determine_vectorization_factor ===\n");
 202
 203   for (i = 0; i < nbbs; i++)
 204     {
 205       basic_block bb = bbs[i];
 206
 207       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 208            gsi_next (&si))
 209         {
 210           phi = si.phi ();
 211           stmt_info = vinfo_for_stmt (phi);
 212           if (dump_enabled_p ())
 213             {
 214               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 215               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 216             }
 217
 218           gcc_assert (stmt_info);
 219
 220           if (STMT_VINFO_RELEVANT_P (stmt_info)
 221               || STMT_VINFO_LIVE_P (stmt_info))
 222             {
 223               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 224               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 225
 226               if (dump_enabled_p ())
 227                 {
 228                   dump_printf_loc (MSG_NOTE, vect_location,
 229                                    "get vectype for scalar type:  ");
 230                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 231                   dump_printf (MSG_NOTE, "\n");
 232                 }
 233
 234               vectype = get_vectype_for_scalar_type (scalar_type);
 235               if (!vectype)
 236                 {
 237                   if (dump_enabled_p ())
 238                     {
 239                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 240                                        "not vectorized: unsupported "
 241                                        "data-type ");
 242                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 243                                          scalar_type);
 244                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 245                     }
 246                   return false;
 247                 }
 248               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 249
 250               if (dump_enabled_p ())
 251                 {
 252                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 253                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 254                   dump_printf (MSG_NOTE, "\n");
 255                 }
 256
 257               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 258               if (dump_enabled_p ())
 259                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 260                                  nunits);
 261
 262               if (!vectorization_factor
 263                   || (nunits > vectorization_factor))
 264                 vectorization_factor = nunits;
 265             }
 266         }
 267
 268       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 269            !gsi_end_p (si) || analyze_pattern_stmt;)
 270         {
 271           tree vf_vectype;
 272
 273           if (analyze_pattern_stmt)
 274             stmt = pattern_stmt;
 275           else
 276             stmt = gsi_stmt (si);
 277
 278           stmt_info = vinfo_for_stmt (stmt);
 279
 280           if (dump_enabled_p ())
 281             {
 282               dump_printf_loc (MSG_NOTE, vect_location,
 283                                "==> examining statement: ");
 284               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 285             }
 286
 287           gcc_assert (stmt_info);
 288
 289           /* Skip stmts which do not need to be vectorized.  */
 290           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 291                && !STMT_VINFO_LIVE_P (stmt_info))
 292               || gimple_clobber_p (stmt))
 293             {
 294               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 295                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 296                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 297                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 298                 {
 299                   stmt = pattern_stmt;
 300                   stmt_info = vinfo_for_stmt (pattern_stmt);
 301                   if (dump_enabled_p ())
 302                     {
 303                       dump_printf_loc (MSG_NOTE, vect_location,
 304                                        "==> examining pattern statement: ");
 305                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 306                     }
 307                 }
 308               else
 309                 {
 310                   if (dump_enabled_p ())
 311                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 312                   gsi_next (&si);
 313                   continue;
 314                 }
 315             }
 316           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 317                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 318                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 319                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 320             analyze_pattern_stmt = true;
 321
 322           /* If a pattern statement has def stmts, analyze them too.  */
 323           if (is_pattern_stmt_p (stmt_info))
 324             {
 325               if (pattern_def_seq == NULL)
 326                 {
 327                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 328                   pattern_def_si = gsi_start (pattern_def_seq);
 329                 }
 330               else if (!gsi_end_p (pattern_def_si))
 331                 gsi_next (&pattern_def_si);
 332               if (pattern_def_seq != NULL)
 333                 {
 334                   gimple *pattern_def_stmt = NULL;
 335                   stmt_vec_info pattern_def_stmt_info = NULL;
 336
 337                   while (!gsi_end_p (pattern_def_si))
 338                     {
 339                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 340                       pattern_def_stmt_info
 341                         = vinfo_for_stmt (pattern_def_stmt);
 342                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 343                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 344                         break;
 345                       gsi_next (&pattern_def_si);
 346                     }
 347
 348                   if (!gsi_end_p (pattern_def_si))
 349                     {
 350                       if (dump_enabled_p ())
 351                         {
 352                           dump_printf_loc (MSG_NOTE, vect_location,
 353                                            "==> examining pattern def stmt: ");
 354                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 355                                             pattern_def_stmt, 0);
 356                         }
 357
 358                       stmt = pattern_def_stmt;
 359                       stmt_info = pattern_def_stmt_info;
 360                     }
 361                   else
 362                     {
 363                       pattern_def_si = gsi_none ();
 364                       analyze_pattern_stmt = false;
 365                     }
 366                 }
 367               else
 368                 analyze_pattern_stmt = false;
 369             }
 370
 371           if (gimple_get_lhs (stmt) == NULL_TREE
 372               /* MASK_STORE has no lhs, but is ok.  */
 373               && (!is_gimple_call (stmt)
 374                   || !gimple_call_internal_p (stmt)
 375                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 376             {
 377               if (is_gimple_call (stmt))
 378                 {
 379                   /* Ignore calls with no lhs.  These must be calls to
 380                      #pragma omp simd functions, and what vectorization factor
 381                      it really needs can't be determined until
 382                      vectorizable_simd_clone_call.  */
 383                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 384                     {
 385                       pattern_def_seq = NULL;
 386                       gsi_next (&si);
 387                     }
 388                   continue;
 389                 }
 390               if (dump_enabled_p ())
 391                 {
 392                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 393                                    "not vectorized: irregular stmt.");
 394                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 395                                     0);
 396                 }
 397               return false;
 398             }
 399
 400           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 401             {
 402               if (dump_enabled_p ())
 403                 {
 404                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 405                                    "not vectorized: vector stmt in loop:");
 406                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 407                 }
 408               return false;
 409             }
 410
 411           bool_result = false;
 412
 413           if (STMT_VINFO_VECTYPE (stmt_info))
 414             {
 415               /* The only case when a vectype had been already set is for stmts
 416                  that contain a dataref, or for "pattern-stmts" (stmts
 417                  generated by the vectorizer to represent/replace a certain
 418                  idiom).  */
 419               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 420                           || is_pattern_stmt_p (stmt_info)
 421                           || !gsi_end_p (pattern_def_si));
 422               vectype = STMT_VINFO_VECTYPE (stmt_info);
 423             }
 424           else
 425             {
 426               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 427               if (is_gimple_call (stmt)
 428                   && gimple_call_internal_p (stmt)
 429                   && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
 430                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 431               else
 432                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 433
 434               /* Bool ops don't participate in vectorization factor
 435                  computation.  For comparison use compared types to
 436                  compute a factor.  */
 437               if (TREE_CODE (scalar_type) == BOOLEAN_TYPE
 438                   && is_gimple_assign (stmt)
 439                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 440                 {
 441                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 442                       || STMT_VINFO_LIVE_P (stmt_info))
 443                     mask_producers.safe_push (stmt_info);
 444                   bool_result = true;
 445
 446                   if (gimple_code (stmt) == GIMPLE_ASSIGN
 447                       && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 448                          == tcc_comparison
 449                       && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
 450                          != BOOLEAN_TYPE)
 451                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 452                   else
 453                     {
 454                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 455                         {
 456                           pattern_def_seq = NULL;
 457                           gsi_next (&si);
 458                         }
 459                       continue;
 460                     }
 461                 }
 462
 463               if (dump_enabled_p ())
 464                 {
 465                   dump_printf_loc (MSG_NOTE, vect_location,
 466                                    "get vectype for scalar type:  ");
 467                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 468                   dump_printf (MSG_NOTE, "\n");
 469                 }
 470               vectype = get_vectype_for_scalar_type (scalar_type);
 471               if (!vectype)
 472                 {
 473                   if (dump_enabled_p ())
 474                     {
 475                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 476                                        "not vectorized: unsupported "
 477                                        "data-type ");
 478                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 479                                          scalar_type);
 480                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 481                     }
 482                   return false;
 483                 }
 484
 485               if (!bool_result)
 486                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 487
 488               if (dump_enabled_p ())
 489                 {
 490                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 491                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 492                   dump_printf (MSG_NOTE, "\n");
 493                 }
 494             }
 495
 496           /* Don't try to compute VF out scalar types if we stmt
 497              produces boolean vector.  Use result vectype instead.  */
 498           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 499             vf_vectype = vectype;
 500           else
 501             {
 502               /* The vectorization factor is according to the smallest
 503                  scalar type (or the largest vector size, but we only
 504                  support one vector size per loop).  */
 505               if (!bool_result)
 506                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 507                                                              &dummy);
 508               if (dump_enabled_p ())
 509                 {
 510                   dump_printf_loc (MSG_NOTE, vect_location,
 511                                    "get vectype for scalar type:  ");
 512                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 513                   dump_printf (MSG_NOTE, "\n");
 514                 }
 515               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 516             }
 517           if (!vf_vectype)
 518             {
 519               if (dump_enabled_p ())
 520                 {
 521                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 522                                    "not vectorized: unsupported data-type ");
 523                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 524                                      scalar_type);
 525                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 526                 }
 527               return false;
 528             }
 529
 530           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 531                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 532             {
 533               if (dump_enabled_p ())
 534                 {
 535                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 536                                    "not vectorized: different sized vector "
 537                                    "types in statement, ");
 538                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 539                                      vectype);
 540                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 541                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 542                                      vf_vectype);
 543                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 544                 }
 545               return false;
 546             }
 547
 548           if (dump_enabled_p ())
 549             {
 550               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 551               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 552               dump_printf (MSG_NOTE, "\n");
 553             }
 554
 555           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 556           if (dump_enabled_p ())
 557             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 558           if (!vectorization_factor
 559               || (nunits > vectorization_factor))
 560             vectorization_factor = nunits;
 561
 562           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 563             {
 564               pattern_def_seq = NULL;
 565               gsi_next (&si);
 566             }
 567         }
 568     }
 569
 570   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 571   if (dump_enabled_p ())
 572     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 573                      vectorization_factor);
 574   if (vectorization_factor <= 1)
 575     {
 576       if (dump_enabled_p ())
 577         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 578                          "not vectorized: unsupported data-type\n");
 579       return false;
 580     }
 581   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 582
 583   for (i = 0; i < mask_producers.length (); i++)
 584     {
 585       tree mask_type = NULL;
 586
 587       stmt = STMT_VINFO_STMT (mask_producers[i]);
 588
 589       if (gimple_code (stmt) == GIMPLE_ASSIGN
 590           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 591           && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
 592         {
 593           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 594           mask_type = get_mask_type_for_scalar_type (scalar_type);
 595
 596           if (!mask_type)
 597             {
 598               if (dump_enabled_p ())
 599                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 600                                  "not vectorized: unsupported mask\n");
 601               return false;
 602             }
 603         }
 604       else
 605         {
 606           tree rhs;
 607           ssa_op_iter iter;
 608           gimple *def_stmt;
 609           enum vect_def_type dt;
 610
 611           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 612             {
 613               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 614                                        &def_stmt, &dt, &vectype))
 615                 {
 616                   if (dump_enabled_p ())
 617                     {
 618                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 619                                        "not vectorized: can't compute mask type "
 620                                        "for statement, ");
 621                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 622                                         0);
 623                     }
 624                   return false;
 625                 }
 626
 627               /* No vectype probably means external definition.
 628                  Allow it in case there is another operand which
 629                  allows to determine mask type.  */
 630               if (!vectype)
 631                 continue;
 632
 633               if (!mask_type)
 634                 mask_type = vectype;
 635               else if (TYPE_VECTOR_SUBPARTS (mask_type)
 636                        != TYPE_VECTOR_SUBPARTS (vectype))
 637                 {
 638                   if (dump_enabled_p ())
 639                     {
 640                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 641                                        "not vectorized: different sized masks "
 642                                        "types in statement, ");
 643                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 644                                          mask_type);
 645                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 646                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 647                                          vectype);
 648                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 649                     }
 650                   return false;
 651                 }
 652               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 653                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 654                 {
 655                   if (dump_enabled_p ())
 656                     {
 657                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 658                                        "not vectorized: mixed mask and "
 659                                        "nonmask vector types in statement, ");
 660                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 661                                          mask_type);
 662                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 663                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 664                                          vectype);
 665                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 666                     }
 667                   return false;
 668                 }
 669             }
 670
 671           /* We may compare boolean value loaded as vector of integers.
 672              Fix mask_type in such case.  */
 673           if (mask_type
 674               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 675               && gimple_code (stmt) == GIMPLE_ASSIGN
 676               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 677             mask_type = build_same_sized_truth_vector_type (mask_type);
 678         }
 679
 680       /* No mask_type should mean loop invariant predicate.
 681          This is probably a subject for optimization in
 682          if-conversion.  */
 683       if (!mask_type)
 684         {
 685           if (dump_enabled_p ())
 686             {
 687               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 688                                "not vectorized: can't compute mask type "
 689                                "for statement, ");
 690               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 691                                 0);
 692             }
 693           return false;
 694         }
 695
 696       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 697     }
 698
 699   return true;
 700 }
 701
 702
 703 /* Function vect_is_simple_iv_evolution.
 704
 705    FORNOW: A simple evolution of an induction variables in the loop is
 706    considered a polynomial evolution.  */
 707
 708 static bool
 709 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 710                              tree * step)
 711 {
 712   tree init_expr;
 713   tree step_expr;
 714   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 715   basic_block bb;
 716
 717   /* When there is no evolution in this loop, the evolution function
 718      is not "simple".  */
 719   if (evolution_part == NULL_TREE)
 720     return false;
 721
 722   /* When the evolution is a polynomial of degree >= 2
 723      the evolution function is not "simple".  */
 724   if (tree_is_chrec (evolution_part))
 725     return false;
 726
 727   step_expr = evolution_part;
 728   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 729
 730   if (dump_enabled_p ())
 731     {
 732       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 733       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 734       dump_printf (MSG_NOTE, ",  init: ");
 735       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 736       dump_printf (MSG_NOTE, "\n");
 737     }
 738
 739   *init = init_expr;
 740   *step = step_expr;
 741
 742   if (TREE_CODE (step_expr) != INTEGER_CST
 743       && (TREE_CODE (step_expr) != SSA_NAME
 744           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 745               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 746           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 747               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 748                   || !flag_associative_math)))
 749       && (TREE_CODE (step_expr) != REAL_CST
 750           || !flag_associative_math))
 751     {
 752       if (dump_enabled_p ())
 753         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 754                          "step unknown.\n");
 755       return false;
 756     }
 757
 758   return true;
 759 }
 760
 761 /* Function vect_analyze_scalar_cycles_1.
 762
 763    Examine the cross iteration def-use cycles of scalar variables
 764    in LOOP.  LOOP_VINFO represents the loop that is now being
 765    considered for vectorization (can be LOOP, or an outer-loop
 766    enclosing LOOP).  */
 767
 768 static void
 769 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 770 {
 771   basic_block bb = loop->header;
 772   tree init, step;
 773   auto_vec<gimple *, 64> worklist;
 774   gphi_iterator gsi;
 775   bool double_reduc;
 776
 777   if (dump_enabled_p ())
 778     dump_printf_loc (MSG_NOTE, vect_location,
 779                      "=== vect_analyze_scalar_cycles ===\n");
 780
 781   /* First - identify all inductions.  Reduction detection assumes that all the
 782      inductions have been identified, therefore, this order must not be
 783      changed.  */
 784   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 785     {
 786       gphi *phi = gsi.phi ();
 787       tree access_fn = NULL;
 788       tree def = PHI_RESULT (phi);
 789       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 790
 791       if (dump_enabled_p ())
 792         {
 793           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 794           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 795         }
 796
 797       /* Skip virtual phi's.  The data dependences that are associated with
 798          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 799       if (virtual_operand_p (def))
 800         continue;
 801
 802       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 803
 804       /* Analyze the evolution function.  */
 805       access_fn = analyze_scalar_evolution (loop, def);
 806       if (access_fn)
 807         {
 808           STRIP_NOPS (access_fn);
 809           if (dump_enabled_p ())
 810             {
 811               dump_printf_loc (MSG_NOTE, vect_location,
 812                                "Access function of PHI: ");
 813               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 814               dump_printf (MSG_NOTE, "\n");
 815             }
 816           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 817             = initial_condition_in_loop_num (access_fn, loop->num);
 818           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 819             = evolution_part_in_loop_num (access_fn, loop->num);
 820         }
 821
 822       if (!access_fn
 823           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 824           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 825               && TREE_CODE (step) != INTEGER_CST))
 826         {
 827           worklist.safe_push (phi);
 828           continue;
 829         }
 830
 831       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 832                   != NULL_TREE);
 833       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 834
 835       if (dump_enabled_p ())
 836         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 837       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 838     }
 839
 840
 841   /* Second - identify all reductions and nested cycles.  */
 842   while (worklist.length () > 0)
 843     {
 844       gimple *phi = worklist.pop ();
 845       tree def = PHI_RESULT (phi);
 846       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 847       gimple *reduc_stmt;
 848       bool nested_cycle;
 849
 850       if (dump_enabled_p ())
 851         {
 852           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 853           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 854         }
 855
 856       gcc_assert (!virtual_operand_p (def)
 857                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 858
 859       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 860       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 861                                                 &double_reduc, false);
 862       if (reduc_stmt)
 863         {
 864           if (double_reduc)
 865             {
 866               if (dump_enabled_p ())
 867                 dump_printf_loc (MSG_NOTE, vect_location,
 868                                  "Detected double reduction.\n");
 869
 870               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 871               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 872                                                     vect_double_reduction_def;
 873             }
 874           else
 875             {
 876               if (nested_cycle)
 877                 {
 878                   if (dump_enabled_p ())
 879                     dump_printf_loc (MSG_NOTE, vect_location,
 880                                      "Detected vectorizable nested cycle.\n");
 881
 882                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 883                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 884                                                              vect_nested_cycle;
 885                 }
 886               else
 887                 {
 888                   if (dump_enabled_p ())
 889                     dump_printf_loc (MSG_NOTE, vect_location,
 890                                      "Detected reduction.\n");
 891
 892                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 893                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 894                                                            vect_reduction_def;
 895                   /* Store the reduction cycles for possible vectorization in
 896                      loop-aware SLP.  */
 897                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 898                 }
 899             }
 900         }
 901       else
 902         if (dump_enabled_p ())
 903           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 904                            "Unknown def-use cycle pattern.\n");
 905     }
 906 }
 907
 908
 909 /* Function vect_analyze_scalar_cycles.
 910
 911    Examine the cross iteration def-use cycles of scalar variables, by
 912    analyzing the loop-header PHIs of scalar variables.  Classify each
 913    cycle as one of the following: invariant, induction, reduction, unknown.
 914    We do that for the loop represented by LOOP_VINFO, and also to its
 915    inner-loop, if exists.
 916    Examples for scalar cycles:
 917
 918    Example1: reduction:
 919
 920               loop1:
 921               for (i=0; i<N; i++)
 922                  sum += a[i];
 923
 924    Example2: induction:
 925
 926               loop2:
 927               for (i=0; i<N; i++)
 928                  a[i] = i;  */
 929
 930 static void
 931 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 932 {
 933   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 934
 935   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 936
 937   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 938      Reductions in such inner-loop therefore have different properties than
 939      the reductions in the nest that gets vectorized:
 940      1. When vectorized, they are executed in the same order as in the original
 941         scalar loop, so we can't change the order of computation when
 942         vectorizing them.
 943      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 944         current checks are too strict.  */
 945
 946   if (loop->inner)
 947     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 948 }
 949
 950 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 951
 952 static void
 953 vect_fixup_reduc_chain (gimple *stmt)
 954 {
 955   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 956   gimple *stmtp;
 957   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 958               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 959   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 960   do
 961     {
 962       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 963       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 964       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 965       if (stmt)
 966         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 967           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 968     }
 969   while (stmt);
 970   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 971 }
 972
 973 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 974
 975 static void
 976 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 977 {
 978   gimple *first;
 979   unsigned i;
 980
 981   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 982     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 983       {
 984         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 985         while (next)
 986           {
 987             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 988               break;
 989             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 990           }
 991         /* If not all stmt in the chain are patterns try to handle
 992            the chain without patterns.  */
 993         if (! next)
 994           {
 995             vect_fixup_reduc_chain (first);
 996             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 997               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 998           }
 999       }
1000 }
1001
1002 /* Function vect_get_loop_niters.
1003
1004    Determine how many iterations the loop is executed and place it
1005    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1006    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1007    niter information holds in ASSUMPTIONS.
1008
1009    Return the loop exit condition.  */
1010
1011
1012 static gcond *
1013 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1014                       tree *number_of_iterations, tree *number_of_iterationsm1)
1015 {
1016   edge exit = single_exit (loop);
1017   struct tree_niter_desc niter_desc;
1018   tree niter_assumptions, niter, may_be_zero;
1019   gcond *cond = get_loop_exit_condition (loop);
1020
1021   *assumptions = boolean_true_node;
1022   *number_of_iterationsm1 = chrec_dont_know;
1023   *number_of_iterations = chrec_dont_know;
1024   if (dump_enabled_p ())
1025     dump_printf_loc (MSG_NOTE, vect_location,
1026                      "=== get_loop_niters ===\n");
1027
1028   if (!exit)
1029     return cond;
1030
1031   niter = chrec_dont_know;
1032   may_be_zero = NULL_TREE;
1033   niter_assumptions = boolean_true_node;
1034   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1035       || chrec_contains_undetermined (niter_desc.niter))
1036     return cond;
1037
1038   niter_assumptions = niter_desc.assumptions;
1039   may_be_zero = niter_desc.may_be_zero;
1040   niter = niter_desc.niter;
1041
1042   if (may_be_zero && integer_zerop (may_be_zero))
1043     may_be_zero = NULL_TREE;
1044
1045   if (may_be_zero)
1046     {
1047       if (COMPARISON_CLASS_P (may_be_zero))
1048         {
1049           /* Try to combine may_be_zero with assumptions, this can simplify
1050              computation of niter expression.  */
1051           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1052             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1053                                              niter_assumptions,
1054                                              fold_build1 (TRUTH_NOT_EXPR,
1055                                                           boolean_type_node,
1056                                                           may_be_zero));
1057           else
1058             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1059                                  build_int_cst (TREE_TYPE (niter), 0), niter);
1060
1061           may_be_zero = NULL_TREE;
1062         }
1063       else if (integer_nonzerop (may_be_zero))
1064         {
1065           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1066           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1067           return cond;
1068         }
1069       else
1070         return cond;
1071     }
1072
1073   *assumptions = niter_assumptions;
1074   *number_of_iterationsm1 = niter;
1075
1076   /* We want the number of loop header executions which is the number
1077      of latch executions plus one.
1078      ???  For UINT_MAX latch executions this number overflows to zero
1079      for loops like do { n++; } while (n != 0);  */
1080   if (niter && !chrec_contains_undetermined (niter))
1081     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1082                           build_int_cst (TREE_TYPE (niter), 1));
1083   *number_of_iterations = niter;
1084
1085   return cond;
1086 }
1087
1088 /* Function bb_in_loop_p
1089
1090    Used as predicate for dfs order traversal of the loop bbs.  */
1091
1092 static bool
1093 bb_in_loop_p (const_basic_block bb, const void *data)
1094 {
1095   const struct loop *const loop = (const struct loop *)data;
1096   if (flow_bb_inside_loop_p (loop, bb))
1097     return true;
1098   return false;
1099 }
1100
1101
1102 /* Function new_loop_vec_info.
1103
1104    Create and initialize a new loop_vec_info struct for LOOP, as well as
1105    stmt_vec_info structs for all the stmts in LOOP.  */
1106
1107 static loop_vec_info
1108 new_loop_vec_info (struct loop *loop)
1109 {
1110   loop_vec_info res;
1111   basic_block *bbs;
1112   gimple_stmt_iterator si;
1113   unsigned int i, nbbs;
1114
1115   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1116   res->kind = vec_info::loop;
1117   LOOP_VINFO_LOOP (res) = loop;
1118
1119   bbs = get_loop_body (loop);
1120
1121   /* Create/Update stmt_info for all stmts in the loop.  */
1122   for (i = 0; i < loop->num_nodes; i++)
1123     {
1124       basic_block bb = bbs[i];
1125
1126       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1127         {
1128           gimple *phi = gsi_stmt (si);
1129           gimple_set_uid (phi, 0);
1130           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1131         }
1132
1133       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1134         {
1135           gimple *stmt = gsi_stmt (si);
1136           gimple_set_uid (stmt, 0);
1137           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1138         }
1139     }
1140
1141   /* CHECKME: We want to visit all BBs before their successors (except for
1142      latch blocks, for which this assertion wouldn't hold).  In the simple
1143      case of the loop forms we allow, a dfs order of the BBs would the same
1144      as reversed postorder traversal, so we are safe.  */
1145
1146    free (bbs);
1147    bbs = XCNEWVEC (basic_block, loop->num_nodes);
1148    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1149                               bbs, loop->num_nodes, loop);
1150    gcc_assert (nbbs == loop->num_nodes);
1151
1152   LOOP_VINFO_BBS (res) = bbs;
1153   LOOP_VINFO_NITERSM1 (res) = NULL;
1154   LOOP_VINFO_NITERS (res) = NULL;
1155   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1156   LOOP_VINFO_NITERS_ASSUMPTIONS (res) = NULL;
1157   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1158   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1159   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1160   LOOP_VINFO_VECT_FACTOR (res) = 0;
1161   LOOP_VINFO_LOOP_NEST (res) = vNULL;
1162   LOOP_VINFO_DATAREFS (res) = vNULL;
1163   LOOP_VINFO_DDRS (res) = vNULL;
1164   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1165   LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
1166   LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
1167   LOOP_VINFO_GROUPED_STORES (res) = vNULL;
1168   LOOP_VINFO_REDUCTIONS (res) = vNULL;
1169   LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
1170   LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
1171   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1172   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1173   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1174   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1175   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1176
1177   return res;
1178 }
1179
1180
1181 /* Function destroy_loop_vec_info.
1182
1183    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1184    stmts in the loop.  */
1185
1186 void
1187 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1188 {
1189   struct loop *loop;
1190   basic_block *bbs;
1191   int nbbs;
1192   gimple_stmt_iterator si;
1193   int j;
1194   vec<slp_instance> slp_instances;
1195   slp_instance instance;
1196   bool swapped;
1197
1198   if (!loop_vinfo)
1199     return;
1200
1201   loop = LOOP_VINFO_LOOP (loop_vinfo);
1202
1203   bbs = LOOP_VINFO_BBS (loop_vinfo);
1204   nbbs = clean_stmts ? loop->num_nodes : 0;
1205   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1206
1207   for (j = 0; j < nbbs; j++)
1208     {
1209       basic_block bb = bbs[j];
1210       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1211         free_stmt_vec_info (gsi_stmt (si));
1212
1213       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1214         {
1215           gimple *stmt = gsi_stmt (si);
1216
1217           /* We may have broken canonical form by moving a constant
1218              into RHS1 of a commutative op.  Fix such occurrences.  */
1219           if (swapped && is_gimple_assign (stmt))
1220             {
1221               enum tree_code code = gimple_assign_rhs_code (stmt);
1222
1223               if ((code == PLUS_EXPR
1224                    || code == POINTER_PLUS_EXPR
1225                    || code == MULT_EXPR)
1226                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1227                 swap_ssa_operands (stmt,
1228                                    gimple_assign_rhs1_ptr (stmt),
1229                                    gimple_assign_rhs2_ptr (stmt));
1230             }
1231
1232           /* Free stmt_vec_info.  */
1233           free_stmt_vec_info (stmt);
1234           gsi_next (&si);
1235         }
1236     }
1237
1238   free (LOOP_VINFO_BBS (loop_vinfo));
1239   vect_destroy_datarefs (loop_vinfo);
1240   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1241   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1242   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1243   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
1244   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1245   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1246   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1247     vect_free_slp_instance (instance);
1248
1249   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1250   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1251   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1252   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1253
1254   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1255   loop_vinfo->scalar_cost_vec.release ();
1256
1257   free (loop_vinfo);
1258   loop->aux = NULL;
1259 }
1260
1261
1262 /* Calculate the cost of one scalar iteration of the loop.  */
1263 static void
1264 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1265 {
1266   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1267   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1268   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1269   int innerloop_iters, i;
1270
1271   /* Count statements in scalar loop.  Using this as scalar cost for a single
1272      iteration for now.
1273
1274      TODO: Add outer loop support.
1275
1276      TODO: Consider assigning different costs to different scalar
1277      statements.  */
1278
1279   /* FORNOW.  */
1280   innerloop_iters = 1;
1281   if (loop->inner)
1282     innerloop_iters = 50; /* FIXME */
1283
1284   for (i = 0; i < nbbs; i++)
1285     {
1286       gimple_stmt_iterator si;
1287       basic_block bb = bbs[i];
1288
1289       if (bb->loop_father == loop->inner)
1290         factor = innerloop_iters;
1291       else
1292         factor = 1;
1293
1294       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1295         {
1296           gimple *stmt = gsi_stmt (si);
1297           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1298
1299           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1300             continue;
1301
1302           /* Skip stmts that are not vectorized inside the loop.  */
1303           if (stmt_info
1304               && !STMT_VINFO_RELEVANT_P (stmt_info)
1305               && (!STMT_VINFO_LIVE_P (stmt_info)
1306                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1307               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1308             continue;
1309
1310           vect_cost_for_stmt kind;
1311           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1312             {
1313               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1314                kind = scalar_load;
1315              else
1316                kind = scalar_store;
1317             }
1318           else
1319             kind = scalar_stmt;
1320
1321           scalar_single_iter_cost
1322             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1323                                  factor, kind, NULL, 0, vect_prologue);
1324         }
1325     }
1326   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1327     = scalar_single_iter_cost;
1328 }
1329
1330
1331 /* Function vect_analyze_loop_form_1.
1332
1333    Verify that certain CFG restrictions hold, including:
1334    - the loop has a pre-header
1335    - the loop has a single entry and exit
1336    - the loop exit condition is simple enough
1337    - the number of iterations can be analyzed, i.e, a countable loop.  The
1338      niter could be analyzed under some assumptions.  */
1339
1340 bool
1341 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1342                           tree *assumptions, tree *number_of_iterationsm1,
1343                           tree *number_of_iterations, gcond **inner_loop_cond)
1344 {
1345   if (dump_enabled_p ())
1346     dump_printf_loc (MSG_NOTE, vect_location,
1347                      "=== vect_analyze_loop_form ===\n");
1348
1349   /* Different restrictions apply when we are considering an inner-most loop,
1350      vs. an outer (nested) loop.
1351      (FORNOW. May want to relax some of these restrictions in the future).  */
1352
1353   if (!loop->inner)
1354     {
1355       /* Inner-most loop.  We currently require that the number of BBs is
1356          exactly 2 (the header and latch).  Vectorizable inner-most loops
1357          look like this:
1358
1359                         (pre-header)
1360                            |
1361                           header <--------+
1362                            | |            |
1363                            | +--> latch --+
1364                            |
1365                         (exit-bb)  */
1366
1367       if (loop->num_nodes != 2)
1368         {
1369           if (dump_enabled_p ())
1370             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1371                              "not vectorized: control flow in loop.\n");
1372           return false;
1373         }
1374
1375       if (empty_block_p (loop->header))
1376         {
1377           if (dump_enabled_p ())
1378             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1379                              "not vectorized: empty loop.\n");
1380           return false;
1381         }
1382     }
1383   else
1384     {
1385       struct loop *innerloop = loop->inner;
1386       edge entryedge;
1387
1388       /* Nested loop. We currently require that the loop is doubly-nested,
1389          contains a single inner loop, and the number of BBs is exactly 5.
1390          Vectorizable outer-loops look like this:
1391
1392                         (pre-header)
1393                            |
1394                           header <---+
1395                            |         |
1396                           inner-loop |
1397                            |         |
1398                           tail ------+
1399                            |
1400                         (exit-bb)
1401
1402          The inner-loop has the properties expected of inner-most loops
1403          as described above.  */
1404
1405       if ((loop->inner)->inner || (loop->inner)->next)
1406         {
1407           if (dump_enabled_p ())
1408             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1409                              "not vectorized: multiple nested loops.\n");
1410           return false;
1411         }
1412
1413       if (loop->num_nodes != 5)
1414         {
1415           if (dump_enabled_p ())
1416             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1417                              "not vectorized: control flow in loop.\n");
1418           return false;
1419         }
1420
1421       entryedge = loop_preheader_edge (innerloop);
1422       if (entryedge->src != loop->header
1423           || !single_exit (innerloop)
1424           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1425         {
1426           if (dump_enabled_p ())
1427             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1428                              "not vectorized: unsupported outerloop form.\n");
1429           return false;
1430         }
1431
1432       /* Analyze the inner-loop.  */
1433       tree inner_niterm1, inner_niter, inner_assumptions;
1434       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1435                                       &inner_assumptions, &inner_niterm1,
1436                                       &inner_niter, NULL)
1437           /* Don't support analyzing niter under assumptions for inner
1438              loop.  */
1439           || !integer_onep (inner_assumptions))
1440         {
1441           if (dump_enabled_p ())
1442             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1443                              "not vectorized: Bad inner loop.\n");
1444           return false;
1445         }
1446
1447       if (!expr_invariant_in_loop_p (loop, inner_niter))
1448         {
1449           if (dump_enabled_p ())
1450             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1451                              "not vectorized: inner-loop count not"
1452                              " invariant.\n");
1453           return false;
1454         }
1455
1456       if (dump_enabled_p ())
1457         dump_printf_loc (MSG_NOTE, vect_location,
1458                          "Considering outer-loop vectorization.\n");
1459     }
1460
1461   if (!single_exit (loop)
1462       || EDGE_COUNT (loop->header->preds) != 2)
1463     {
1464       if (dump_enabled_p ())
1465         {
1466           if (!single_exit (loop))
1467             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1468                              "not vectorized: multiple exits.\n");
1469           else if (EDGE_COUNT (loop->header->preds) != 2)
1470             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1471                              "not vectorized: too many incoming edges.\n");
1472         }
1473       return false;
1474     }
1475
1476   /* We assume that the loop exit condition is at the end of the loop. i.e,
1477      that the loop is represented as a do-while (with a proper if-guard
1478      before the loop if needed), where the loop header contains all the
1479      executable statements, and the latch is empty.  */
1480   if (!empty_block_p (loop->latch)
1481       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1482     {
1483       if (dump_enabled_p ())
1484         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1485                          "not vectorized: latch block not empty.\n");
1486       return false;
1487     }
1488
1489   /* Make sure the exit is not abnormal.  */
1490   edge e = single_exit (loop);
1491   if (e->flags & EDGE_ABNORMAL)
1492     {
1493       if (dump_enabled_p ())
1494         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1495                          "not vectorized: abnormal loop exit edge.\n");
1496       return false;
1497     }
1498
1499   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1500                                      number_of_iterationsm1);
1501   if (!*loop_cond)
1502     {
1503       if (dump_enabled_p ())
1504         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1505                          "not vectorized: complicated exit condition.\n");
1506       return false;
1507     }
1508
1509   if (integer_zerop (*assumptions)
1510       || !*number_of_iterations
1511       || chrec_contains_undetermined (*number_of_iterations))
1512     {
1513       if (dump_enabled_p ())
1514         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1515                          "not vectorized: number of iterations cannot be "
1516                          "computed.\n");
1517       return false;
1518     }
1519
1520   if (integer_zerop (*number_of_iterations))
1521     {
1522       if (dump_enabled_p ())
1523         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1524                          "not vectorized: number of iterations = 0.\n");
1525       return false;
1526     }
1527
1528   return true;
1529 }
1530
1531 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1532
1533 loop_vec_info
1534 vect_analyze_loop_form (struct loop *loop)
1535 {
1536   tree assumptions, number_of_iterations, number_of_iterationsm1;
1537   gcond *loop_cond, *inner_loop_cond = NULL;
1538
1539   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1540                                   &assumptions, &number_of_iterationsm1,
1541                                   &number_of_iterations, &inner_loop_cond))
1542     return NULL;
1543
1544   loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1545   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1546   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1547   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1548   if (!integer_onep (assumptions))
1549     {
1550       /* We consider to vectorize this loop by versioning it under
1551          some assumptions.  In order to do this, we need to clear
1552          existing information computed by scev and niter analyzer.  */
1553       scev_reset_htab ();
1554       free_numbers_of_iterations_estimates_loop (loop);
1555       /* Also set flag for this loop so that following scev and niter
1556          analysis are done under the assumptions.  */
1557       loop_constraint_set (loop, LOOP_C_FINITE);
1558       /* Also record the assumptions for versioning.  */
1559       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1560     }
1561
1562   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1563     {
1564       if (dump_enabled_p ())
1565         {
1566           dump_printf_loc (MSG_NOTE, vect_location,
1567                            "Symbolic number of iterations is ");
1568           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1569           dump_printf (MSG_NOTE, "\n");
1570         }
1571     }
1572
1573   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1574   if (inner_loop_cond)
1575     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1576       = loop_exit_ctrl_vec_info_type;
1577
1578   gcc_assert (!loop->aux);
1579   loop->aux = loop_vinfo;
1580   return loop_vinfo;
1581 }
1582
1583
1584
1585 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1586    statements update the vectorization factor.  */
1587
1588 static void
1589 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1590 {
1591   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1592   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1593   int nbbs = loop->num_nodes;
1594   unsigned int vectorization_factor;
1595   int i;
1596
1597   if (dump_enabled_p ())
1598     dump_printf_loc (MSG_NOTE, vect_location,
1599                      "=== vect_update_vf_for_slp ===\n");
1600
1601   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1602   gcc_assert (vectorization_factor != 0);
1603
1604   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1605      vectorization factor of the loop is the unrolling factor required by
1606      the SLP instances.  If that unrolling factor is 1, we say, that we
1607      perform pure SLP on loop - cross iteration parallelism is not
1608      exploited.  */
1609   bool only_slp_in_loop = true;
1610   for (i = 0; i < nbbs; i++)
1611     {
1612       basic_block bb = bbs[i];
1613       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1614            gsi_next (&si))
1615         {
1616           gimple *stmt = gsi_stmt (si);
1617           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1618           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1619               && STMT_VINFO_RELATED_STMT (stmt_info))
1620             {
1621               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1622               stmt_info = vinfo_for_stmt (stmt);
1623             }
1624           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1625                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1626               && !PURE_SLP_STMT (stmt_info))
1627             /* STMT needs both SLP and loop-based vectorization.  */
1628             only_slp_in_loop = false;
1629         }
1630     }
1631
1632   if (only_slp_in_loop)
1633     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1634   else
1635     vectorization_factor
1636       = least_common_multiple (vectorization_factor,
1637                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1638
1639   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1640   if (dump_enabled_p ())
1641     dump_printf_loc (MSG_NOTE, vect_location,
1642                      "Updating vectorization factor to %d\n",
1643                      vectorization_factor);
1644 }
1645
1646 /* Function vect_analyze_loop_operations.
1647
1648    Scan the loop stmts and make sure they are all vectorizable.  */
1649
1650 static bool
1651 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1652 {
1653   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1654   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1655   int nbbs = loop->num_nodes;
1656   int i;
1657   stmt_vec_info stmt_info;
1658   bool need_to_vectorize = false;
1659   bool ok;
1660
1661   if (dump_enabled_p ())
1662     dump_printf_loc (MSG_NOTE, vect_location,
1663                      "=== vect_analyze_loop_operations ===\n");
1664
1665   for (i = 0; i < nbbs; i++)
1666     {
1667       basic_block bb = bbs[i];
1668
1669       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1670            gsi_next (&si))
1671         {
1672           gphi *phi = si.phi ();
1673           ok = true;
1674
1675           stmt_info = vinfo_for_stmt (phi);
1676           if (dump_enabled_p ())
1677             {
1678               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1679               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1680             }
1681           if (virtual_operand_p (gimple_phi_result (phi)))
1682             continue;
1683
1684           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1685              (i.e., a phi in the tail of the outer-loop).  */
1686           if (! is_loop_header_bb_p (bb))
1687             {
1688               /* FORNOW: we currently don't support the case that these phis
1689                  are not used in the outerloop (unless it is double reduction,
1690                  i.e., this phi is vect_reduction_def), cause this case
1691                  requires to actually do something here.  */
1692               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1693                    || STMT_VINFO_LIVE_P (stmt_info))
1694                   && STMT_VINFO_DEF_TYPE (stmt_info)
1695                      != vect_double_reduction_def)
1696                 {
1697                   if (dump_enabled_p ())
1698                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1699                                      "Unsupported loop-closed phi in "
1700                                      "outer-loop.\n");
1701                   return false;
1702                 }
1703
1704               /* If PHI is used in the outer loop, we check that its operand
1705                  is defined in the inner loop.  */
1706               if (STMT_VINFO_RELEVANT_P (stmt_info))
1707                 {
1708                   tree phi_op;
1709                   gimple *op_def_stmt;
1710
1711                   if (gimple_phi_num_args (phi) != 1)
1712                     return false;
1713
1714                   phi_op = PHI_ARG_DEF (phi, 0);
1715                   if (TREE_CODE (phi_op) != SSA_NAME)
1716                     return false;
1717
1718                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1719                   if (gimple_nop_p (op_def_stmt)
1720                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1721                       || !vinfo_for_stmt (op_def_stmt))
1722                     return false;
1723
1724                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1725                         != vect_used_in_outer
1726                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1727                            != vect_used_in_outer_by_reduction)
1728                     return false;
1729                 }
1730
1731               continue;
1732             }
1733
1734           gcc_assert (stmt_info);
1735
1736           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1737                || STMT_VINFO_LIVE_P (stmt_info))
1738               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1739             {
1740               /* A scalar-dependence cycle that we don't support.  */
1741               if (dump_enabled_p ())
1742                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1743                                  "not vectorized: scalar dependence cycle.\n");
1744               return false;
1745             }
1746
1747           if (STMT_VINFO_RELEVANT_P (stmt_info))
1748             {
1749               need_to_vectorize = true;
1750               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1751                 ok = vectorizable_induction (phi, NULL, NULL);
1752             }
1753
1754           if (ok && STMT_VINFO_LIVE_P (stmt_info))
1755             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1756
1757           if (!ok)
1758             {
1759               if (dump_enabled_p ())
1760                 {
1761                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1762                                    "not vectorized: relevant phi not "
1763                                    "supported: ");
1764                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1765                 }
1766               return false;
1767             }
1768         }
1769
1770       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1771            gsi_next (&si))
1772         {
1773           gimple *stmt = gsi_stmt (si);
1774           if (!gimple_clobber_p (stmt)
1775               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1776             return false;
1777         }
1778     } /* bbs */
1779
1780   /* All operations in the loop are either irrelevant (deal with loop
1781      control, or dead), or only used outside the loop and can be moved
1782      out of the loop (e.g. invariants, inductions).  The loop can be
1783      optimized away by scalar optimizations.  We're better off not
1784      touching this loop.  */
1785   if (!need_to_vectorize)
1786     {
1787       if (dump_enabled_p ())
1788         dump_printf_loc (MSG_NOTE, vect_location,
1789                          "All the computation can be taken out of the loop.\n");
1790       if (dump_enabled_p ())
1791         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1792                          "not vectorized: redundant loop. no profit to "
1793                          "vectorize.\n");
1794       return false;
1795     }
1796
1797   return true;
1798 }
1799
1800
1801 /* Function vect_analyze_loop_2.
1802
1803    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1804    for it.  The different analyses will record information in the
1805    loop_vec_info struct.  */
1806 static bool
1807 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1808 {
1809   bool ok;
1810   int max_vf = MAX_VECTORIZATION_FACTOR;
1811   int min_vf = 2;
1812   unsigned int n_stmts = 0;
1813
1814   /* The first group of checks is independent of the vector size.  */
1815   fatal = true;
1816
1817   /* Find all data references in the loop (which correspond to vdefs/vuses)
1818      and analyze their evolution in the loop.  */
1819
1820   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1821
1822   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1823   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1824     {
1825       if (dump_enabled_p ())
1826         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1827                          "not vectorized: loop nest containing two "
1828                          "or more consecutive inner loops cannot be "
1829                          "vectorized\n");
1830       return false;
1831     }
1832
1833   for (unsigned i = 0; i < loop->num_nodes; i++)
1834     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1835          !gsi_end_p (gsi); gsi_next (&gsi))
1836       {
1837         gimple *stmt = gsi_stmt (gsi);
1838         if (is_gimple_debug (stmt))
1839           continue;
1840         ++n_stmts;
1841         if (!find_data_references_in_stmt (loop, stmt,
1842                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1843           {
1844             if (is_gimple_call (stmt) && loop->safelen)
1845               {
1846                 tree fndecl = gimple_call_fndecl (stmt), op;
1847                 if (fndecl != NULL_TREE)
1848                   {
1849                     cgraph_node *node = cgraph_node::get (fndecl);
1850                     if (node != NULL && node->simd_clones != NULL)
1851                       {
1852                         unsigned int j, n = gimple_call_num_args (stmt);
1853                         for (j = 0; j < n; j++)
1854                           {
1855                             op = gimple_call_arg (stmt, j);
1856                             if (DECL_P (op)
1857                                 || (REFERENCE_CLASS_P (op)
1858                                     && get_base_address (op)))
1859                               break;
1860                           }
1861                         op = gimple_call_lhs (stmt);
1862                         /* Ignore #pragma omp declare simd functions
1863                            if they don't have data references in the
1864                            call stmt itself.  */
1865                         if (j == n
1866                             && !(op
1867                                  && (DECL_P (op)
1868                                      || (REFERENCE_CLASS_P (op)
1869                                          && get_base_address (op)))))
1870                           continue;
1871                       }
1872                   }
1873               }
1874             if (dump_enabled_p ())
1875               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1876                                "not vectorized: loop contains function "
1877                                "calls or data references that cannot "
1878                                "be analyzed\n");
1879             return false;
1880           }
1881       }
1882
1883   /* Analyze the data references and also adjust the minimal
1884      vectorization factor according to the loads and stores.  */
1885
1886   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1887   if (!ok)
1888     {
1889       if (dump_enabled_p ())
1890         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1891                          "bad data references.\n");
1892       return false;
1893     }
1894
1895   /* Classify all cross-iteration scalar data-flow cycles.
1896      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1897   vect_analyze_scalar_cycles (loop_vinfo);
1898
1899   vect_pattern_recog (loop_vinfo);
1900
1901   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1902
1903   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1904      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1905
1906   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1907   if (!ok)
1908     {
1909       if (dump_enabled_p ())
1910         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1911                          "bad data access.\n");
1912       return false;
1913     }
1914
1915   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1916
1917   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1918   if (!ok)
1919     {
1920       if (dump_enabled_p ())
1921         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1922                          "unexpected pattern.\n");
1923       return false;
1924     }
1925
1926   /* While the rest of the analysis below depends on it in some way.  */
1927   fatal = false;
1928
1929   /* Analyze data dependences between the data-refs in the loop
1930      and adjust the maximum vectorization factor according to
1931      the dependences.
1932      FORNOW: fail at the first data dependence that we encounter.  */
1933
1934   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1935   if (!ok
1936       || max_vf < min_vf)
1937     {
1938       if (dump_enabled_p ())
1939             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1940                              "bad data dependence.\n");
1941       return false;
1942     }
1943
1944   ok = vect_determine_vectorization_factor (loop_vinfo);
1945   if (!ok)
1946     {
1947       if (dump_enabled_p ())
1948         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1949                          "can't determine vectorization factor.\n");
1950       return false;
1951     }
1952   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1953     {
1954       if (dump_enabled_p ())
1955         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1956                          "bad data dependence.\n");
1957       return false;
1958     }
1959
1960   /* Compute the scalar iteration cost.  */
1961   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1962
1963   int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1964   HOST_WIDE_INT estimated_niter;
1965   unsigned th;
1966   int min_scalar_loop_bound;
1967
1968   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1969   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1970   if (!ok)
1971     return false;
1972
1973   /* If there are any SLP instances mark them as pure_slp.  */
1974   bool slp = vect_make_slp_decision (loop_vinfo);
1975   if (slp)
1976     {
1977       /* Find stmts that need to be both vectorized and SLPed.  */
1978       vect_detect_hybrid_slp (loop_vinfo);
1979
1980       /* Update the vectorization factor based on the SLP decision.  */
1981       vect_update_vf_for_slp (loop_vinfo);
1982     }
1983
1984   /* This is the point where we can re-start analysis with SLP forced off.  */
1985 start_over:
1986
1987   /* Now the vectorization factor is final.  */
1988   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1989   gcc_assert (vectorization_factor != 0);
1990
1991   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1992     dump_printf_loc (MSG_NOTE, vect_location,
1993                      "vectorization_factor = %d, niters = "
1994                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1995                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1996
1997   HOST_WIDE_INT max_niter
1998     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1999   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2000        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
2001       || (max_niter != -1
2002           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
2003     {
2004       if (dump_enabled_p ())
2005         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2006                          "not vectorized: iteration count smaller than "
2007                          "vectorization factor.\n");
2008       return false;
2009     }
2010
2011   /* Analyze the alignment of the data-refs in the loop.
2012      Fail if a data reference is found that cannot be vectorized.  */
2013
2014   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2015   if (!ok)
2016     {
2017       if (dump_enabled_p ())
2018         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2019                          "bad data alignment.\n");
2020       return false;
2021     }
2022
2023   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2024      It is important to call pruning after vect_analyze_data_ref_accesses,
2025      since we use grouping information gathered by interleaving analysis.  */
2026   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2027   if (!ok)
2028     return false;
2029
2030   /* This pass will decide on using loop versioning and/or loop peeling in
2031      order to enhance the alignment of data references in the loop.  */
2032   ok = vect_enhance_data_refs_alignment (loop_vinfo);
2033   if (!ok)
2034     {
2035       if (dump_enabled_p ())
2036         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2037                          "bad data alignment.\n");
2038       return false;
2039     }
2040
2041   if (slp)
2042     {
2043       /* Analyze operations in the SLP instances.  Note this may
2044          remove unsupported SLP instances which makes the above
2045          SLP kind detection invalid.  */
2046       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2047       vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
2048                                    LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2049       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2050         goto again;
2051     }
2052
2053   /* Scan all the remaining operations in the loop that are not subject
2054      to SLP and make sure they are vectorizable.  */
2055   ok = vect_analyze_loop_operations (loop_vinfo);
2056   if (!ok)
2057     {
2058       if (dump_enabled_p ())
2059         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2060                          "bad operation or unsupported loop bound.\n");
2061       return false;
2062     }
2063
2064   /* Analyze cost.  Decide if worth while to vectorize.  */
2065   int min_profitable_estimate, min_profitable_iters;
2066   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2067                                       &min_profitable_estimate);
2068
2069   if (min_profitable_iters < 0)
2070     {
2071       if (dump_enabled_p ())
2072         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2073                          "not vectorized: vectorization not profitable.\n");
2074       if (dump_enabled_p ())
2075         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2076                          "not vectorized: vector version will never be "
2077                          "profitable.\n");
2078       goto again;
2079     }
2080
2081   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2082                             * vectorization_factor) - 1);
2083
2084   /* Use the cost model only if it is more conservative than user specified
2085      threshold.  */
2086   th = (unsigned) min_scalar_loop_bound;
2087   if (min_profitable_iters
2088       && (!min_scalar_loop_bound
2089           || min_profitable_iters > min_scalar_loop_bound))
2090     th = (unsigned) min_profitable_iters;
2091
2092   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2093
2094   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2095       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2096     {
2097       if (dump_enabled_p ())
2098         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2099                          "not vectorized: vectorization not profitable.\n");
2100       if (dump_enabled_p ())
2101         dump_printf_loc (MSG_NOTE, vect_location,
2102                          "not vectorized: iteration count smaller than user "
2103                          "specified loop bound parameter or minimum profitable "
2104                          "iterations (whichever is more conservative).\n");
2105       goto again;
2106     }
2107
2108   estimated_niter
2109     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2110   if (estimated_niter == -1)
2111     estimated_niter = max_niter;
2112   if (estimated_niter != -1
2113       && ((unsigned HOST_WIDE_INT) estimated_niter
2114           <= MAX (th, (unsigned)min_profitable_estimate)))
2115     {
2116       if (dump_enabled_p ())
2117         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2118                          "not vectorized: estimated iteration count too "
2119                          "small.\n");
2120       if (dump_enabled_p ())
2121         dump_printf_loc (MSG_NOTE, vect_location,
2122                          "not vectorized: estimated iteration count smaller "
2123                          "than specified loop bound parameter or minimum "
2124                          "profitable iterations (whichever is more "
2125                          "conservative).\n");
2126       goto again;
2127     }
2128
2129   /* Decide whether we need to create an epilogue loop to handle
2130      remaining scalar iterations.  */
2131   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2132         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2133        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2134
2135   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2136       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2137     {
2138       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2139                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2140           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2141         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2142     }
2143   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2144            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2145                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2146                /* In case of versioning, check if the maximum number of
2147                   iterations is greater than th.  If they are identical,
2148                   the epilogue is unnecessary.  */
2149                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2150                    || (unsigned HOST_WIDE_INT) max_niter > th)))
2151     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2152
2153   /* If an epilogue loop is required make sure we can create one.  */
2154   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2155       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2156     {
2157       if (dump_enabled_p ())
2158         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2159       if (!vect_can_advance_ivs_p (loop_vinfo)
2160           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2161                                            single_exit (LOOP_VINFO_LOOP
2162                                                          (loop_vinfo))))
2163         {
2164           if (dump_enabled_p ())
2165             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2166                              "not vectorized: can't create required "
2167                              "epilog loop\n");
2168           goto again;
2169         }
2170     }
2171
2172   gcc_assert (vectorization_factor
2173               == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2174
2175   /* Ok to vectorize!  */
2176   return true;
2177
2178 again:
2179   /* Try again with SLP forced off but if we didn't do any SLP there is
2180      no point in re-trying.  */
2181   if (!slp)
2182     return false;
2183
2184   /* If there are reduction chains re-trying will fail anyway.  */
2185   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2186     return false;
2187
2188   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2189      via interleaving or lane instructions.  */
2190   slp_instance instance;
2191   slp_tree node;
2192   unsigned i, j;
2193   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2194     {
2195       stmt_vec_info vinfo;
2196       vinfo = vinfo_for_stmt
2197           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
2198       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2199         continue;
2200       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2201       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2202       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2203       if (! vect_store_lanes_supported (vectype, size)
2204           && ! vect_grouped_store_supported (vectype, size))
2205         return false;
2206       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2207         {
2208           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2209           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2210           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2211           size = STMT_VINFO_GROUP_SIZE (vinfo);
2212           vectype = STMT_VINFO_VECTYPE (vinfo);
2213           if (! vect_load_lanes_supported (vectype, size)
2214               && ! vect_grouped_load_supported (vectype, single_element_p,
2215                                                 size))
2216             return false;
2217         }
2218     }
2219
2220   if (dump_enabled_p ())
2221     dump_printf_loc (MSG_NOTE, vect_location,
2222                      "re-trying with SLP disabled\n");
2223
2224   /* Roll back state appropriately.  No SLP this time.  */
2225   slp = false;
2226   /* Restore vectorization factor as it were without SLP.  */
2227   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2228   /* Free the SLP instances.  */
2229   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2230     vect_free_slp_instance (instance);
2231   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2232   /* Reset SLP type to loop_vect on all stmts.  */
2233   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2234     {
2235       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2236       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2237            !gsi_end_p (si); gsi_next (&si))
2238         {
2239           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2240           STMT_SLP_TYPE (stmt_info) = loop_vect;
2241           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2242             {
2243               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2244               STMT_SLP_TYPE (stmt_info) = loop_vect;
2245               for (gimple_stmt_iterator pi
2246                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2247                    !gsi_end_p (pi); gsi_next (&pi))
2248                 {
2249                   gimple *pstmt = gsi_stmt (pi);
2250                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2251                 }
2252             }
2253         }
2254     }
2255   /* Free optimized alias test DDRS.  */
2256   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2257   /* Reset target cost data.  */
2258   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2259   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2260     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2261   /* Reset assorted flags.  */
2262   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2263   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2264   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2265
2266   goto start_over;
2267 }
2268
2269 /* Function vect_analyze_loop.
2270
2271    Apply a set of analyses on LOOP, and create a loop_vec_info struct
2272    for it.  The different analyses will record information in the
2273    loop_vec_info struct.  */
2274 loop_vec_info
2275 vect_analyze_loop (struct loop *loop)
2276 {
2277   loop_vec_info loop_vinfo;
2278   unsigned int vector_sizes;
2279
2280   /* Autodetect first vector size we try.  */
2281   current_vector_size = 0;
2282   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
2283
2284   if (dump_enabled_p ())
2285     dump_printf_loc (MSG_NOTE, vect_location,
2286                      "===== analyze_loop_nest =====\n");
2287
2288   if (loop_outer (loop)
2289       && loop_vec_info_for_loop (loop_outer (loop))
2290       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2291     {
2292       if (dump_enabled_p ())
2293         dump_printf_loc (MSG_NOTE, vect_location,
2294                          "outer-loop already vectorized.\n");
2295       return NULL;
2296     }
2297
2298   while (1)
2299     {
2300       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2301       loop_vinfo = vect_analyze_loop_form (loop);
2302       if (!loop_vinfo)
2303         {
2304           if (dump_enabled_p ())
2305             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2306                              "bad loop form.\n");
2307           return NULL;
2308         }
2309
2310       bool fatal = false;
2311       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2312         {
2313           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2314
2315           return loop_vinfo;
2316         }
2317
2318       destroy_loop_vec_info (loop_vinfo, true);
2319
2320       vector_sizes &= ~current_vector_size;
2321       if (fatal
2322           || vector_sizes == 0
2323           || current_vector_size == 0)
2324         return NULL;
2325
2326       /* Try the next biggest vector size.  */
2327       current_vector_size = 1 << floor_log2 (vector_sizes);
2328       if (dump_enabled_p ())
2329         dump_printf_loc (MSG_NOTE, vect_location,
2330                          "***** Re-trying analysis with "
2331                          "vector size %d\n", current_vector_size);
2332     }
2333 }
2334
2335
2336 /* Function reduction_code_for_scalar_code
2337
2338    Input:
2339    CODE - tree_code of a reduction operations.
2340
2341    Output:
2342    REDUC_CODE - the corresponding tree-code to be used to reduce the
2343       vector of partial results into a single scalar result, or ERROR_MARK
2344       if the operation is a supported reduction operation, but does not have
2345       such a tree-code.
2346
2347    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2348
2349 static bool
2350 reduction_code_for_scalar_code (enum tree_code code,
2351                                 enum tree_code *reduc_code)
2352 {
2353   switch (code)
2354     {
2355       case MAX_EXPR:
2356         *reduc_code = REDUC_MAX_EXPR;
2357         return true;
2358
2359       case MIN_EXPR:
2360         *reduc_code = REDUC_MIN_EXPR;
2361         return true;
2362
2363       case PLUS_EXPR:
2364         *reduc_code = REDUC_PLUS_EXPR;
2365         return true;
2366
2367       case MULT_EXPR:
2368       case MINUS_EXPR:
2369       case BIT_IOR_EXPR:
2370       case BIT_XOR_EXPR:
2371       case BIT_AND_EXPR:
2372         *reduc_code = ERROR_MARK;
2373         return true;
2374
2375       default:
2376        return false;
2377     }
2378 }
2379
2380
2381 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2382    STMT is printed with a message MSG. */
2383
2384 static void
2385 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2386 {
2387   dump_printf_loc (msg_type, vect_location, "%s", msg);
2388   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2389 }
2390
2391
2392 /* Detect SLP reduction of the form:
2393
2394    #a1 = phi <a5, a0>
2395    a2 = operation (a1)
2396    a3 = operation (a2)
2397    a4 = operation (a3)
2398    a5 = operation (a4)
2399
2400    #a = phi <a5>
2401
2402    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2403    FIRST_STMT is the first reduction stmt in the chain
2404    (a2 = operation (a1)).
2405
2406    Return TRUE if a reduction chain was detected.  */
2407
2408 static bool
2409 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2410                        gimple *first_stmt)
2411 {
2412   struct loop *loop = (gimple_bb (phi))->loop_father;
2413   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2414   enum tree_code code;
2415   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2416   stmt_vec_info use_stmt_info, current_stmt_info;
2417   tree lhs;
2418   imm_use_iterator imm_iter;
2419   use_operand_p use_p;
2420   int nloop_uses, size = 0, n_out_of_loop_uses;
2421   bool found = false;
2422
2423   if (loop != vect_loop)
2424     return false;
2425
2426   lhs = PHI_RESULT (phi);
2427   code = gimple_assign_rhs_code (first_stmt);
2428   while (1)
2429     {
2430       nloop_uses = 0;
2431       n_out_of_loop_uses = 0;
2432       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2433         {
2434           gimple *use_stmt = USE_STMT (use_p);
2435           if (is_gimple_debug (use_stmt))
2436             continue;
2437
2438           /* Check if we got back to the reduction phi.  */
2439           if (use_stmt == phi)
2440             {
2441               loop_use_stmt = use_stmt;
2442               found = true;
2443               break;
2444             }
2445
2446           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2447             {
2448               loop_use_stmt = use_stmt;
2449               nloop_uses++;
2450             }
2451            else
2452              n_out_of_loop_uses++;
2453
2454            /* There are can be either a single use in the loop or two uses in
2455               phi nodes.  */
2456            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2457              return false;
2458         }
2459
2460       if (found)
2461         break;
2462
2463       /* We reached a statement with no loop uses.  */
2464       if (nloop_uses == 0)
2465         return false;
2466
2467       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2468       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2469         return false;
2470
2471       if (!is_gimple_assign (loop_use_stmt)
2472           || code != gimple_assign_rhs_code (loop_use_stmt)
2473           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2474         return false;
2475
2476       /* Insert USE_STMT into reduction chain.  */
2477       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2478       if (current_stmt)
2479         {
2480           current_stmt_info = vinfo_for_stmt (current_stmt);
2481           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2482           GROUP_FIRST_ELEMENT (use_stmt_info)
2483             = GROUP_FIRST_ELEMENT (current_stmt_info);
2484         }
2485       else
2486         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2487
2488       lhs = gimple_assign_lhs (loop_use_stmt);
2489       current_stmt = loop_use_stmt;
2490       size++;
2491    }
2492
2493   if (!found || loop_use_stmt != phi || size < 2)
2494     return false;
2495
2496   /* Swap the operands, if needed, to make the reduction operand be the second
2497      operand.  */
2498   lhs = PHI_RESULT (phi);
2499   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2500   while (next_stmt)
2501     {
2502       if (gimple_assign_rhs2 (next_stmt) == lhs)
2503         {
2504           tree op = gimple_assign_rhs1 (next_stmt);
2505           gimple *def_stmt = NULL;
2506
2507           if (TREE_CODE (op) == SSA_NAME)
2508             def_stmt = SSA_NAME_DEF_STMT (op);
2509
2510           /* Check that the other def is either defined in the loop
2511              ("vect_internal_def"), or it's an induction (defined by a
2512              loop-header phi-node).  */
2513           if (def_stmt
2514               && gimple_bb (def_stmt)
2515               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2516               && (is_gimple_assign (def_stmt)
2517                   || is_gimple_call (def_stmt)
2518                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2519                            == vect_induction_def
2520                   || (gimple_code (def_stmt) == GIMPLE_PHI
2521                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2522                                   == vect_internal_def
2523                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2524             {
2525               lhs = gimple_assign_lhs (next_stmt);
2526               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2527               continue;
2528             }
2529
2530           return false;
2531         }
2532       else
2533         {
2534           tree op = gimple_assign_rhs2 (next_stmt);
2535           gimple *def_stmt = NULL;
2536
2537           if (TREE_CODE (op) == SSA_NAME)
2538             def_stmt = SSA_NAME_DEF_STMT (op);
2539
2540           /* Check that the other def is either defined in the loop
2541             ("vect_internal_def"), or it's an induction (defined by a
2542             loop-header phi-node).  */
2543           if (def_stmt
2544               && gimple_bb (def_stmt)
2545               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2546               && (is_gimple_assign (def_stmt)
2547                   || is_gimple_call (def_stmt)
2548                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2549                               == vect_induction_def
2550                   || (gimple_code (def_stmt) == GIMPLE_PHI
2551                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2552                                   == vect_internal_def
2553                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2554             {
2555               if (dump_enabled_p ())
2556                 {
2557                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2558                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2559                 }
2560
2561               swap_ssa_operands (next_stmt,
2562                                  gimple_assign_rhs1_ptr (next_stmt),
2563                                  gimple_assign_rhs2_ptr (next_stmt));
2564               update_stmt (next_stmt);
2565
2566               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2567                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2568             }
2569           else
2570             return false;
2571         }
2572
2573       lhs = gimple_assign_lhs (next_stmt);
2574       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2575     }
2576
2577   /* Save the chain for further analysis in SLP detection.  */
2578   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2579   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2580   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2581
2582   return true;
2583 }
2584
2585
2586 /* Function vect_is_simple_reduction_1
2587
2588    (1) Detect a cross-iteration def-use cycle that represents a simple
2589    reduction computation.  We look for the following pattern:
2590
2591    loop_header:
2592      a1 = phi < a0, a2 >
2593      a3 = ...
2594      a2 = operation (a3, a1)
2595
2596    or
2597
2598    a3 = ...
2599    loop_header:
2600      a1 = phi < a0, a2 >
2601      a2 = operation (a3, a1)
2602
2603    such that:
2604    1. operation is commutative and associative and it is safe to
2605       change the order of the computation (if CHECK_REDUCTION is true)
2606    2. no uses for a2 in the loop (a2 is used out of the loop)
2607    3. no uses of a1 in the loop besides the reduction operation
2608    4. no uses of a1 outside the loop.
2609
2610    Conditions 1,4 are tested here.
2611    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2612
2613    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2614    nested cycles, if CHECK_REDUCTION is false.
2615
2616    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2617    reductions:
2618
2619      a1 = phi < a0, a2 >
2620      inner loop (def of a3)
2621      a2 = phi < a3 >
2622
2623    (4) Detect condition expressions, ie:
2624      for (int i = 0; i < N; i++)
2625        if (a[i] < val)
2626         ret_val = a[i];
2627
2628 */
2629
2630 static gimple *
2631 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2632                           bool check_reduction, bool *double_reduc,
2633                           bool need_wrapping_integral_overflow,
2634                           enum vect_reduction_type *v_reduc_type)
2635 {
2636   struct loop *loop = (gimple_bb (phi))->loop_father;
2637   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2638   edge latch_e = loop_latch_edge (loop);
2639   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2640   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2641   enum tree_code orig_code, code;
2642   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2643   tree type;
2644   int nloop_uses;
2645   tree name;
2646   imm_use_iterator imm_iter;
2647   use_operand_p use_p;
2648   bool phi_def;
2649
2650   *double_reduc = false;
2651   *v_reduc_type = TREE_CODE_REDUCTION;
2652
2653   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2654      otherwise, we assume outer loop vectorization.  */
2655   gcc_assert ((check_reduction && loop == vect_loop)
2656               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2657
2658   name = PHI_RESULT (phi);
2659   /* ???  If there are no uses of the PHI result the inner loop reduction
2660      won't be detected as possibly double-reduction by vectorizable_reduction
2661      because that tries to walk the PHI arg from the preheader edge which
2662      can be constant.  See PR60382.  */
2663   if (has_zero_uses (name))
2664     return NULL;
2665   nloop_uses = 0;
2666   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2667     {
2668       gimple *use_stmt = USE_STMT (use_p);
2669       if (is_gimple_debug (use_stmt))
2670         continue;
2671
2672       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2673         {
2674           if (dump_enabled_p ())
2675             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2676                              "intermediate value used outside loop.\n");
2677
2678           return NULL;
2679         }
2680
2681       nloop_uses++;
2682       if (nloop_uses > 1)
2683         {
2684           if (dump_enabled_p ())
2685             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2686                              "reduction used in loop.\n");
2687           return NULL;
2688         }
2689
2690       phi_use_stmt = use_stmt;
2691     }
2692
2693   if (TREE_CODE (loop_arg) != SSA_NAME)
2694     {
2695       if (dump_enabled_p ())
2696         {
2697           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2698                            "reduction: not ssa_name: ");
2699           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2700           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2701         }
2702       return NULL;
2703     }
2704
2705   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2706   if (!def_stmt)
2707     {
2708       if (dump_enabled_p ())
2709         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2710                          "reduction: no def_stmt.\n");
2711       return NULL;
2712     }
2713
2714   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2715     {
2716       if (dump_enabled_p ())
2717         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2718       return NULL;
2719     }
2720
2721   if (is_gimple_assign (def_stmt))
2722     {
2723       name = gimple_assign_lhs (def_stmt);
2724       phi_def = false;
2725     }
2726   else
2727     {
2728       name = PHI_RESULT (def_stmt);
2729       phi_def = true;
2730     }
2731
2732   nloop_uses = 0;
2733   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2734     {
2735       gimple *use_stmt = USE_STMT (use_p);
2736       if (is_gimple_debug (use_stmt))
2737         continue;
2738       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2739         nloop_uses++;
2740       if (nloop_uses > 1)
2741         {
2742           if (dump_enabled_p ())
2743             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2744                              "reduction used in loop.\n");
2745           return NULL;
2746         }
2747     }
2748
2749   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2750      defined in the inner loop.  */
2751   if (phi_def)
2752     {
2753       op1 = PHI_ARG_DEF (def_stmt, 0);
2754
2755       if (gimple_phi_num_args (def_stmt) != 1
2756           || TREE_CODE (op1) != SSA_NAME)
2757         {
2758           if (dump_enabled_p ())
2759             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2760                              "unsupported phi node definition.\n");
2761
2762           return NULL;
2763         }
2764
2765       def1 = SSA_NAME_DEF_STMT (op1);
2766       if (gimple_bb (def1)
2767           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2768           && loop->inner
2769           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2770           && is_gimple_assign (def1)
2771           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2772         {
2773           if (dump_enabled_p ())
2774             report_vect_op (MSG_NOTE, def_stmt,
2775                             "detected double reduction: ");
2776
2777           *double_reduc = true;
2778           return def_stmt;
2779         }
2780
2781       return NULL;
2782     }
2783
2784   code = orig_code = gimple_assign_rhs_code (def_stmt);
2785
2786   /* We can handle "res -= x[i]", which is non-associative by
2787      simply rewriting this into "res += -x[i]".  Avoid changing
2788      gimple instruction for the first simple tests and only do this
2789      if we're allowed to change code at all.  */
2790   if (code == MINUS_EXPR
2791       && (op1 = gimple_assign_rhs1 (def_stmt))
2792       && TREE_CODE (op1) == SSA_NAME
2793       && SSA_NAME_DEF_STMT (op1) == phi)
2794     code = PLUS_EXPR;
2795
2796   if (code == COND_EXPR)
2797     {
2798       if (check_reduction)
2799         *v_reduc_type = COND_REDUCTION;
2800     }
2801   else if (!commutative_tree_code (code) || !associative_tree_code (code))
2802     {
2803       if (dump_enabled_p ())
2804         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2805                         "reduction: not commutative/associative: ");
2806       return NULL;
2807     }
2808
2809   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2810     {
2811       if (code != COND_EXPR)
2812         {
2813           if (dump_enabled_p ())
2814             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2815                             "reduction: not binary operation: ");
2816
2817           return NULL;
2818         }
2819
2820       op3 = gimple_assign_rhs1 (def_stmt);
2821       if (COMPARISON_CLASS_P (op3))
2822         {
2823           op4 = TREE_OPERAND (op3, 1);
2824           op3 = TREE_OPERAND (op3, 0);
2825         }
2826
2827       op1 = gimple_assign_rhs2 (def_stmt);
2828       op2 = gimple_assign_rhs3 (def_stmt);
2829
2830       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2831         {
2832           if (dump_enabled_p ())
2833             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2834                             "reduction: uses not ssa_names: ");
2835
2836           return NULL;
2837         }
2838     }
2839   else
2840     {
2841       op1 = gimple_assign_rhs1 (def_stmt);
2842       op2 = gimple_assign_rhs2 (def_stmt);
2843
2844       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2845         {
2846           if (dump_enabled_p ())
2847             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2848                             "reduction: uses not ssa_names: ");
2849
2850           return NULL;
2851         }
2852    }
2853
2854   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2855   if ((TREE_CODE (op1) == SSA_NAME
2856        && !types_compatible_p (type,TREE_TYPE (op1)))
2857       || (TREE_CODE (op2) == SSA_NAME
2858           && !types_compatible_p (type, TREE_TYPE (op2)))
2859       || (op3 && TREE_CODE (op3) == SSA_NAME
2860           && !types_compatible_p (type, TREE_TYPE (op3)))
2861       || (op4 && TREE_CODE (op4) == SSA_NAME
2862           && !types_compatible_p (type, TREE_TYPE (op4))))
2863     {
2864       if (dump_enabled_p ())
2865         {
2866           dump_printf_loc (MSG_NOTE, vect_location,
2867                            "reduction: multiple types: operation type: ");
2868           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2869           dump_printf (MSG_NOTE, ", operands types: ");
2870           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2871                              TREE_TYPE (op1));
2872           dump_printf (MSG_NOTE, ",");
2873           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2874                              TREE_TYPE (op2));
2875           if (op3)
2876             {
2877               dump_printf (MSG_NOTE, ",");
2878               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2879                                  TREE_TYPE (op3));
2880             }
2881
2882           if (op4)
2883             {
2884               dump_printf (MSG_NOTE, ",");
2885               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2886                                  TREE_TYPE (op4));
2887             }
2888           dump_printf (MSG_NOTE, "\n");
2889         }
2890
2891       return NULL;
2892     }
2893
2894   /* Check that it's ok to change the order of the computation.
2895      Generally, when vectorizing a reduction we change the order of the
2896      computation.  This may change the behavior of the program in some
2897      cases, so we need to check that this is ok.  One exception is when
2898      vectorizing an outer-loop: the inner-loop is executed sequentially,
2899      and therefore vectorizing reductions in the inner-loop during
2900      outer-loop vectorization is safe.  */
2901
2902   if (*v_reduc_type != COND_REDUCTION
2903       && check_reduction)
2904     {
2905       /* CHECKME: check for !flag_finite_math_only too?  */
2906       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2907         {
2908           /* Changing the order of operations changes the semantics.  */
2909           if (dump_enabled_p ())
2910             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2911                         "reduction: unsafe fp math optimization: ");
2912           return NULL;
2913         }
2914       else if (INTEGRAL_TYPE_P (type))
2915         {
2916           if (!operation_no_trapping_overflow (type, code))
2917             {
2918               /* Changing the order of operations changes the semantics.  */
2919               if (dump_enabled_p ())
2920                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2921                                 "reduction: unsafe int math optimization"
2922                                 " (overflow traps): ");
2923               return NULL;
2924             }
2925           if (need_wrapping_integral_overflow
2926               && !TYPE_OVERFLOW_WRAPS (type)
2927               && operation_can_overflow (code))
2928             {
2929               /* Changing the order of operations changes the semantics.  */
2930               if (dump_enabled_p ())
2931                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2932                                 "reduction: unsafe int math optimization"
2933                                 " (overflow doesn't wrap): ");
2934               return NULL;
2935             }
2936         }
2937       else if (SAT_FIXED_POINT_TYPE_P (type))
2938         {
2939           /* Changing the order of operations changes the semantics.  */
2940           if (dump_enabled_p ())
2941           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2942                           "reduction: unsafe fixed-point math optimization: ");
2943           return NULL;
2944         }
2945     }
2946
2947   /* Reduction is safe. We're dealing with one of the following:
2948      1) integer arithmetic and no trapv
2949      2) floating point arithmetic, and special flags permit this optimization
2950      3) nested cycle (i.e., outer loop vectorization).  */
2951   if (TREE_CODE (op1) == SSA_NAME)
2952     def1 = SSA_NAME_DEF_STMT (op1);
2953
2954   if (TREE_CODE (op2) == SSA_NAME)
2955     def2 = SSA_NAME_DEF_STMT (op2);
2956
2957   if (code != COND_EXPR
2958       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2959     {
2960       if (dump_enabled_p ())
2961         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2962       return NULL;
2963     }
2964
2965   /* Check that one def is the reduction def, defined by PHI,
2966      the other def is either defined in the loop ("vect_internal_def"),
2967      or it's an induction (defined by a loop-header phi-node).  */
2968
2969   if (def2 && def2 == phi
2970       && (code == COND_EXPR
2971           || !def1 || gimple_nop_p (def1)
2972           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2973           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2974               && (is_gimple_assign (def1)
2975                   || is_gimple_call (def1)
2976                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2977                       == vect_induction_def
2978                   || (gimple_code (def1) == GIMPLE_PHI
2979                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2980                           == vect_internal_def
2981                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2982     {
2983       if (dump_enabled_p ())
2984         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2985       return def_stmt;
2986     }
2987
2988   if (def1 && def1 == phi
2989       && (code == COND_EXPR
2990           || !def2 || gimple_nop_p (def2)
2991           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2992           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2993               && (is_gimple_assign (def2)
2994                   || is_gimple_call (def2)
2995                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2996                       == vect_induction_def
2997                   || (gimple_code (def2) == GIMPLE_PHI
2998                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2999                           == vect_internal_def
3000                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3001     {
3002       if (check_reduction
3003           && orig_code != MINUS_EXPR)
3004         {
3005           if (code == COND_EXPR)
3006             {
3007               /* No current known use where this case would be useful.  */
3008               if (dump_enabled_p ())
3009                 report_vect_op (MSG_NOTE, def_stmt,
3010                                 "detected reduction: cannot currently swap "
3011                                 "operands for cond_expr");
3012               return NULL;
3013             }
3014
3015           /* Swap operands (just for simplicity - so that the rest of the code
3016              can assume that the reduction variable is always the last (second)
3017              argument).  */
3018           if (dump_enabled_p ())
3019             report_vect_op (MSG_NOTE, def_stmt,
3020                             "detected reduction: need to swap operands: ");
3021
3022           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3023                              gimple_assign_rhs2_ptr (def_stmt));
3024
3025           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3026             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3027         }
3028       else
3029         {
3030           if (dump_enabled_p ())
3031             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3032         }
3033
3034       return def_stmt;
3035     }
3036
3037   /* Try to find SLP reduction chain.  */
3038   if (check_reduction && code != COND_EXPR
3039       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3040     {
3041       if (dump_enabled_p ())
3042         report_vect_op (MSG_NOTE, def_stmt,
3043                         "reduction: detected reduction chain: ");
3044
3045       return def_stmt;
3046     }
3047
3048   if (dump_enabled_p ())
3049     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3050                     "reduction: unknown pattern: ");
3051
3052   return NULL;
3053 }
3054
3055 /* Wrapper around vect_is_simple_reduction_1, which will modify code
3056    in-place if it enables detection of more reductions.  Arguments
3057    as there.  */
3058
3059 gimple *
3060 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3061                              bool check_reduction, bool *double_reduc,
3062                              bool need_wrapping_integral_overflow)
3063 {
3064   enum vect_reduction_type v_reduc_type;
3065   return vect_is_simple_reduction (loop_info, phi, check_reduction,
3066                                    double_reduc,
3067                                    need_wrapping_integral_overflow,
3068                                    &v_reduc_type);
3069 }
3070
3071 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
3072 int
3073 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3074                              int *peel_iters_epilogue,
3075                              stmt_vector_for_cost *scalar_cost_vec,
3076                              stmt_vector_for_cost *prologue_cost_vec,
3077                              stmt_vector_for_cost *epilogue_cost_vec)
3078 {
3079   int retval = 0;
3080   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3081
3082   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3083     {
3084       *peel_iters_epilogue = vf/2;
3085       if (dump_enabled_p ())
3086         dump_printf_loc (MSG_NOTE, vect_location,
3087                          "cost model: epilogue peel iters set to vf/2 "
3088                          "because loop iterations are unknown .\n");
3089
3090       /* If peeled iterations are known but number of scalar loop
3091          iterations are unknown, count a taken branch per peeled loop.  */
3092       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3093                                  NULL, 0, vect_prologue);
3094       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3095                                  NULL, 0, vect_epilogue);
3096     }
3097   else
3098     {
3099       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3100       peel_iters_prologue = niters < peel_iters_prologue ?
3101                             niters : peel_iters_prologue;
3102       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3103       /* If we need to peel for gaps, but no peeling is required, we have to
3104          peel VF iterations.  */
3105       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3106         *peel_iters_epilogue = vf;
3107     }
3108
3109   stmt_info_for_cost *si;
3110   int j;
3111   if (peel_iters_prologue)
3112     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3113       retval += record_stmt_cost (prologue_cost_vec,
3114                                   si->count * peel_iters_prologue,
3115                                   si->kind, NULL, si->misalign,
3116                                   vect_prologue);
3117   if (*peel_iters_epilogue)
3118     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3119       retval += record_stmt_cost (epilogue_cost_vec,
3120                                   si->count * *peel_iters_epilogue,
3121                                   si->kind, NULL, si->misalign,
3122                                   vect_epilogue);
3123
3124   return retval;
3125 }
3126
3127 /* Function vect_estimate_min_profitable_iters
3128
3129    Return the number of iterations required for the vector version of the
3130    loop to be profitable relative to the cost of the scalar version of the
3131    loop.
3132
3133    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3134    of iterations for vectorization.  -1 value means loop vectorization
3135    is not profitable.  This returned value may be used for dynamic
3136    profitability check.
3137
3138    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3139    for static check against estimated number of iterations.  */
3140
3141 static void
3142 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3143                                     int *ret_min_profitable_niters,
3144                                     int *ret_min_profitable_estimate)
3145 {
3146   int min_profitable_iters;
3147   int min_profitable_estimate;
3148   int peel_iters_prologue;
3149   int peel_iters_epilogue;
3150   unsigned vec_inside_cost = 0;
3151   int vec_outside_cost = 0;
3152   unsigned vec_prologue_cost = 0;
3153   unsigned vec_epilogue_cost = 0;
3154   int scalar_single_iter_cost = 0;
3155   int scalar_outside_cost = 0;
3156   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3157   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3158   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3159
3160   /* Cost model disabled.  */
3161   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3162     {
3163       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3164       *ret_min_profitable_niters = 0;
3165       *ret_min_profitable_estimate = 0;
3166       return;
3167     }
3168
3169   /* Requires loop versioning tests to handle misalignment.  */
3170   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3171     {
3172       /*  FIXME: Make cost depend on complexity of individual check.  */
3173       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3174       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3175                             vect_prologue);
3176       dump_printf (MSG_NOTE,
3177                    "cost model: Adding cost of checks for loop "
3178                    "versioning to treat misalignment.\n");
3179     }
3180
3181   /* Requires loop versioning with alias checks.  */
3182   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3183     {
3184       /*  FIXME: Make cost depend on complexity of individual check.  */
3185       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3186       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3187                             vect_prologue);
3188       dump_printf (MSG_NOTE,
3189                    "cost model: Adding cost of checks for loop "
3190                    "versioning aliasing.\n");
3191     }
3192
3193   /* Requires loop versioning with niter checks.  */
3194   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3195     {
3196       /*  FIXME: Make cost depend on complexity of individual check.  */
3197       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3198                             vect_prologue);
3199       dump_printf (MSG_NOTE,
3200                    "cost model: Adding cost of checks for loop "
3201                    "versioning niters.\n");
3202     }
3203
3204   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3205     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3206                           vect_prologue);
3207
3208   /* Count statements in scalar loop.  Using this as scalar cost for a single
3209      iteration for now.
3210
3211      TODO: Add outer loop support.
3212
3213      TODO: Consider assigning different costs to different scalar
3214      statements.  */
3215
3216   scalar_single_iter_cost
3217     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3218
3219   /* Add additional cost for the peeled instructions in prologue and epilogue
3220      loop.
3221
3222      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3223      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3224
3225      TODO: Build an expression that represents peel_iters for prologue and
3226      epilogue to be used in a run-time test.  */
3227
3228   if (npeel  < 0)
3229     {
3230       peel_iters_prologue = vf/2;
3231       dump_printf (MSG_NOTE, "cost model: "
3232                    "prologue peel iters set to vf/2.\n");
3233
3234       /* If peeling for alignment is unknown, loop bound of main loop becomes
3235          unknown.  */
3236       peel_iters_epilogue = vf/2;
3237       dump_printf (MSG_NOTE, "cost model: "
3238                    "epilogue peel iters set to vf/2 because "
3239                    "peeling for alignment is unknown.\n");
3240
3241       /* If peeled iterations are unknown, count a taken branch and a not taken
3242          branch per peeled loop. Even if scalar loop iterations are known,
3243          vector iterations are not known since peeled prologue iterations are
3244          not known. Hence guards remain the same.  */
3245       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3246                             NULL, 0, vect_prologue);
3247       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3248                             NULL, 0, vect_prologue);
3249       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3250                             NULL, 0, vect_epilogue);
3251       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3252                             NULL, 0, vect_epilogue);
3253       stmt_info_for_cost *si;
3254       int j;
3255       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3256         {
3257           struct _stmt_vec_info *stmt_info
3258             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3259           (void) add_stmt_cost (target_cost_data,
3260                                 si->count * peel_iters_prologue,
3261                                 si->kind, stmt_info, si->misalign,
3262                                 vect_prologue);
3263           (void) add_stmt_cost (target_cost_data,
3264                                 si->count * peel_iters_epilogue,
3265                                 si->kind, stmt_info, si->misalign,
3266                                 vect_epilogue);
3267         }
3268     }
3269   else
3270     {
3271       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3272       stmt_info_for_cost *si;
3273       int j;
3274       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3275
3276       prologue_cost_vec.create (2);
3277       epilogue_cost_vec.create (2);
3278       peel_iters_prologue = npeel;
3279
3280       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3281                                           &peel_iters_epilogue,
3282                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3283                                             (loop_vinfo),
3284                                           &prologue_cost_vec,
3285                                           &epilogue_cost_vec);
3286
3287       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3288         {
3289           struct _stmt_vec_info *stmt_info
3290             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3291           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3292                                 si->misalign, vect_prologue);
3293         }
3294
3295       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3296         {
3297           struct _stmt_vec_info *stmt_info
3298             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3299           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3300                                 si->misalign, vect_epilogue);
3301         }
3302
3303       prologue_cost_vec.release ();
3304       epilogue_cost_vec.release ();
3305     }
3306
3307   /* FORNOW: The scalar outside cost is incremented in one of the
3308      following ways:
3309
3310      1. The vectorizer checks for alignment and aliasing and generates
3311      a condition that allows dynamic vectorization.  A cost model
3312      check is ANDED with the versioning condition.  Hence scalar code
3313      path now has the added cost of the versioning check.
3314
3315        if (cost > th & versioning_check)
3316          jmp to vector code
3317
3318      Hence run-time scalar is incremented by not-taken branch cost.
3319
3320      2. The vectorizer then checks if a prologue is required.  If the
3321      cost model check was not done before during versioning, it has to
3322      be done before the prologue check.
3323
3324        if (cost <= th)
3325          prologue = scalar_iters
3326        if (prologue == 0)
3327          jmp to vector code
3328        else
3329          execute prologue
3330        if (prologue == num_iters)
3331          go to exit
3332
3333      Hence the run-time scalar cost is incremented by a taken branch,
3334      plus a not-taken branch, plus a taken branch cost.
3335
3336      3. The vectorizer then checks if an epilogue is required.  If the
3337      cost model check was not done before during prologue check, it
3338      has to be done with the epilogue check.
3339
3340        if (prologue == 0)
3341          jmp to vector code
3342        else
3343          execute prologue
3344        if (prologue == num_iters)
3345          go to exit
3346        vector code:
3347          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3348            jmp to epilogue
3349
3350      Hence the run-time scalar cost should be incremented by 2 taken
3351      branches.
3352
3353      TODO: The back end may reorder the BBS's differently and reverse
3354      conditions/branch directions.  Change the estimates below to
3355      something more reasonable.  */
3356
3357   /* If the number of iterations is known and we do not do versioning, we can
3358      decide whether to vectorize at compile time.  Hence the scalar version
3359      do not carry cost model guard costs.  */
3360   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3361       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3362     {
3363       /* Cost model check occurs at versioning.  */
3364       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3365         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3366       else
3367         {
3368           /* Cost model check occurs at prologue generation.  */
3369           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3370             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3371               + vect_get_stmt_cost (cond_branch_not_taken);
3372           /* Cost model check occurs at epilogue generation.  */
3373           else
3374             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3375         }
3376     }
3377
3378   /* Complete the target-specific cost calculations.  */
3379   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3380                &vec_inside_cost, &vec_epilogue_cost);
3381
3382   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3383
3384   if (dump_enabled_p ())
3385     {
3386       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3387       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3388                    vec_inside_cost);
3389       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3390                    vec_prologue_cost);
3391       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3392                    vec_epilogue_cost);
3393       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3394                    scalar_single_iter_cost);
3395       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3396                    scalar_outside_cost);
3397       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3398                    vec_outside_cost);
3399       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3400                    peel_iters_prologue);
3401       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3402                    peel_iters_epilogue);
3403     }
3404
3405   /* Calculate number of iterations required to make the vector version
3406      profitable, relative to the loop bodies only.  The following condition
3407      must hold true:
3408      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3409      where
3410      SIC = scalar iteration cost, VIC = vector iteration cost,
3411      VOC = vector outside cost, VF = vectorization factor,
3412      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3413      SOC = scalar outside cost for run time cost model check.  */
3414
3415   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3416     {
3417       if (vec_outside_cost <= 0)
3418         min_profitable_iters = 1;
3419       else
3420         {
3421           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3422                                   - vec_inside_cost * peel_iters_prologue
3423                                   - vec_inside_cost * peel_iters_epilogue)
3424                                  / ((scalar_single_iter_cost * vf)
3425                                     - vec_inside_cost);
3426
3427           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3428               <= (((int) vec_inside_cost * min_profitable_iters)
3429                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3430             min_profitable_iters++;
3431         }
3432     }
3433   /* vector version will never be profitable.  */
3434   else
3435     {
3436       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3437         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3438                     "did not happen for a simd loop");
3439
3440       if (dump_enabled_p ())
3441         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3442                          "cost model: the vector iteration cost = %d "
3443                          "divided by the scalar iteration cost = %d "
3444                          "is greater or equal to the vectorization factor = %d"
3445                          ".\n",
3446                          vec_inside_cost, scalar_single_iter_cost, vf);
3447       *ret_min_profitable_niters = -1;
3448       *ret_min_profitable_estimate = -1;
3449       return;
3450     }
3451
3452   dump_printf (MSG_NOTE,
3453                "  Calculated minimum iters for profitability: %d\n",
3454                min_profitable_iters);
3455
3456   min_profitable_iters =
3457         min_profitable_iters < vf ? vf : min_profitable_iters;
3458
3459   /* Because the condition we create is:
3460      if (niters <= min_profitable_iters)
3461        then skip the vectorized loop.  */
3462   min_profitable_iters--;
3463
3464   if (dump_enabled_p ())
3465     dump_printf_loc (MSG_NOTE, vect_location,
3466                      "  Runtime profitability threshold = %d\n",
3467                      min_profitable_iters);
3468
3469   *ret_min_profitable_niters = min_profitable_iters;
3470
3471   /* Calculate number of iterations required to make the vector version
3472      profitable, relative to the loop bodies only.
3473
3474      Non-vectorized variant is SIC * niters and it must win over vector
3475      variant on the expected loop trip count.  The following condition must hold true:
3476      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3477
3478   if (vec_outside_cost <= 0)
3479     min_profitable_estimate = 1;
3480   else
3481     {
3482       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3483                                  - vec_inside_cost * peel_iters_prologue
3484                                  - vec_inside_cost * peel_iters_epilogue)
3485                                  / ((scalar_single_iter_cost * vf)
3486                                    - vec_inside_cost);
3487     }
3488   min_profitable_estimate --;
3489   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3490   if (dump_enabled_p ())
3491     dump_printf_loc (MSG_NOTE, vect_location,
3492                      "  Static estimate profitability threshold = %d\n",
3493                      min_profitable_estimate);
3494
3495   *ret_min_profitable_estimate = min_profitable_estimate;
3496 }
3497
3498 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3499    vector elements (not bits) for a vector of mode MODE.  */
3500 static void
3501 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3502                               unsigned char *sel)
3503 {
3504   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3505
3506   for (i = 0; i < nelt; i++)
3507     sel[i] = (i + offset) & (2*nelt - 1);
3508 }
3509
3510 /* Checks whether the target supports whole-vector shifts for vectors of mode
3511    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3512    it supports vec_perm_const with masks for all necessary shift amounts.  */
3513 static bool
3514 have_whole_vector_shift (enum machine_mode mode)
3515 {
3516   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3517     return true;
3518
3519   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3520     return false;
3521
3522   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3523   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3524
3525   for (i = nelt/2; i >= 1; i/=2)
3526     {
3527       calc_vec_perm_mask_for_shift (mode, i, sel);
3528       if (!can_vec_perm_p (mode, false, sel))
3529         return false;
3530     }
3531   return true;
3532 }
3533
3534 /* Return the reduction operand (with index REDUC_INDEX) of STMT.  */
3535
3536 static tree
3537 get_reduction_op (gimple *stmt, int reduc_index)
3538 {
3539   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3540     {
3541     case GIMPLE_SINGLE_RHS:
3542       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3543                   == ternary_op);
3544       return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3545     case GIMPLE_UNARY_RHS:
3546       return gimple_assign_rhs1 (stmt);
3547     case GIMPLE_BINARY_RHS:
3548       return (reduc_index
3549               ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3550     case GIMPLE_TERNARY_RHS:
3551       return gimple_op (stmt, reduc_index + 1);
3552     default:
3553       gcc_unreachable ();
3554     }
3555 }
3556
3557 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3558    functions. Design better to avoid maintenance issues.  */
3559
3560 /* Function vect_model_reduction_cost.
3561
3562    Models cost for a reduction operation, including the vector ops
3563    generated within the strip-mine loop, the initial definition before
3564    the loop, and the epilogue code that must be generated.  */
3565
3566 static bool
3567 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3568                            int ncopies, int reduc_index)
3569 {
3570   int prologue_cost = 0, epilogue_cost = 0;
3571   enum tree_code code;
3572   optab optab;
3573   tree vectype;
3574   gimple *stmt, *orig_stmt;
3575   tree reduction_op;
3576   machine_mode mode;
3577   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3578   struct loop *loop = NULL;
3579   void *target_cost_data;
3580
3581   if (loop_vinfo)
3582     {
3583       loop = LOOP_VINFO_LOOP (loop_vinfo);
3584       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3585     }
3586   else
3587     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3588
3589   /* Condition reductions generate two reductions in the loop.  */
3590   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3591     ncopies *= 2;
3592
3593   /* Cost of reduction op inside loop.  */
3594   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3595                                         stmt_info, 0, vect_body);
3596   stmt = STMT_VINFO_STMT (stmt_info);
3597
3598   reduction_op = get_reduction_op (stmt, reduc_index);
3599
3600   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3601   if (!vectype)
3602     {
3603       if (dump_enabled_p ())
3604         {
3605           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3606                            "unsupported data-type ");
3607           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3608                              TREE_TYPE (reduction_op));
3609           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3610         }
3611       return false;
3612    }
3613
3614   mode = TYPE_MODE (vectype);
3615   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3616
3617   if (!orig_stmt)
3618     orig_stmt = STMT_VINFO_STMT (stmt_info);
3619
3620   code = gimple_assign_rhs_code (orig_stmt);
3621
3622   /* Add in cost for initial definition.
3623      For cond reduction we have four vectors: initial index, step, initial
3624      result of the data reduction, initial value of the index reduction.  */
3625   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3626                        == COND_REDUCTION ? 4 : 1;
3627   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3628                                   scalar_to_vec, stmt_info, 0,
3629                                   vect_prologue);
3630
3631   /* Determine cost of epilogue code.
3632
3633      We have a reduction operator that will reduce the vector in one statement.
3634      Also requires scalar extract.  */
3635
3636   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3637     {
3638       if (reduc_code != ERROR_MARK)
3639         {
3640           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3641             {
3642               /* An EQ stmt and an COND_EXPR stmt.  */
3643               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3644                                               vector_stmt, stmt_info, 0,
3645                                               vect_epilogue);
3646               /* Reduction of the max index and a reduction of the found
3647                  values.  */
3648               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3649                                               vec_to_scalar, stmt_info, 0,
3650                                               vect_epilogue);
3651               /* A broadcast of the max value.  */
3652               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3653                                               scalar_to_vec, stmt_info, 0,
3654                                               vect_epilogue);
3655             }
3656           else
3657             {
3658               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3659                                               stmt_info, 0, vect_epilogue);
3660               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3661                                               vec_to_scalar, stmt_info, 0,
3662                                               vect_epilogue);
3663             }
3664         }
3665       else
3666         {
3667           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3668           tree bitsize =
3669             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3670           int element_bitsize = tree_to_uhwi (bitsize);
3671           int nelements = vec_size_in_bits / element_bitsize;
3672
3673           optab = optab_for_tree_code (code, vectype, optab_default);
3674
3675           /* We have a whole vector shift available.  */
3676           if (VECTOR_MODE_P (mode)
3677               && optab_handler (optab, mode) != CODE_FOR_nothing
3678               && have_whole_vector_shift (mode))
3679             {
3680               /* Final reduction via vector shifts and the reduction operator.
3681                  Also requires scalar extract.  */
3682               epilogue_cost += add_stmt_cost (target_cost_data,
3683                                               exact_log2 (nelements) * 2,
3684                                               vector_stmt, stmt_info, 0,
3685                                               vect_epilogue);
3686               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3687                                               vec_to_scalar, stmt_info, 0,
3688                                               vect_epilogue);
3689             }
3690           else
3691             /* Use extracts and reduction op for final reduction.  For N
3692                elements, we have N extracts and N-1 reduction ops.  */
3693             epilogue_cost += add_stmt_cost (target_cost_data,
3694                                             nelements + nelements - 1,
3695                                             vector_stmt, stmt_info, 0,
3696                                             vect_epilogue);
3697         }
3698     }
3699
3700   if (dump_enabled_p ())
3701     dump_printf (MSG_NOTE,
3702                  "vect_model_reduction_cost: inside_cost = %d, "
3703                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3704                  prologue_cost, epilogue_cost);
3705
3706   return true;
3707 }
3708
3709
3710 /* Function vect_model_induction_cost.
3711
3712    Models cost for induction operations.  */
3713
3714 static void
3715 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3716 {
3717   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3718   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3719   unsigned inside_cost, prologue_cost;
3720
3721   /* loop cost for vec_loop.  */
3722   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3723                                stmt_info, 0, vect_body);
3724
3725   /* prologue cost for vec_init and vec_step.  */
3726   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3727                                  stmt_info, 0, vect_prologue);
3728
3729   if (dump_enabled_p ())
3730     dump_printf_loc (MSG_NOTE, vect_location,
3731                      "vect_model_induction_cost: inside_cost = %d, "
3732                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3733 }
3734
3735
3736 /* Function get_initial_def_for_induction
3737
3738    Input:
3739    STMT - a stmt that performs an induction operation in the loop.
3740    IV_PHI - the initial value of the induction variable
3741
3742    Output:
3743    Return a vector variable, initialized with the first VF values of
3744    the induction variable.  E.g., for an iv with IV_PHI='X' and
3745    evolution S, for a vector of 4 units, we want to return:
3746    [X, X + S, X + 2*S, X + 3*S].  */
3747
3748 static tree
3749 get_initial_def_for_induction (gimple *iv_phi)
3750 {
3751   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3752   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3753   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3754   tree vectype;
3755   int nunits;
3756   edge pe = loop_preheader_edge (loop);
3757   struct loop *iv_loop;
3758   basic_block new_bb;
3759   tree new_vec, vec_init, vec_step, t;
3760   tree new_name;
3761   gimple *new_stmt;
3762   gphi *induction_phi;
3763   tree induc_def, vec_def, vec_dest;
3764   tree init_expr, step_expr;
3765   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3766   int i;
3767   int ncopies;
3768   tree expr;
3769   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3770   bool nested_in_vect_loop = false;
3771   gimple_seq stmts;
3772   imm_use_iterator imm_iter;
3773   use_operand_p use_p;
3774   gimple *exit_phi;
3775   edge latch_e;
3776   tree loop_arg;
3777   gimple_stmt_iterator si;
3778   basic_block bb = gimple_bb (iv_phi);
3779   tree stepvectype;
3780   tree resvectype;
3781
3782   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3783   if (nested_in_vect_loop_p (loop, iv_phi))
3784     {
3785       nested_in_vect_loop = true;
3786       iv_loop = loop->inner;
3787     }
3788   else
3789     iv_loop = loop;
3790   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3791
3792   latch_e = loop_latch_edge (iv_loop);
3793   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3794
3795   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3796   gcc_assert (step_expr != NULL_TREE);
3797
3798   pe = loop_preheader_edge (iv_loop);
3799   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3800                                      loop_preheader_edge (iv_loop));
3801
3802   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3803   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3804   gcc_assert (vectype);
3805   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3806   ncopies = vf / nunits;
3807
3808   gcc_assert (phi_info);
3809   gcc_assert (ncopies >= 1);
3810
3811   /* Convert the step to the desired type.  */
3812   stmts = NULL;
3813   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3814   if (stmts)
3815     {
3816       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3817       gcc_assert (!new_bb);
3818     }
3819
3820   /* Find the first insertion point in the BB.  */
3821   si = gsi_after_labels (bb);
3822
3823   /* Create the vector that holds the initial_value of the induction.  */
3824   if (nested_in_vect_loop)
3825     {
3826       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3827          been created during vectorization of previous stmts.  We obtain it
3828          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3829       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3830       /* If the initial value is not of proper type, convert it.  */
3831       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3832         {
3833           new_stmt
3834             = gimple_build_assign (vect_get_new_ssa_name (vectype,
3835                                                           vect_simple_var,
3836                                                           "vec_iv_"),
3837                                    VIEW_CONVERT_EXPR,
3838                                    build1 (VIEW_CONVERT_EXPR, vectype,
3839                                            vec_init));
3840           vec_init = gimple_assign_lhs (new_stmt);
3841           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3842                                                  new_stmt);
3843           gcc_assert (!new_bb);
3844           set_vinfo_for_stmt (new_stmt,
3845                               new_stmt_vec_info (new_stmt, loop_vinfo));
3846         }
3847     }
3848   else
3849     {
3850       vec<constructor_elt, va_gc> *v;
3851
3852       /* iv_loop is the loop to be vectorized. Create:
3853          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3854       stmts = NULL;
3855       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3856
3857       vec_alloc (v, nunits);
3858       bool constant_p = is_gimple_min_invariant (new_name);
3859       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3860       for (i = 1; i < nunits; i++)
3861         {
3862           /* Create: new_name_i = new_name + step_expr  */
3863           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3864                                    new_name, step_expr);
3865           if (!is_gimple_min_invariant (new_name))
3866             constant_p = false;
3867           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3868         }
3869       if (stmts)
3870         {
3871           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3872           gcc_assert (!new_bb);
3873         }
3874
3875       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3876       if (constant_p)
3877         new_vec = build_vector_from_ctor (vectype, v);
3878       else
3879         new_vec = build_constructor (vectype, v);
3880       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3881     }
3882
3883
3884   /* Create the vector that holds the step of the induction.  */
3885   if (nested_in_vect_loop)
3886     /* iv_loop is nested in the loop to be vectorized. Generate:
3887        vec_step = [S, S, S, S]  */
3888     new_name = step_expr;
3889   else
3890     {
3891       /* iv_loop is the loop to be vectorized. Generate:
3892           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3893       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3894         {
3895           expr = build_int_cst (integer_type_node, vf);
3896           expr = fold_convert (TREE_TYPE (step_expr), expr);
3897         }
3898       else
3899         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3900       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3901                               expr, step_expr);
3902       if (TREE_CODE (step_expr) == SSA_NAME)
3903         new_name = vect_init_vector (iv_phi, new_name,
3904                                      TREE_TYPE (step_expr), NULL);
3905     }
3906
3907   t = unshare_expr (new_name);
3908   gcc_assert (CONSTANT_CLASS_P (new_name)
3909               || TREE_CODE (new_name) == SSA_NAME);
3910   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3911   gcc_assert (stepvectype);
3912   new_vec = build_vector_from_val (stepvectype, t);
3913   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3914
3915
3916   /* Create the following def-use cycle:
3917      loop prolog:
3918          vec_init = ...
3919          vec_step = ...
3920      loop:
3921          vec_iv = PHI <vec_init, vec_loop>
3922          ...
3923          STMT
3924          ...
3925          vec_loop = vec_iv + vec_step;  */
3926
3927   /* Create the induction-phi that defines the induction-operand.  */
3928   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3929   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3930   set_vinfo_for_stmt (induction_phi,
3931                       new_stmt_vec_info (induction_phi, loop_vinfo));
3932   induc_def = PHI_RESULT (induction_phi);
3933
3934   /* Create the iv update inside the loop  */
3935   new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3936   vec_def = make_ssa_name (vec_dest, new_stmt);
3937   gimple_assign_set_lhs (new_stmt, vec_def);
3938   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3939   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3940
3941   /* Set the arguments of the phi node:  */
3942   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3943   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3944                UNKNOWN_LOCATION);
3945
3946
3947   /* In case that vectorization factor (VF) is bigger than the number
3948      of elements that we can fit in a vectype (nunits), we have to generate
3949      more than one vector stmt - i.e - we need to "unroll" the
3950      vector stmt by a factor VF/nunits.  For more details see documentation
3951      in vectorizable_operation.  */
3952
3953   if (ncopies > 1)
3954     {
3955       stmt_vec_info prev_stmt_vinfo;
3956       /* FORNOW. This restriction should be relaxed.  */
3957       gcc_assert (!nested_in_vect_loop);
3958
3959       /* Create the vector that holds the step of the induction.  */
3960       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3961         {
3962           expr = build_int_cst (integer_type_node, nunits);
3963           expr = fold_convert (TREE_TYPE (step_expr), expr);
3964         }
3965       else
3966         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3967       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3968                               expr, step_expr);
3969       if (TREE_CODE (step_expr) == SSA_NAME)
3970         new_name = vect_init_vector (iv_phi, new_name,
3971                                      TREE_TYPE (step_expr), NULL);
3972       t = unshare_expr (new_name);
3973       gcc_assert (CONSTANT_CLASS_P (new_name)
3974                   || TREE_CODE (new_name) == SSA_NAME);
3975       new_vec = build_vector_from_val (stepvectype, t);
3976       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3977
3978       vec_def = induc_def;
3979       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3980       for (i = 1; i < ncopies; i++)
3981         {
3982           /* vec_i = vec_prev + vec_step  */
3983           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3984                                           vec_def, vec_step);
3985           vec_def = make_ssa_name (vec_dest, new_stmt);
3986           gimple_assign_set_lhs (new_stmt, vec_def);
3987
3988           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3989           if (!useless_type_conversion_p (resvectype, vectype))
3990             {
3991               new_stmt
3992                 = gimple_build_assign
3993                         (vect_get_new_vect_var (resvectype, vect_simple_var,
3994                                                 "vec_iv_"),
3995                          VIEW_CONVERT_EXPR,
3996                          build1 (VIEW_CONVERT_EXPR, resvectype,
3997                                  gimple_assign_lhs (new_stmt)));
3998               gimple_assign_set_lhs (new_stmt,
3999                                      make_ssa_name
4000                                        (gimple_assign_lhs (new_stmt), new_stmt));
4001               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
4002             }
4003           set_vinfo_for_stmt (new_stmt,
4004                               new_stmt_vec_info (new_stmt, loop_vinfo));
4005           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
4006           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
4007         }
4008     }
4009
4010   if (nested_in_vect_loop)
4011     {
4012       /* Find the loop-closed exit-phi of the induction, and record
4013          the final vector of induction results:  */
4014       exit_phi = NULL;
4015       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
4016         {
4017           gimple *use_stmt = USE_STMT (use_p);
4018           if (is_gimple_debug (use_stmt))
4019             continue;
4020
4021           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
4022             {
4023               exit_phi = use_stmt;
4024               break;
4025             }
4026         }
4027       if (exit_phi)
4028         {
4029           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
4030           /* FORNOW. Currently not supporting the case that an inner-loop induction
4031              is not used in the outer-loop (i.e. only outside the outer-loop).  */
4032           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
4033                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
4034
4035           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
4036           if (dump_enabled_p ())
4037             {
4038               dump_printf_loc (MSG_NOTE, vect_location,
4039                                "vector of inductions after inner-loop:");
4040               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
4041             }
4042         }
4043     }
4044
4045
4046   if (dump_enabled_p ())
4047     {
4048       dump_printf_loc (MSG_NOTE, vect_location,
4049                        "transform induction: created def-use cycle: ");
4050       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
4051       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
4052                         SSA_NAME_DEF_STMT (vec_def), 0);
4053     }
4054
4055   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
4056   if (!useless_type_conversion_p (resvectype, vectype))
4057     {
4058       new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
4059                                                              vect_simple_var,
4060                                                              "vec_iv_"),
4061                                       VIEW_CONVERT_EXPR,
4062                                       build1 (VIEW_CONVERT_EXPR, resvectype,
4063                                               induc_def));
4064       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
4065       gimple_assign_set_lhs (new_stmt, induc_def);
4066       si = gsi_after_labels (bb);
4067       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
4068       set_vinfo_for_stmt (new_stmt,
4069                           new_stmt_vec_info (new_stmt, loop_vinfo));
4070       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
4071         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
4072     }
4073
4074   return induc_def;
4075 }
4076
4077
4078 /* Function get_initial_def_for_reduction
4079
4080    Input:
4081    STMT - a stmt that performs a reduction operation in the loop.
4082    INIT_VAL - the initial value of the reduction variable
4083
4084    Output:
4085    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4086         of the reduction (used for adjusting the epilog - see below).
4087    Return a vector variable, initialized according to the operation that STMT
4088         performs. This vector will be used as the initial value of the
4089         vector of partial results.
4090
4091    Option1 (adjust in epilog): Initialize the vector as follows:
4092      add/bit or/xor:    [0,0,...,0,0]
4093      mult/bit and:      [1,1,...,1,1]
4094      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4095    and when necessary (e.g. add/mult case) let the caller know
4096    that it needs to adjust the result by init_val.
4097
4098    Option2: Initialize the vector as follows:
4099      add/bit or/xor:    [init_val,0,0,...,0]
4100      mult/bit and:      [init_val,1,1,...,1]
4101      min/max/cond_expr: [init_val,init_val,...,init_val]
4102    and no adjustments are needed.
4103
4104    For example, for the following code:
4105
4106    s = init_val;
4107    for (i=0;i<n;i++)
4108      s = s + a[i];
4109
4110    STMT is 's = s + a[i]', and the reduction variable is 's'.
4111    For a vector of 4 units, we want to return either [0,0,0,init_val],
4112    or [0,0,0,0] and let the caller know that it needs to adjust
4113    the result at the end by 'init_val'.
4114
4115    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4116    initialization vector is simpler (same element in all entries), if
4117    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4118
4119    A cost model should help decide between these two schemes.  */
4120
4121 tree
4122 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4123                                tree *adjustment_def)
4124 {
4125   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4126   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4127   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4128   tree scalar_type = TREE_TYPE (init_val);
4129   tree vectype = get_vectype_for_scalar_type (scalar_type);
4130   int nunits;
4131   enum tree_code code = gimple_assign_rhs_code (stmt);
4132   tree def_for_init;
4133   tree init_def;
4134   tree *elts;
4135   int i;
4136   bool nested_in_vect_loop = false;
4137   REAL_VALUE_TYPE real_init_val = dconst0;
4138   int int_init_val = 0;
4139   gimple *def_stmt = NULL;
4140   gimple_seq stmts = NULL;
4141
4142   gcc_assert (vectype);
4143   nunits = TYPE_VECTOR_SUBPARTS (vectype);
4144
4145   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4146               || SCALAR_FLOAT_TYPE_P (scalar_type));
4147
4148   if (nested_in_vect_loop_p (loop, stmt))
4149     nested_in_vect_loop = true;
4150   else
4151     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4152
4153   /* In case of double reduction we only create a vector variable to be put
4154      in the reduction phi node.  The actual statement creation is done in
4155      vect_create_epilog_for_reduction.  */
4156   if (adjustment_def && nested_in_vect_loop
4157       && TREE_CODE (init_val) == SSA_NAME
4158       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4159       && gimple_code (def_stmt) == GIMPLE_PHI
4160       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4161       && vinfo_for_stmt (def_stmt)
4162       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4163           == vect_double_reduction_def)
4164     {
4165       *adjustment_def = NULL;
4166       return vect_create_destination_var (init_val, vectype);
4167     }
4168
4169   /* In case of a nested reduction do not use an adjustment def as
4170      that case is not supported by the epilogue generation correctly
4171      if ncopies is not one.  */
4172   if (adjustment_def && nested_in_vect_loop)
4173     {
4174       *adjustment_def = NULL;
4175       return vect_get_vec_def_for_operand (init_val, stmt);
4176     }
4177
4178   switch (code)
4179     {
4180       case WIDEN_SUM_EXPR:
4181       case DOT_PROD_EXPR:
4182       case SAD_EXPR:
4183       case PLUS_EXPR:
4184       case MINUS_EXPR:
4185       case BIT_IOR_EXPR:
4186       case BIT_XOR_EXPR:
4187       case MULT_EXPR:
4188       case BIT_AND_EXPR:
4189         /* ADJUSMENT_DEF is NULL when called from
4190            vect_create_epilog_for_reduction to vectorize double reduction.  */
4191         if (adjustment_def)
4192           *adjustment_def = init_val;
4193
4194         if (code == MULT_EXPR)
4195           {
4196             real_init_val = dconst1;
4197             int_init_val = 1;
4198           }
4199
4200         if (code == BIT_AND_EXPR)
4201           int_init_val = -1;
4202
4203         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4204           def_for_init = build_real (scalar_type, real_init_val);
4205         else
4206           def_for_init = build_int_cst (scalar_type, int_init_val);
4207
4208         /* Create a vector of '0' or '1' except the first element.  */
4209         elts = XALLOCAVEC (tree, nunits);
4210         for (i = nunits - 2; i >= 0; --i)
4211           elts[i + 1] = def_for_init;
4212
4213         /* Option1: the first element is '0' or '1' as well.  */
4214         if (adjustment_def)
4215           {
4216             elts[0] = def_for_init;
4217             init_def = build_vector (vectype, elts);
4218             break;
4219           }
4220
4221         /* Option2: the first element is INIT_VAL.  */
4222         elts[0] = init_val;
4223         if (TREE_CONSTANT (init_val))
4224           init_def = build_vector (vectype, elts);
4225         else
4226           {
4227             vec<constructor_elt, va_gc> *v;
4228             vec_alloc (v, nunits);
4229             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4230             for (i = 1; i < nunits; ++i)
4231               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4232             init_def = build_constructor (vectype, v);
4233           }
4234
4235         break;
4236
4237       case MIN_EXPR:
4238       case MAX_EXPR:
4239       case COND_EXPR:
4240         if (adjustment_def)
4241           {
4242             *adjustment_def = NULL_TREE;
4243             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4244               {
4245                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4246                 break;
4247               }
4248           }
4249         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4250         if (! gimple_seq_empty_p (stmts))
4251           gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4252         init_def = build_vector_from_val (vectype, init_val);
4253         break;
4254
4255       default:
4256         gcc_unreachable ();
4257     }
4258
4259   return init_def;
4260 }
4261
4262 /* Function vect_create_epilog_for_reduction
4263
4264    Create code at the loop-epilog to finalize the result of a reduction
4265    computation.
4266
4267    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
4268      reduction statements.
4269    STMT is the scalar reduction stmt that is being vectorized.
4270    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4271      number of elements that we can fit in a vectype (nunits).  In this case
4272      we have to generate more than one vector stmt - i.e - we need to "unroll"
4273      the vector stmt by a factor VF/nunits.  For more details see documentation
4274      in vectorizable_operation.
4275    REDUC_CODE is the tree-code for the epilog reduction.
4276    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4277      computation.
4278    REDUC_INDEX is the index of the operand in the right hand side of the
4279      statement that is defined by REDUCTION_PHI.
4280    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4281    SLP_NODE is an SLP node containing a group of reduction statements. The
4282      first one in this group is STMT.
4283    INDUCTION_INDEX is the index of the loop for condition reductions.
4284      Otherwise it is undefined.
4285
4286    This function:
4287    1. Creates the reduction def-use cycles: sets the arguments for
4288       REDUCTION_PHIS:
4289       The loop-entry argument is the vectorized initial-value of the reduction.
4290       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4291       sums.
4292    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4293       by applying the operation specified by REDUC_CODE if available, or by
4294       other means (whole-vector shifts or a scalar loop).
4295       The function also creates a new phi node at the loop exit to preserve
4296       loop-closed form, as illustrated below.
4297
4298      The flow at the entry to this function:
4299
4300         loop:
4301           vec_def = phi <null, null>            # REDUCTION_PHI
4302           VECT_DEF = vector_stmt                # vectorized form of STMT
4303           s_loop = scalar_stmt                  # (scalar) STMT
4304         loop_exit:
4305           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4306           use <s_out0>
4307           use <s_out0>
4308
4309      The above is transformed by this function into:
4310
4311         loop:
4312           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4313           VECT_DEF = vector_stmt                # vectorized form of STMT
4314           s_loop = scalar_stmt                  # (scalar) STMT
4315         loop_exit:
4316           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4317           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4318           v_out2 = reduce <v_out1>
4319           s_out3 = extract_field <v_out2, 0>
4320           s_out4 = adjust_result <s_out3>
4321           use <s_out4>
4322           use <s_out4>
4323 */
4324
4325 static void
4326 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4327                                   int ncopies, enum tree_code reduc_code,
4328                                   vec<gimple *> reduction_phis,
4329                                   int reduc_index, bool double_reduc,
4330                                   slp_tree slp_node, tree induction_index)
4331 {
4332   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4333   stmt_vec_info prev_phi_info;
4334   tree vectype;
4335   machine_mode mode;
4336   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4337   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4338   basic_block exit_bb;
4339   tree scalar_dest;
4340   tree scalar_type;
4341   gimple *new_phi = NULL, *phi;
4342   gimple_stmt_iterator exit_gsi;
4343   tree vec_dest;
4344   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4345   gimple *epilog_stmt = NULL;
4346   enum tree_code code = gimple_assign_rhs_code (stmt);
4347   gimple *exit_phi;
4348   tree bitsize;
4349   tree adjustment_def = NULL;
4350   tree vec_initial_def = NULL;
4351   tree reduction_op, expr, def, initial_def = NULL;
4352   tree orig_name, scalar_result;
4353   imm_use_iterator imm_iter, phi_imm_iter;
4354   use_operand_p use_p, phi_use_p;
4355   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4356   bool nested_in_vect_loop = false;
4357   auto_vec<gimple *> new_phis;
4358   auto_vec<gimple *> inner_phis;
4359   enum vect_def_type dt = vect_unknown_def_type;
4360   int j, i;
4361   auto_vec<tree> scalar_results;
4362   unsigned int group_size = 1, k, ratio;
4363   auto_vec<tree> vec_initial_defs;
4364   auto_vec<gimple *> phis;
4365   bool slp_reduc = false;
4366   tree new_phi_result;
4367   gimple *inner_phi = NULL;
4368
4369   if (slp_node)
4370     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4371
4372   if (nested_in_vect_loop_p (loop, stmt))
4373     {
4374       outer_loop = loop;
4375       loop = loop->inner;
4376       nested_in_vect_loop = true;
4377       gcc_assert (!slp_node);
4378     }
4379
4380   reduction_op = get_reduction_op (stmt, reduc_index);
4381
4382   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4383   gcc_assert (vectype);
4384   mode = TYPE_MODE (vectype);
4385
4386   /* 1. Create the reduction def-use cycle:
4387      Set the arguments of REDUCTION_PHIS, i.e., transform
4388
4389         loop:
4390           vec_def = phi <null, null>            # REDUCTION_PHI
4391           VECT_DEF = vector_stmt                # vectorized form of STMT
4392           ...
4393
4394      into:
4395
4396         loop:
4397           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4398           VECT_DEF = vector_stmt                # vectorized form of STMT
4399           ...
4400
4401      (in case of SLP, do it for all the phis). */
4402
4403   /* Get the loop-entry arguments.  */
4404   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4405   if (slp_node)
4406     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4407                        NULL, slp_node, reduc_index);
4408   else
4409     {
4410       /* Get at the scalar def before the loop, that defines the initial value
4411          of the reduction variable.  */
4412       gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4413       initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4414                                            loop_preheader_edge (loop));
4415       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4416       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4417                                                        &adjustment_def);
4418       vec_initial_defs.create (1);
4419       vec_initial_defs.quick_push (vec_initial_def);
4420     }
4421
4422   /* Set phi nodes arguments.  */
4423   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4424     {
4425       tree vec_init_def, def;
4426       gimple_seq stmts;
4427       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4428                                            true, NULL_TREE);
4429       if (stmts)
4430         gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4431
4432       def = vect_defs[i];
4433       for (j = 0; j < ncopies; j++)
4434         {
4435           if (j != 0)
4436             {
4437               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4438               if (nested_in_vect_loop)
4439                 vec_init_def
4440                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4441                                                     vec_init_def);
4442             }
4443
4444           /* Set the loop-entry arg of the reduction-phi.  */
4445
4446           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4447               == INTEGER_INDUC_COND_REDUCTION)
4448             {
4449               /* Initialise the reduction phi to zero.  This prevents initial
4450                  values of non-zero interferring with the reduction op.  */
4451               gcc_assert (ncopies == 1);
4452               gcc_assert (i == 0);
4453
4454               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4455               tree zero_vec = build_zero_cst (vec_init_def_type);
4456
4457               add_phi_arg (as_a <gphi *> (phi), zero_vec,
4458                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4459             }
4460           else
4461             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4462                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4463
4464           /* Set the loop-latch arg for the reduction-phi.  */
4465           if (j > 0)
4466             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4467
4468           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4469                        UNKNOWN_LOCATION);
4470
4471           if (dump_enabled_p ())
4472             {
4473               dump_printf_loc (MSG_NOTE, vect_location,
4474                                "transform reduction: created def-use cycle: ");
4475               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4476               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4477             }
4478         }
4479     }
4480
4481   /* 2. Create epilog code.
4482         The reduction epilog code operates across the elements of the vector
4483         of partial results computed by the vectorized loop.
4484         The reduction epilog code consists of:
4485
4486         step 1: compute the scalar result in a vector (v_out2)
4487         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4488         step 3: adjust the scalar result (s_out3) if needed.
4489
4490         Step 1 can be accomplished using one the following three schemes:
4491           (scheme 1) using reduc_code, if available.
4492           (scheme 2) using whole-vector shifts, if available.
4493           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4494                      combined.
4495
4496           The overall epilog code looks like this:
4497
4498           s_out0 = phi <s_loop>         # original EXIT_PHI
4499           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4500           v_out2 = reduce <v_out1>              # step 1
4501           s_out3 = extract_field <v_out2, 0>    # step 2
4502           s_out4 = adjust_result <s_out3>       # step 3
4503
4504           (step 3 is optional, and steps 1 and 2 may be combined).
4505           Lastly, the uses of s_out0 are replaced by s_out4.  */
4506
4507
4508   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4509          v_out1 = phi <VECT_DEF>
4510          Store them in NEW_PHIS.  */
4511
4512   exit_bb = single_exit (loop)->dest;
4513   prev_phi_info = NULL;
4514   new_phis.create (vect_defs.length ());
4515   FOR_EACH_VEC_ELT (vect_defs, i, def)
4516     {
4517       for (j = 0; j < ncopies; j++)
4518         {
4519           tree new_def = copy_ssa_name (def);
4520           phi = create_phi_node (new_def, exit_bb);
4521           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4522           if (j == 0)
4523             new_phis.quick_push (phi);
4524           else
4525             {
4526               def = vect_get_vec_def_for_stmt_copy (dt, def);
4527               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4528             }
4529
4530           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4531           prev_phi_info = vinfo_for_stmt (phi);
4532         }
4533     }
4534
4535   /* The epilogue is created for the outer-loop, i.e., for the loop being
4536      vectorized.  Create exit phis for the outer loop.  */
4537   if (double_reduc)
4538     {
4539       loop = outer_loop;
4540       exit_bb = single_exit (loop)->dest;
4541       inner_phis.create (vect_defs.length ());
4542       FOR_EACH_VEC_ELT (new_phis, i, phi)
4543         {
4544           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4545           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4546           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4547                            PHI_RESULT (phi));
4548           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4549                                                             loop_vinfo));
4550           inner_phis.quick_push (phi);
4551           new_phis[i] = outer_phi;
4552           prev_phi_info = vinfo_for_stmt (outer_phi);
4553           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4554             {
4555               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4556               new_result = copy_ssa_name (PHI_RESULT (phi));
4557               outer_phi = create_phi_node (new_result, exit_bb);
4558               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4559                                PHI_RESULT (phi));
4560               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4561                                                                 loop_vinfo));
4562               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4563               prev_phi_info = vinfo_for_stmt (outer_phi);
4564             }
4565         }
4566     }
4567
4568   exit_gsi = gsi_after_labels (exit_bb);
4569
4570   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4571          (i.e. when reduc_code is not available) and in the final adjustment
4572          code (if needed).  Also get the original scalar reduction variable as
4573          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4574          represents a reduction pattern), the tree-code and scalar-def are
4575          taken from the original stmt that the pattern-stmt (STMT) replaces.
4576          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4577          are taken from STMT.  */
4578
4579   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4580   if (!orig_stmt)
4581     {
4582       /* Regular reduction  */
4583       orig_stmt = stmt;
4584     }
4585   else
4586     {
4587       /* Reduction pattern  */
4588       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4589       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4590       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4591     }
4592
4593   code = gimple_assign_rhs_code (orig_stmt);
4594   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4595      partial results are added and not subtracted.  */
4596   if (code == MINUS_EXPR)
4597     code = PLUS_EXPR;
4598
4599   scalar_dest = gimple_assign_lhs (orig_stmt);
4600   scalar_type = TREE_TYPE (scalar_dest);
4601   scalar_results.create (group_size);
4602   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4603   bitsize = TYPE_SIZE (scalar_type);
4604
4605   /* In case this is a reduction in an inner-loop while vectorizing an outer
4606      loop - we don't need to extract a single scalar result at the end of the
4607      inner-loop (unless it is double reduction, i.e., the use of reduction is
4608      outside the outer-loop).  The final vector of partial results will be used
4609      in the vectorized outer-loop, or reduced to a scalar result at the end of
4610      the outer-loop.  */
4611   if (nested_in_vect_loop && !double_reduc)
4612     goto vect_finalize_reduction;
4613
4614   /* SLP reduction without reduction chain, e.g.,
4615      # a1 = phi <a2, a0>
4616      # b1 = phi <b2, b0>
4617      a2 = operation (a1)
4618      b2 = operation (b1)  */
4619   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4620
4621   /* In case of reduction chain, e.g.,
4622      # a1 = phi <a3, a0>
4623      a2 = operation (a1)
4624      a3 = operation (a2),
4625
4626      we may end up with more than one vector result.  Here we reduce them to
4627      one vector.  */
4628   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4629     {
4630       tree first_vect = PHI_RESULT (new_phis[0]);
4631       tree tmp;
4632       gassign *new_vec_stmt = NULL;
4633
4634       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4635       for (k = 1; k < new_phis.length (); k++)
4636         {
4637           gimple *next_phi = new_phis[k];
4638           tree second_vect = PHI_RESULT (next_phi);
4639
4640           tmp = build2 (code, vectype,  first_vect, second_vect);
4641           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4642           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4643           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4644           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4645         }
4646
4647       new_phi_result = first_vect;
4648       if (new_vec_stmt)
4649         {
4650           new_phis.truncate (0);
4651           new_phis.safe_push (new_vec_stmt);
4652         }
4653     }
4654   else
4655     new_phi_result = PHI_RESULT (new_phis[0]);
4656
4657   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4658     {
4659       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4660          various data values where the condition matched and another vector
4661          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4662          need to extract the last matching index (which will be the index with
4663          highest value) and use this to index into the data vector.
4664          For the case where there were no matches, the data vector will contain
4665          all default values and the index vector will be all zeros.  */
4666
4667       /* Get various versions of the type of the vector of indexes.  */
4668       tree index_vec_type = TREE_TYPE (induction_index);
4669       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4670       tree index_scalar_type = TREE_TYPE (index_vec_type);
4671       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4672         (index_vec_type);
4673
4674       /* Get an unsigned integer version of the type of the data vector.  */
4675       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4676       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4677       tree vectype_unsigned = build_vector_type
4678         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4679
4680       /* First we need to create a vector (ZERO_VEC) of zeros and another
4681          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4682          can create using a MAX reduction and then expanding.
4683          In the case where the loop never made any matches, the max index will
4684          be zero.  */
4685
4686       /* Vector of {0, 0, 0,...}.  */
4687       tree zero_vec = make_ssa_name (vectype);
4688       tree zero_vec_rhs = build_zero_cst (vectype);
4689       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4690       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4691
4692       /* Find maximum value from the vector of found indexes.  */
4693       tree max_index = make_ssa_name (index_scalar_type);
4694       gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4695                                                     induction_index);
4696       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4697
4698       /* Vector of {max_index, max_index, max_index,...}.  */
4699       tree max_index_vec = make_ssa_name (index_vec_type);
4700       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4701                                                       max_index);
4702       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4703                                                         max_index_vec_rhs);
4704       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4705
4706       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4707          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4708          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4709          otherwise.  Only one value should match, resulting in a vector
4710          (VEC_COND) with one data value and the rest zeros.
4711          In the case where the loop never made any matches, every index will
4712          match, resulting in a vector with all data values (which will all be
4713          the default value).  */
4714
4715       /* Compare the max index vector to the vector of found indexes to find
4716          the position of the max value.  */
4717       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4718       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4719                                                       induction_index,
4720                                                       max_index_vec);
4721       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4722
4723       /* Use the compare to choose either values from the data vector or
4724          zero.  */
4725       tree vec_cond = make_ssa_name (vectype);
4726       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4727                                                    vec_compare, new_phi_result,
4728                                                    zero_vec);
4729       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4730
4731       /* Finally we need to extract the data value from the vector (VEC_COND)
4732          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
4733          reduction, but because this doesn't exist, we can use a MAX reduction
4734          instead.  The data value might be signed or a float so we need to cast
4735          it first.
4736          In the case where the loop never made any matches, the data values are
4737          all identical, and so will reduce down correctly.  */
4738
4739       /* Make the matched data values unsigned.  */
4740       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4741       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4742                                        vec_cond);
4743       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4744                                                         VIEW_CONVERT_EXPR,
4745                                                         vec_cond_cast_rhs);
4746       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4747
4748       /* Reduce down to a scalar value.  */
4749       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4750       optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4751                                       optab_default);
4752       gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4753                   != CODE_FOR_nothing);
4754       gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4755                                                      REDUC_MAX_EXPR,
4756                                                      vec_cond_cast);
4757       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4758
4759       /* Convert the reduced value back to the result type and set as the
4760          result.  */
4761       tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4762                                      data_reduc);
4763       epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4764       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4765       gimple_assign_set_lhs (epilog_stmt, new_temp);
4766       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4767       scalar_results.safe_push (new_temp);
4768     }
4769
4770   /* 2.3 Create the reduction code, using one of the three schemes described
4771          above. In SLP we simply need to extract all the elements from the
4772          vector (without reducing them), so we use scalar shifts.  */
4773   else if (reduc_code != ERROR_MARK && !slp_reduc)
4774     {
4775       tree tmp;
4776       tree vec_elem_type;
4777
4778       /*** Case 1:  Create:
4779            v_out2 = reduc_expr <v_out1>  */
4780
4781       if (dump_enabled_p ())
4782         dump_printf_loc (MSG_NOTE, vect_location,
4783                          "Reduce using direct vector reduction.\n");
4784
4785       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4786       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4787         {
4788           tree tmp_dest =
4789               vect_create_destination_var (scalar_dest, vec_elem_type);
4790           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4791           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4792           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4793           gimple_assign_set_lhs (epilog_stmt, new_temp);
4794           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4795
4796           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4797         }
4798       else
4799         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4800
4801       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4802       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4803       gimple_assign_set_lhs (epilog_stmt, new_temp);
4804       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4805
4806       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4807           == INTEGER_INDUC_COND_REDUCTION)
4808         {
4809           /* Earlier we set the initial value to be zero.  Check the result
4810              and if it is zero then replace with the original initial
4811              value.  */
4812           tree zero = build_zero_cst (scalar_type);
4813           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4814
4815           tmp = make_ssa_name (new_scalar_dest);
4816           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4817                                              initial_def, new_temp);
4818           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4819           new_temp = tmp;
4820         }
4821
4822       scalar_results.safe_push (new_temp);
4823     }
4824   else
4825     {
4826       bool reduce_with_shift = have_whole_vector_shift (mode);
4827       int element_bitsize = tree_to_uhwi (bitsize);
4828       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4829       tree vec_temp;
4830
4831       /* Regardless of whether we have a whole vector shift, if we're
4832          emulating the operation via tree-vect-generic, we don't want
4833          to use it.  Only the first round of the reduction is likely
4834          to still be profitable via emulation.  */
4835       /* ??? It might be better to emit a reduction tree code here, so that
4836          tree-vect-generic can expand the first round via bit tricks.  */
4837       if (!VECTOR_MODE_P (mode))
4838         reduce_with_shift = false;
4839       else
4840         {
4841           optab optab = optab_for_tree_code (code, vectype, optab_default);
4842           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4843             reduce_with_shift = false;
4844         }
4845
4846       if (reduce_with_shift && !slp_reduc)
4847         {
4848           int nelements = vec_size_in_bits / element_bitsize;
4849           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4850
4851           int elt_offset;
4852
4853           tree zero_vec = build_zero_cst (vectype);
4854           /*** Case 2: Create:
4855              for (offset = nelements/2; offset >= 1; offset/=2)
4856                 {
4857                   Create:  va' = vec_shift <va, offset>
4858                   Create:  va = vop <va, va'>
4859                 }  */
4860
4861           tree rhs;
4862
4863           if (dump_enabled_p ())
4864             dump_printf_loc (MSG_NOTE, vect_location,
4865                              "Reduce using vector shifts\n");
4866
4867           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4868           new_temp = new_phi_result;
4869           for (elt_offset = nelements / 2;
4870                elt_offset >= 1;
4871                elt_offset /= 2)
4872             {
4873               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4874               tree mask = vect_gen_perm_mask_any (vectype, sel);
4875               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4876                                                  new_temp, zero_vec, mask);
4877               new_name = make_ssa_name (vec_dest, epilog_stmt);
4878               gimple_assign_set_lhs (epilog_stmt, new_name);
4879               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4880
4881               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4882                                                  new_temp);
4883               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4884               gimple_assign_set_lhs (epilog_stmt, new_temp);
4885               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4886             }
4887
4888           /* 2.4  Extract the final scalar result.  Create:
4889              s_out3 = extract_field <v_out2, bitpos>  */
4890
4891           if (dump_enabled_p ())
4892             dump_printf_loc (MSG_NOTE, vect_location,
4893                              "extract scalar result\n");
4894
4895           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4896                         bitsize, bitsize_zero_node);
4897           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4898           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4899           gimple_assign_set_lhs (epilog_stmt, new_temp);
4900           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4901           scalar_results.safe_push (new_temp);
4902         }
4903       else
4904         {
4905           /*** Case 3: Create:
4906              s = extract_field <v_out2, 0>
4907              for (offset = element_size;
4908                   offset < vector_size;
4909                   offset += element_size;)
4910                {
4911                  Create:  s' = extract_field <v_out2, offset>
4912                  Create:  s = op <s, s'>  // For non SLP cases
4913                }  */
4914
4915           if (dump_enabled_p ())
4916             dump_printf_loc (MSG_NOTE, vect_location,
4917                              "Reduce using scalar code.\n");
4918
4919           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4920           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4921             {
4922               int bit_offset;
4923               if (gimple_code (new_phi) == GIMPLE_PHI)
4924                 vec_temp = PHI_RESULT (new_phi);
4925               else
4926                 vec_temp = gimple_assign_lhs (new_phi);
4927               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4928                             bitsize_zero_node);
4929               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4930               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4931               gimple_assign_set_lhs (epilog_stmt, new_temp);
4932               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4933
4934               /* In SLP we don't need to apply reduction operation, so we just
4935                  collect s' values in SCALAR_RESULTS.  */
4936               if (slp_reduc)
4937                 scalar_results.safe_push (new_temp);
4938
4939               for (bit_offset = element_bitsize;
4940                    bit_offset < vec_size_in_bits;
4941                    bit_offset += element_bitsize)
4942                 {
4943                   tree bitpos = bitsize_int (bit_offset);
4944                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4945                                      bitsize, bitpos);
4946
4947                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4948                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4949                   gimple_assign_set_lhs (epilog_stmt, new_name);
4950                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4951
4952                   if (slp_reduc)
4953                     {
4954                       /* In SLP we don't need to apply reduction operation, so
4955                          we just collect s' values in SCALAR_RESULTS.  */
4956                       new_temp = new_name;
4957                       scalar_results.safe_push (new_name);
4958                     }
4959                   else
4960                     {
4961                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4962                                                          new_name, new_temp);
4963                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4964                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4965                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4966                     }
4967                 }
4968             }
4969
4970           /* The only case where we need to reduce scalar results in SLP, is
4971              unrolling.  If the size of SCALAR_RESULTS is greater than
4972              GROUP_SIZE, we reduce them combining elements modulo
4973              GROUP_SIZE.  */
4974           if (slp_reduc)
4975             {
4976               tree res, first_res, new_res;
4977               gimple *new_stmt;
4978
4979               /* Reduce multiple scalar results in case of SLP unrolling.  */
4980               for (j = group_size; scalar_results.iterate (j, &res);
4981                    j++)
4982                 {
4983                   first_res = scalar_results[j % group_size];
4984                   new_stmt = gimple_build_assign (new_scalar_dest, code,
4985                                                   first_res, res);
4986                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4987                   gimple_assign_set_lhs (new_stmt, new_res);
4988                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4989                   scalar_results[j % group_size] = new_res;
4990                 }
4991             }
4992           else
4993             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4994             scalar_results.safe_push (new_temp);
4995         }
4996     }
4997
4998 vect_finalize_reduction:
4999
5000   if (double_reduc)
5001     loop = loop->inner;
5002
5003   /* 2.5 Adjust the final result by the initial value of the reduction
5004          variable. (When such adjustment is not needed, then
5005          'adjustment_def' is zero).  For example, if code is PLUS we create:
5006          new_temp = loop_exit_def + adjustment_def  */
5007
5008   if (adjustment_def)
5009     {
5010       gcc_assert (!slp_reduc);
5011       if (nested_in_vect_loop)
5012         {
5013           new_phi = new_phis[0];
5014           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5015           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5016           new_dest = vect_create_destination_var (scalar_dest, vectype);
5017         }
5018       else
5019         {
5020           new_temp = scalar_results[0];
5021           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5022           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5023           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5024         }
5025
5026       epilog_stmt = gimple_build_assign (new_dest, expr);
5027       new_temp = make_ssa_name (new_dest, epilog_stmt);
5028       gimple_assign_set_lhs (epilog_stmt, new_temp);
5029       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5030       if (nested_in_vect_loop)
5031         {
5032           set_vinfo_for_stmt (epilog_stmt,
5033                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5034           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5035                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5036
5037           if (!double_reduc)
5038             scalar_results.quick_push (new_temp);
5039           else
5040             scalar_results[0] = new_temp;
5041         }
5042       else
5043         scalar_results[0] = new_temp;
5044
5045       new_phis[0] = epilog_stmt;
5046     }
5047
5048   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5049           phis with new adjusted scalar results, i.e., replace use <s_out0>
5050           with use <s_out4>.
5051
5052      Transform:
5053         loop_exit:
5054           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5055           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5056           v_out2 = reduce <v_out1>
5057           s_out3 = extract_field <v_out2, 0>
5058           s_out4 = adjust_result <s_out3>
5059           use <s_out0>
5060           use <s_out0>
5061
5062      into:
5063
5064         loop_exit:
5065           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5066           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5067           v_out2 = reduce <v_out1>
5068           s_out3 = extract_field <v_out2, 0>
5069           s_out4 = adjust_result <s_out3>
5070           use <s_out4>
5071           use <s_out4> */
5072
5073
5074   /* In SLP reduction chain we reduce vector results into one vector if
5075      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5076      the last stmt in the reduction chain, since we are looking for the loop
5077      exit phi node.  */
5078   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5079     {
5080       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5081       /* Handle reduction patterns.  */
5082       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5083         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5084
5085       scalar_dest = gimple_assign_lhs (dest_stmt);
5086       group_size = 1;
5087     }
5088
5089   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5090      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5091      need to match SCALAR_RESULTS with corresponding statements.  The first
5092      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5093      the first vector stmt, etc.
5094      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5095   if (group_size > new_phis.length ())
5096     {
5097       ratio = group_size / new_phis.length ();
5098       gcc_assert (!(group_size % new_phis.length ()));
5099     }
5100   else
5101     ratio = 1;
5102
5103   for (k = 0; k < group_size; k++)
5104     {
5105       if (k % ratio == 0)
5106         {
5107           epilog_stmt = new_phis[k / ratio];
5108           reduction_phi = reduction_phis[k / ratio];
5109           if (double_reduc)
5110             inner_phi = inner_phis[k / ratio];
5111         }
5112
5113       if (slp_reduc)
5114         {
5115           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5116
5117           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5118           /* SLP statements can't participate in patterns.  */
5119           gcc_assert (!orig_stmt);
5120           scalar_dest = gimple_assign_lhs (current_stmt);
5121         }
5122
5123       phis.create (3);
5124       /* Find the loop-closed-use at the loop exit of the original scalar
5125          result.  (The reduction result is expected to have two immediate uses -
5126          one at the latch block, and one at the loop exit).  */
5127       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5128         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5129             && !is_gimple_debug (USE_STMT (use_p)))
5130           phis.safe_push (USE_STMT (use_p));
5131
5132       /* While we expect to have found an exit_phi because of loop-closed-ssa
5133          form we can end up without one if the scalar cycle is dead.  */
5134
5135       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5136         {
5137           if (outer_loop)
5138             {
5139               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5140               gphi *vect_phi;
5141
5142               /* FORNOW. Currently not supporting the case that an inner-loop
5143                  reduction is not used in the outer-loop (but only outside the
5144                  outer-loop), unless it is double reduction.  */
5145               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5146                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5147                           || double_reduc);
5148
5149               if (double_reduc)
5150                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5151               else
5152                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5153               if (!double_reduc
5154                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5155                       != vect_double_reduction_def)
5156                 continue;
5157
5158               /* Handle double reduction:
5159
5160                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5161                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5162                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5163                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5164
5165                  At that point the regular reduction (stmt2 and stmt3) is
5166                  already vectorized, as well as the exit phi node, stmt4.
5167                  Here we vectorize the phi node of double reduction, stmt1, and
5168                  update all relevant statements.  */
5169
5170               /* Go through all the uses of s2 to find double reduction phi
5171                  node, i.e., stmt1 above.  */
5172               orig_name = PHI_RESULT (exit_phi);
5173               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5174                 {
5175                   stmt_vec_info use_stmt_vinfo;
5176                   stmt_vec_info new_phi_vinfo;
5177                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5178                   basic_block bb = gimple_bb (use_stmt);
5179                   gimple *use;
5180
5181                   /* Check that USE_STMT is really double reduction phi
5182                      node.  */
5183                   if (gimple_code (use_stmt) != GIMPLE_PHI
5184                       || gimple_phi_num_args (use_stmt) != 2
5185                       || bb->loop_father != outer_loop)
5186                     continue;
5187                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5188                   if (!use_stmt_vinfo
5189                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5190                           != vect_double_reduction_def)
5191                     continue;
5192
5193                   /* Create vector phi node for double reduction:
5194                      vs1 = phi <vs0, vs2>
5195                      vs1 was created previously in this function by a call to
5196                        vect_get_vec_def_for_operand and is stored in
5197                        vec_initial_def;
5198                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5199                      vs0 is created here.  */
5200
5201                   /* Create vector phi node.  */
5202                   vect_phi = create_phi_node (vec_initial_def, bb);
5203                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5204                                     loop_vec_info_for_loop (outer_loop));
5205                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5206
5207                   /* Create vs0 - initial def of the double reduction phi.  */
5208                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5209                                              loop_preheader_edge (outer_loop));
5210                   init_def = get_initial_def_for_reduction (stmt,
5211                                                           preheader_arg, NULL);
5212                   vect_phi_init = vect_init_vector (use_stmt, init_def,
5213                                                     vectype, NULL);
5214
5215                   /* Update phi node arguments with vs0 and vs2.  */
5216                   add_phi_arg (vect_phi, vect_phi_init,
5217                                loop_preheader_edge (outer_loop),
5218                                UNKNOWN_LOCATION);
5219                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5220                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5221                   if (dump_enabled_p ())
5222                     {
5223                       dump_printf_loc (MSG_NOTE, vect_location,
5224                                        "created double reduction phi node: ");
5225                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5226                     }
5227
5228                   vect_phi_res = PHI_RESULT (vect_phi);
5229
5230                   /* Replace the use, i.e., set the correct vs1 in the regular
5231                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5232                      loop is redundant.  */
5233                   use = reduction_phi;
5234                   for (j = 0; j < ncopies; j++)
5235                     {
5236                       edge pr_edge = loop_preheader_edge (loop);
5237                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5238                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5239                     }
5240                 }
5241             }
5242         }
5243
5244       phis.release ();
5245       if (nested_in_vect_loop)
5246         {
5247           if (double_reduc)
5248             loop = outer_loop;
5249           else
5250             continue;
5251         }
5252
5253       phis.create (3);
5254       /* Find the loop-closed-use at the loop exit of the original scalar
5255          result.  (The reduction result is expected to have two immediate uses,
5256          one at the latch block, and one at the loop exit).  For double
5257          reductions we are looking for exit phis of the outer loop.  */
5258       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5259         {
5260           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5261             {
5262               if (!is_gimple_debug (USE_STMT (use_p)))
5263                 phis.safe_push (USE_STMT (use_p));
5264             }
5265           else
5266             {
5267               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5268                 {
5269                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5270
5271                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5272                     {
5273                       if (!flow_bb_inside_loop_p (loop,
5274                                              gimple_bb (USE_STMT (phi_use_p)))
5275                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5276                         phis.safe_push (USE_STMT (phi_use_p));
5277                     }
5278                 }
5279             }
5280         }
5281
5282       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5283         {
5284           /* Replace the uses:  */
5285           orig_name = PHI_RESULT (exit_phi);
5286           scalar_result = scalar_results[k];
5287           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5288             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5289               SET_USE (use_p, scalar_result);
5290         }
5291
5292       phis.release ();
5293     }
5294 }
5295
5296
5297 /* Function is_nonwrapping_integer_induction.
5298
5299    Check if STMT (which is part of loop LOOP) both increments and
5300    does not cause overflow.  */
5301
5302 static bool
5303 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5304 {
5305   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5306   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5307   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5308   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5309   widest_int ni, max_loop_value, lhs_max;
5310   bool overflow = false;
5311
5312   /* Make sure the loop is integer based.  */
5313   if (TREE_CODE (base) != INTEGER_CST
5314       || TREE_CODE (step) != INTEGER_CST)
5315     return false;
5316
5317   /* Check that the induction increments.  */
5318   if (tree_int_cst_sgn (step) == -1)
5319     return false;
5320
5321   /* Check that the max size of the loop will not wrap.  */
5322
5323   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5324     return true;
5325
5326   if (! max_stmt_executions (loop, &ni))
5327     return false;
5328
5329   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5330                             &overflow);
5331   if (overflow)
5332     return false;
5333
5334   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5335                             TYPE_SIGN (lhs_type), &overflow);
5336   if (overflow)
5337     return false;
5338
5339   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5340           <= TYPE_PRECISION (lhs_type));
5341 }
5342
5343 /* Function vectorizable_reduction.
5344
5345    Check if STMT performs a reduction operation that can be vectorized.
5346    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5347    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5348    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5349
5350    This function also handles reduction idioms (patterns) that have been
5351    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5352    of this form:
5353      X = pattern_expr (arg0, arg1, ..., X)
5354    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
5355    sequence that had been detected and replaced by the pattern-stmt (STMT).
5356
5357    This function also handles reduction of condition expressions, for example:
5358      for (int i = 0; i < N; i++)
5359        if (a[i] < value)
5360          last = a[i];
5361    This is handled by vectorising the loop and creating an additional vector
5362    containing the loop indexes for which "a[i] < value" was true.  In the
5363    function epilogue this is reduced to a single max value and then used to
5364    index into the vector of results.
5365
5366    In some cases of reduction patterns, the type of the reduction variable X is
5367    different than the type of the other arguments of STMT.
5368    In such cases, the vectype that is used when transforming STMT into a vector
5369    stmt is different than the vectype that is used to determine the
5370    vectorization factor, because it consists of a different number of elements
5371    than the actual number of elements that are being operated upon in parallel.
5372
5373    For example, consider an accumulation of shorts into an int accumulator.
5374    On some targets it's possible to vectorize this pattern operating on 8
5375    shorts at a time (hence, the vectype for purposes of determining the
5376    vectorization factor should be V8HI); on the other hand, the vectype that
5377    is used to create the vector form is actually V4SI (the type of the result).
5378
5379    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5380    indicates what is the actual level of parallelism (V8HI in the example), so
5381    that the right vectorization factor would be derived.  This vectype
5382    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5383    be used to create the vectorized stmt.  The right vectype for the vectorized
5384    stmt is obtained from the type of the result X:
5385         get_vectype_for_scalar_type (TREE_TYPE (X))
5386
5387    This means that, contrary to "regular" reductions (or "regular" stmts in
5388    general), the following equation:
5389       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5390    does *NOT* necessarily hold for reduction patterns.  */
5391
5392 bool
5393 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5394                         gimple **vec_stmt, slp_tree slp_node)
5395 {
5396   tree vec_dest;
5397   tree scalar_dest;
5398   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5399   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5400   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5401   tree vectype_in = NULL_TREE;
5402   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5403   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5404   enum tree_code code, orig_code, epilog_reduc_code;
5405   machine_mode vec_mode;
5406   int op_type;
5407   optab optab, reduc_optab;
5408   tree new_temp = NULL_TREE;
5409   gimple *def_stmt;
5410   enum vect_def_type dt, cond_reduc_dt = vect_unknown_def_type;
5411   gphi *new_phi = NULL;
5412   tree scalar_type;
5413   bool is_simple_use;
5414   gimple *orig_stmt;
5415   stmt_vec_info orig_stmt_info;
5416   tree expr = NULL_TREE;
5417   int i;
5418   int ncopies;
5419   int epilog_copies;
5420   stmt_vec_info prev_stmt_info, prev_phi_info;
5421   bool single_defuse_cycle = false;
5422   tree reduc_def = NULL_TREE;
5423   gimple *new_stmt = NULL;
5424   int j;
5425   tree ops[3];
5426   bool nested_cycle = false, found_nested_cycle_def = false;
5427   gimple *reduc_def_stmt = NULL;
5428   bool double_reduc = false, dummy;
5429   basic_block def_bb;
5430   struct loop * def_stmt_loop, *outer_loop = NULL;
5431   tree def_arg;
5432   gimple *def_arg_stmt;
5433   auto_vec<tree> vec_oprnds0;
5434   auto_vec<tree> vec_oprnds1;
5435   auto_vec<tree> vect_defs;
5436   auto_vec<gimple *> phis;
5437   int vec_num;
5438   tree def0, def1, tem, op1 = NULL_TREE;
5439   bool first_p = true;
5440   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5441   tree cond_reduc_val = NULL_TREE, const_cond_cmp = NULL_TREE;
5442
5443   /* In case of reduction chain we switch to the first stmt in the chain, but
5444      we don't update STMT_INFO, since only the last stmt is marked as reduction
5445      and has reduction properties.  */
5446   if (GROUP_FIRST_ELEMENT (stmt_info)
5447       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5448     {
5449       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5450       first_p = false;
5451     }
5452
5453   if (nested_in_vect_loop_p (loop, stmt))
5454     {
5455       outer_loop = loop;
5456       loop = loop->inner;
5457       nested_cycle = true;
5458     }
5459
5460   /* 1. Is vectorizable reduction?  */
5461   /* Not supportable if the reduction variable is used in the loop, unless
5462      it's a reduction chain.  */
5463   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5464       && !GROUP_FIRST_ELEMENT (stmt_info))
5465     return false;
5466
5467   /* Reductions that are not used even in an enclosing outer-loop,
5468      are expected to be "live" (used out of the loop).  */
5469   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5470       && !STMT_VINFO_LIVE_P (stmt_info))
5471     return false;
5472
5473   /* Make sure it was already recognized as a reduction computation.  */
5474   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5475       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5476     return false;
5477
5478   /* 2. Has this been recognized as a reduction pattern?
5479
5480      Check if STMT represents a pattern that has been recognized
5481      in earlier analysis stages.  For stmts that represent a pattern,
5482      the STMT_VINFO_RELATED_STMT field records the last stmt in
5483      the original sequence that constitutes the pattern.  */
5484
5485   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5486   if (orig_stmt)
5487     {
5488       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5489       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5490       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5491     }
5492
5493   /* 3. Check the operands of the operation.  The first operands are defined
5494         inside the loop body. The last operand is the reduction variable,
5495         which is defined by the loop-header-phi.  */
5496
5497   gcc_assert (is_gimple_assign (stmt));
5498
5499   /* Flatten RHS.  */
5500   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5501     {
5502     case GIMPLE_SINGLE_RHS:
5503       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5504       if (op_type == ternary_op)
5505         {
5506           tree rhs = gimple_assign_rhs1 (stmt);
5507           ops[0] = TREE_OPERAND (rhs, 0);
5508           ops[1] = TREE_OPERAND (rhs, 1);
5509           ops[2] = TREE_OPERAND (rhs, 2);
5510           code = TREE_CODE (rhs);
5511         }
5512       else
5513         return false;
5514       break;
5515
5516     case GIMPLE_BINARY_RHS:
5517       code = gimple_assign_rhs_code (stmt);
5518       op_type = TREE_CODE_LENGTH (code);
5519       gcc_assert (op_type == binary_op);
5520       ops[0] = gimple_assign_rhs1 (stmt);
5521       ops[1] = gimple_assign_rhs2 (stmt);
5522       break;
5523
5524     case GIMPLE_TERNARY_RHS:
5525       code = gimple_assign_rhs_code (stmt);
5526       op_type = TREE_CODE_LENGTH (code);
5527       gcc_assert (op_type == ternary_op);
5528       ops[0] = gimple_assign_rhs1 (stmt);
5529       ops[1] = gimple_assign_rhs2 (stmt);
5530       ops[2] = gimple_assign_rhs3 (stmt);
5531       break;
5532
5533     case GIMPLE_UNARY_RHS:
5534       return false;
5535
5536     default:
5537       gcc_unreachable ();
5538     }
5539   /* The default is that the reduction variable is the last in statement.  */
5540   int reduc_index = op_type - 1;
5541   if (code == MINUS_EXPR)
5542     reduc_index = 0;
5543
5544   if (code == COND_EXPR && slp_node)
5545     return false;
5546
5547   scalar_dest = gimple_assign_lhs (stmt);
5548   scalar_type = TREE_TYPE (scalar_dest);
5549   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5550       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5551     return false;
5552
5553   /* Do not try to vectorize bit-precision reductions.  */
5554   if ((TYPE_PRECISION (scalar_type)
5555        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5556     return false;
5557
5558   /* All uses but the last are expected to be defined in the loop.
5559      The last use is the reduction variable.  In case of nested cycle this
5560      assumption is not true: we use reduc_index to record the index of the
5561      reduction variable.  */
5562   for (i = 0; i < op_type; i++)
5563     {
5564       if (i == reduc_index)
5565         continue;
5566
5567       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5568       if (i == 0 && code == COND_EXPR)
5569         continue;
5570
5571       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5572                                           &def_stmt, &dt, &tem);
5573       if (!vectype_in)
5574         vectype_in = tem;
5575       gcc_assert (is_simple_use);
5576
5577       if (dt != vect_internal_def
5578           && dt != vect_external_def
5579           && dt != vect_constant_def
5580           && dt != vect_induction_def
5581           && !(dt == vect_nested_cycle && nested_cycle))
5582         return false;
5583
5584       if (dt == vect_nested_cycle)
5585         {
5586           found_nested_cycle_def = true;
5587           reduc_def_stmt = def_stmt;
5588           reduc_index = i;
5589         }
5590
5591       if (i == 1 && code == COND_EXPR)
5592         {
5593           /* Record how value of COND_EXPR is defined.  */
5594           if (dt == vect_constant_def)
5595             {
5596               cond_reduc_dt = dt;
5597               cond_reduc_val = ops[i];
5598             }
5599           if (dt == vect_induction_def && def_stmt != NULL
5600               && is_nonwrapping_integer_induction (def_stmt, loop))
5601             cond_reduc_dt = dt;
5602         }
5603     }
5604
5605   is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5606                                       &def_stmt, &dt, &tem);
5607   if (!vectype_in)
5608     vectype_in = tem;
5609   gcc_assert (is_simple_use);
5610   if (!found_nested_cycle_def)
5611     reduc_def_stmt = def_stmt;
5612
5613   if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5614     return false;
5615
5616   if (!(dt == vect_reduction_def
5617         || dt == vect_nested_cycle
5618         || ((dt == vect_internal_def || dt == vect_external_def
5619              || dt == vect_constant_def || dt == vect_induction_def)
5620             && nested_cycle && found_nested_cycle_def)))
5621     {
5622       /* For pattern recognized stmts, orig_stmt might be a reduction,
5623          but some helper statements for the pattern might not, or
5624          might be COND_EXPRs with reduction uses in the condition.  */
5625       gcc_assert (orig_stmt);
5626       return false;
5627     }
5628
5629   enum vect_reduction_type v_reduc_type;
5630   gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5631                                           !nested_cycle, &dummy, false,
5632                                           &v_reduc_type);
5633
5634   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5635   /* If we have a condition reduction, see if we can simplify it further.  */
5636   if (v_reduc_type == COND_REDUCTION)
5637     {
5638       if (cond_reduc_dt == vect_induction_def)
5639         {
5640           if (dump_enabled_p ())
5641             dump_printf_loc (MSG_NOTE, vect_location,
5642                              "condition expression based on "
5643                              "integer induction.\n");
5644           STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5645             = INTEGER_INDUC_COND_REDUCTION;
5646         }
5647
5648       if (cond_reduc_dt == vect_constant_def)
5649         {
5650           enum vect_def_type cond_initial_dt;
5651           gimple *def_stmt = SSA_NAME_DEF_STMT (ops[reduc_index]);
5652           tree cond_initial_val
5653             = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
5654
5655           gcc_assert (cond_reduc_val != NULL_TREE);
5656           vect_is_simple_use (cond_initial_val, loop_vinfo,
5657                               &def_stmt, &cond_initial_dt);
5658           if (cond_initial_dt == vect_constant_def
5659               && types_compatible_p (TREE_TYPE (cond_initial_val),
5660                                      TREE_TYPE (cond_reduc_val)))
5661             {
5662               tree e = fold_build2 (LE_EXPR, boolean_type_node,
5663                                     cond_initial_val, cond_reduc_val);
5664               if (e && (integer_onep (e) || integer_zerop (e)))
5665                 {
5666                   if (dump_enabled_p ())
5667                     dump_printf_loc (MSG_NOTE, vect_location,
5668                                      "condition expression based on "
5669                                      "compile time constant.\n");
5670                   const_cond_cmp = e;
5671                   STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5672                     = CONST_COND_REDUCTION;
5673                 }
5674             }
5675         }
5676     }
5677
5678   if (orig_stmt)
5679     gcc_assert (tmp == orig_stmt
5680                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5681   else
5682     /* We changed STMT to be the first stmt in reduction chain, hence we
5683        check that in this case the first element in the chain is STMT.  */
5684     gcc_assert (stmt == tmp
5685                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5686
5687   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5688     return false;
5689
5690   if (slp_node)
5691     ncopies = 1;
5692   else
5693     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5694                / TYPE_VECTOR_SUBPARTS (vectype_in));
5695
5696   gcc_assert (ncopies >= 1);
5697
5698   vec_mode = TYPE_MODE (vectype_in);
5699
5700   if (code == COND_EXPR)
5701     {
5702       /* Only call during the analysis stage, otherwise we'll lose
5703          STMT_VINFO_TYPE.  */
5704       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5705                                                 ops[reduc_index], 0, NULL))
5706         {
5707           if (dump_enabled_p ())
5708             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5709                              "unsupported condition in reduction\n");
5710           return false;
5711         }
5712     }
5713   else
5714     {
5715       /* 4. Supportable by target?  */
5716
5717       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5718           || code == LROTATE_EXPR || code == RROTATE_EXPR)
5719         {
5720           /* Shifts and rotates are only supported by vectorizable_shifts,
5721              not vectorizable_reduction.  */
5722           if (dump_enabled_p ())
5723             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5724                              "unsupported shift or rotation.\n");
5725           return false;
5726         }
5727
5728       /* 4.1. check support for the operation in the loop  */
5729       optab = optab_for_tree_code (code, vectype_in, optab_default);
5730       if (!optab)
5731         {
5732           if (dump_enabled_p ())
5733             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5734                              "no optab.\n");
5735
5736           return false;
5737         }
5738
5739       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5740         {
5741           if (dump_enabled_p ())
5742             dump_printf (MSG_NOTE, "op not supported by target.\n");
5743
5744           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5745               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5746                   < vect_min_worthwhile_factor (code))
5747             return false;
5748
5749           if (dump_enabled_p ())
5750             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5751         }
5752
5753       /* Worthwhile without SIMD support?  */
5754       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5755           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5756              < vect_min_worthwhile_factor (code))
5757         {
5758           if (dump_enabled_p ())
5759             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5760                              "not worthwhile without SIMD support.\n");
5761
5762           return false;
5763         }
5764     }
5765
5766   /* 4.2. Check support for the epilog operation.
5767
5768           If STMT represents a reduction pattern, then the type of the
5769           reduction variable may be different than the type of the rest
5770           of the arguments.  For example, consider the case of accumulation
5771           of shorts into an int accumulator; The original code:
5772                         S1: int_a = (int) short_a;
5773           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5774
5775           was replaced with:
5776                         STMT: int_acc = widen_sum <short_a, int_acc>
5777
5778           This means that:
5779           1. The tree-code that is used to create the vector operation in the
5780              epilog code (that reduces the partial results) is not the
5781              tree-code of STMT, but is rather the tree-code of the original
5782              stmt from the pattern that STMT is replacing.  I.e, in the example
5783              above we want to use 'widen_sum' in the loop, but 'plus' in the
5784              epilog.
5785           2. The type (mode) we use to check available target support
5786              for the vector operation to be created in the *epilog*, is
5787              determined by the type of the reduction variable (in the example
5788              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5789              However the type (mode) we use to check available target support
5790              for the vector operation to be created *inside the loop*, is
5791              determined by the type of the other arguments to STMT (in the
5792              example we'd check this: optab_handler (widen_sum_optab,
5793              vect_short_mode)).
5794
5795           This is contrary to "regular" reductions, in which the types of all
5796           the arguments are the same as the type of the reduction variable.
5797           For "regular" reductions we can therefore use the same vector type
5798           (and also the same tree-code) when generating the epilog code and
5799           when generating the code inside the loop.  */
5800
5801   if (orig_stmt)
5802     {
5803       /* This is a reduction pattern: get the vectype from the type of the
5804          reduction variable, and get the tree-code from orig_stmt.  */
5805       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5806                   == TREE_CODE_REDUCTION);
5807       orig_code = gimple_assign_rhs_code (orig_stmt);
5808       gcc_assert (vectype_out);
5809       vec_mode = TYPE_MODE (vectype_out);
5810     }
5811   else
5812     {
5813       /* Regular reduction: use the same vectype and tree-code as used for
5814          the vector code inside the loop can be used for the epilog code. */
5815       orig_code = code;
5816
5817       if (code == MINUS_EXPR)
5818         orig_code = PLUS_EXPR;
5819
5820       /* For simple condition reductions, replace with the actual expression
5821          we want to base our reduction around.  */
5822       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == CONST_COND_REDUCTION)
5823         {
5824           gcc_assert (const_cond_cmp != NULL_TREE);
5825           gcc_assert (integer_onep (const_cond_cmp)
5826                       || integer_zerop (const_cond_cmp));
5827           orig_code = integer_onep (const_cond_cmp) ? MAX_EXPR : MIN_EXPR;
5828         }
5829       else if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5830                  == INTEGER_INDUC_COND_REDUCTION)
5831         orig_code = MAX_EXPR;
5832     }
5833
5834   if (nested_cycle)
5835     {
5836       def_bb = gimple_bb (reduc_def_stmt);
5837       def_stmt_loop = def_bb->loop_father;
5838       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5839                                        loop_preheader_edge (def_stmt_loop));
5840       if (TREE_CODE (def_arg) == SSA_NAME
5841           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5842           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5843           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5844           && vinfo_for_stmt (def_arg_stmt)
5845           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5846               == vect_double_reduction_def)
5847         double_reduc = true;
5848     }
5849
5850   epilog_reduc_code = ERROR_MARK;
5851
5852   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != COND_REDUCTION)
5853     {
5854       if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5855         {
5856           reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5857                                          optab_default);
5858           if (!reduc_optab)
5859             {
5860               if (dump_enabled_p ())
5861                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5862                                  "no optab for reduction.\n");
5863
5864               epilog_reduc_code = ERROR_MARK;
5865             }
5866           else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5867             {
5868               if (dump_enabled_p ())
5869                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5870                                  "reduc op not supported by target.\n");
5871
5872               epilog_reduc_code = ERROR_MARK;
5873             }
5874
5875           /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5876              generated in the epilog using multiple expressions.  This does not
5877              work for condition reductions.  */
5878           if (epilog_reduc_code == ERROR_MARK
5879               && (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5880                         == INTEGER_INDUC_COND_REDUCTION
5881                   || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5882                         == CONST_COND_REDUCTION))
5883             {
5884               if (dump_enabled_p ())
5885                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5886                                  "no reduc code for scalar code.\n");
5887               return false;
5888             }
5889         }
5890       else
5891         {
5892           if (!nested_cycle || double_reduc)
5893             {
5894               if (dump_enabled_p ())
5895                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5896                                  "no reduc code for scalar code.\n");
5897
5898               return false;
5899             }
5900         }
5901     }
5902   else
5903     {
5904       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5905       cr_index_scalar_type = make_unsigned_type (scalar_precision);
5906       cr_index_vector_type = build_vector_type
5907         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5908
5909       epilog_reduc_code = REDUC_MAX_EXPR;
5910       optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5911                                    optab_default);
5912       if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5913           == CODE_FOR_nothing)
5914         {
5915           if (dump_enabled_p ())
5916             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5917                              "reduc max op not supported by target.\n");
5918           return false;
5919         }
5920     }
5921
5922   if ((double_reduc
5923        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) != TREE_CODE_REDUCTION)
5924       && ncopies > 1)
5925     {
5926       if (dump_enabled_p ())
5927         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5928                          "multiple types in double reduction or condition "
5929                          "reduction.\n");
5930       return false;
5931     }
5932
5933   /* In case of widenning multiplication by a constant, we update the type
5934      of the constant to be the type of the other operand.  We check that the
5935      constant fits the type in the pattern recognition pass.  */
5936   if (code == DOT_PROD_EXPR
5937       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5938     {
5939       if (TREE_CODE (ops[0]) == INTEGER_CST)
5940         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5941       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5942         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5943       else
5944         {
5945           if (dump_enabled_p ())
5946             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5947                              "invalid types in dot-prod\n");
5948
5949           return false;
5950         }
5951     }
5952
5953   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5954     {
5955       widest_int ni;
5956
5957       if (! max_loop_iterations (loop, &ni))
5958         {
5959           if (dump_enabled_p ())
5960             dump_printf_loc (MSG_NOTE, vect_location,
5961                              "loop count not known, cannot create cond "
5962                              "reduction.\n");
5963           return false;
5964         }
5965       /* Convert backedges to iterations.  */
5966       ni += 1;
5967
5968       /* The additional index will be the same type as the condition.  Check
5969          that the loop can fit into this less one (because we'll use up the
5970          zero slot for when there are no matches).  */
5971       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
5972       if (wi::geu_p (ni, wi::to_widest (max_index)))
5973         {
5974           if (dump_enabled_p ())
5975             dump_printf_loc (MSG_NOTE, vect_location,
5976                              "loop size is greater than data size.\n");
5977           return false;
5978         }
5979     }
5980
5981   if (!vec_stmt) /* transformation not required.  */
5982     {
5983       if (first_p
5984           && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5985                                          reduc_index))
5986         return false;
5987       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5988       return true;
5989     }
5990
5991   /** Transform.  **/
5992
5993   if (dump_enabled_p ())
5994     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5995
5996   /* FORNOW: Multiple types are not supported for condition.  */
5997   if (code == COND_EXPR)
5998     gcc_assert (ncopies == 1);
5999
6000   /* Create the destination vector  */
6001   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
6002
6003   /* In case the vectorization factor (VF) is bigger than the number
6004      of elements that we can fit in a vectype (nunits), we have to generate
6005      more than one vector stmt - i.e - we need to "unroll" the
6006      vector stmt by a factor VF/nunits.  For more details see documentation
6007      in vectorizable_operation.  */
6008
6009   /* If the reduction is used in an outer loop we need to generate
6010      VF intermediate results, like so (e.g. for ncopies=2):
6011         r0 = phi (init, r0)
6012         r1 = phi (init, r1)
6013         r0 = x0 + r0;
6014         r1 = x1 + r1;
6015     (i.e. we generate VF results in 2 registers).
6016     In this case we have a separate def-use cycle for each copy, and therefore
6017     for each copy we get the vector def for the reduction variable from the
6018     respective phi node created for this copy.
6019
6020     Otherwise (the reduction is unused in the loop nest), we can combine
6021     together intermediate results, like so (e.g. for ncopies=2):
6022         r = phi (init, r)
6023         r = x0 + r;
6024         r = x1 + r;
6025    (i.e. we generate VF/2 results in a single register).
6026    In this case for each copy we get the vector def for the reduction variable
6027    from the vectorized reduction operation generated in the previous iteration.
6028   */
6029
6030   if (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
6031     {
6032       single_defuse_cycle = true;
6033       epilog_copies = 1;
6034     }
6035   else
6036     epilog_copies = ncopies;
6037
6038   prev_stmt_info = NULL;
6039   prev_phi_info = NULL;
6040   if (slp_node)
6041     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6042   else
6043     {
6044       vec_num = 1;
6045       vec_oprnds0.create (1);
6046       if (op_type == ternary_op)
6047         vec_oprnds1.create (1);
6048     }
6049
6050   phis.create (vec_num);
6051   vect_defs.create (vec_num);
6052   if (!slp_node)
6053     vect_defs.quick_push (NULL_TREE);
6054
6055   for (j = 0; j < ncopies; j++)
6056     {
6057       if (j == 0 || !single_defuse_cycle)
6058         {
6059           for (i = 0; i < vec_num; i++)
6060             {
6061               /* Create the reduction-phi that defines the reduction
6062                  operand.  */
6063               new_phi = create_phi_node (vec_dest, loop->header);
6064               set_vinfo_for_stmt (new_phi,
6065                                   new_stmt_vec_info (new_phi, loop_vinfo));
6066                if (j == 0 || slp_node)
6067                  phis.quick_push (new_phi);
6068             }
6069         }
6070
6071       if (code == COND_EXPR)
6072         {
6073           gcc_assert (!slp_node);
6074           vectorizable_condition (stmt, gsi, vec_stmt,
6075                                   PHI_RESULT (phis[0]),
6076                                   reduc_index, NULL);
6077           /* Multiple types are not supported for condition.  */
6078           break;
6079         }
6080
6081       /* Handle uses.  */
6082       if (j == 0)
6083         {
6084           if (slp_node)
6085             {
6086               /* Get vec defs for all the operands except the reduction index,
6087                 ensuring the ordering of the ops in the vector is kept.  */
6088               auto_vec<tree, 3> slp_ops;
6089               auto_vec<vec<tree>, 3> vec_defs;
6090
6091               slp_ops.quick_push ((reduc_index == 0) ? NULL : ops[0]);
6092               slp_ops.quick_push ((reduc_index == 1) ? NULL : ops[1]);
6093               if (op_type == ternary_op)
6094                 slp_ops.quick_push ((reduc_index == 2) ? NULL : ops[2]);
6095
6096               vect_get_slp_defs (slp_ops, slp_node, &vec_defs, -1);
6097
6098               vec_oprnds0.safe_splice (vec_defs[(reduc_index == 0) ? 1 : 0]);
6099               if (op_type == ternary_op)
6100                 vec_oprnds1.safe_splice (vec_defs[(reduc_index == 2) ? 1 : 2]);
6101             }
6102           else
6103             {
6104               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
6105                                                             stmt);
6106               vec_oprnds0.quick_push (loop_vec_def0);
6107               if (op_type == ternary_op)
6108                {
6109                  op1 = (reduc_index == 0) ? ops[2] : ops[1];
6110                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
6111                  vec_oprnds1.quick_push (loop_vec_def1);
6112                }
6113             }
6114         }
6115       else
6116         {
6117           if (!slp_node)
6118             {
6119               enum vect_def_type dt;
6120               gimple *dummy_stmt;
6121
6122               vect_is_simple_use (ops[!reduc_index], loop_vinfo,
6123                                   &dummy_stmt, &dt);
6124               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
6125                                                               loop_vec_def0);
6126               vec_oprnds0[0] = loop_vec_def0;
6127               if (op_type == ternary_op)
6128                 {
6129                   vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
6130                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
6131                                                                 loop_vec_def1);
6132                   vec_oprnds1[0] = loop_vec_def1;
6133                 }
6134             }
6135
6136           if (single_defuse_cycle)
6137             reduc_def = gimple_assign_lhs (new_stmt);
6138
6139           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6140         }
6141
6142       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6143         {
6144           if (slp_node)
6145             reduc_def = PHI_RESULT (phis[i]);
6146           else
6147             {
6148               if (!single_defuse_cycle || j == 0)
6149                 reduc_def = PHI_RESULT (new_phi);
6150             }
6151
6152           def1 = ((op_type == ternary_op)
6153                   ? vec_oprnds1[i] : NULL);
6154           if (op_type == binary_op)
6155             {
6156               if (reduc_index == 0)
6157                 expr = build2 (code, vectype_out, reduc_def, def0);
6158               else
6159                 expr = build2 (code, vectype_out, def0, reduc_def);
6160             }
6161           else
6162             {
6163               if (reduc_index == 0)
6164                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
6165               else
6166                 {
6167                   if (reduc_index == 1)
6168                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
6169                   else
6170                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
6171                 }
6172             }
6173
6174           new_stmt = gimple_build_assign (vec_dest, expr);
6175           new_temp = make_ssa_name (vec_dest, new_stmt);
6176           gimple_assign_set_lhs (new_stmt, new_temp);
6177           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6178
6179           if (slp_node)
6180             {
6181               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6182               vect_defs.quick_push (new_temp);
6183             }
6184           else
6185             vect_defs[0] = new_temp;
6186         }
6187
6188       if (slp_node)
6189         continue;
6190
6191       if (j == 0)
6192         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6193       else
6194         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6195
6196       prev_stmt_info = vinfo_for_stmt (new_stmt);
6197       prev_phi_info = vinfo_for_stmt (new_phi);
6198     }
6199
6200   tree indx_before_incr, indx_after_incr, cond_name = NULL;
6201
6202   /* Finalize the reduction-phi (set its arguments) and create the
6203      epilog reduction code.  */
6204   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6205     {
6206       new_temp = gimple_assign_lhs (*vec_stmt);
6207       vect_defs[0] = new_temp;
6208
6209       /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6210          which is updated with the current index of the loop for every match of
6211          the original loop's cond_expr (VEC_STMT).  This results in a vector
6212          containing the last time the condition passed for that vector lane.
6213          The first match will be a 1 to allow 0 to be used for non-matching
6214          indexes.  If there are no matches at all then the vector will be all
6215          zeroes.  */
6216       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6217         {
6218           int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6219           int k;
6220
6221           gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
6222
6223           /* First we create a simple vector induction variable which starts
6224              with the values {1,2,3,...} (SERIES_VECT) and increments by the
6225              vector size (STEP).  */
6226
6227           /* Create a {1,2,3,...} vector.  */
6228           tree *vtemp = XALLOCAVEC (tree, nunits_out);
6229           for (k = 0; k < nunits_out; ++k)
6230             vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
6231           tree series_vect = build_vector (cr_index_vector_type, vtemp);
6232
6233           /* Create a vector of the step value.  */
6234           tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6235           tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6236
6237           /* Create an induction variable.  */
6238           gimple_stmt_iterator incr_gsi;
6239           bool insert_after;
6240           standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6241           create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
6242                      insert_after, &indx_before_incr, &indx_after_incr);
6243
6244           /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6245              filled with zeros (VEC_ZERO).  */
6246
6247           /* Create a vector of 0s.  */
6248           tree zero = build_zero_cst (cr_index_scalar_type);
6249           tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6250
6251           /* Create a vector phi node.  */
6252           tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6253           new_phi = create_phi_node (new_phi_tree, loop->header);
6254           set_vinfo_for_stmt (new_phi,
6255                               new_stmt_vec_info (new_phi, loop_vinfo));
6256           add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
6257                        UNKNOWN_LOCATION);
6258
6259           /* Now take the condition from the loops original cond_expr
6260              (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6261              every match uses values from the induction variable
6262              (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6263              (NEW_PHI_TREE).
6264              Finally, we update the phi (NEW_PHI_TREE) to take the value of
6265              the new cond_expr (INDEX_COND_EXPR).  */
6266
6267           /* Duplicate the condition from vec_stmt.  */
6268           tree ccompare = unshare_expr (gimple_assign_rhs1 (*vec_stmt));
6269
6270           /* Create a conditional, where the condition is taken from vec_stmt
6271              (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
6272              else is the phi (NEW_PHI_TREE).  */
6273           tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
6274                                          ccompare, indx_before_incr,
6275                                          new_phi_tree);
6276           cond_name = make_ssa_name (cr_index_vector_type);
6277           gimple *index_condition = gimple_build_assign (cond_name,
6278                                                          index_cond_expr);
6279           gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
6280           stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
6281                                                             loop_vinfo);
6282           STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
6283           set_vinfo_for_stmt (index_condition, index_vec_info);
6284
6285           /* Update the phi with the vec cond.  */
6286           add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
6287                        UNKNOWN_LOCATION);
6288         }
6289     }
6290
6291   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6292                                     epilog_reduc_code, phis, reduc_index,
6293                                     double_reduc, slp_node, cond_name);
6294
6295   return true;
6296 }
6297
6298 /* Function vect_min_worthwhile_factor.
6299
6300    For a loop where we could vectorize the operation indicated by CODE,
6301    return the minimum vectorization factor that makes it worthwhile
6302    to use generic vectors.  */
6303 int
6304 vect_min_worthwhile_factor (enum tree_code code)
6305 {
6306   switch (code)
6307     {
6308     case PLUS_EXPR:
6309     case MINUS_EXPR:
6310     case NEGATE_EXPR:
6311       return 4;
6312
6313     case BIT_AND_EXPR:
6314     case BIT_IOR_EXPR:
6315     case BIT_XOR_EXPR:
6316     case BIT_NOT_EXPR:
6317       return 2;
6318
6319     default:
6320       return INT_MAX;
6321     }
6322 }
6323
6324
6325 /* Function vectorizable_induction
6326
6327    Check if PHI performs an induction computation that can be vectorized.
6328    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6329    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6330    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6331
6332 bool
6333 vectorizable_induction (gimple *phi,
6334                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6335                         gimple **vec_stmt)
6336 {
6337   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6338   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6339   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6340   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6341   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6342   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6343   tree vec_def;
6344
6345   gcc_assert (ncopies >= 1);
6346   /* FORNOW. These restrictions should be relaxed.  */
6347   if (nested_in_vect_loop_p (loop, phi))
6348     {
6349       imm_use_iterator imm_iter;
6350       use_operand_p use_p;
6351       gimple *exit_phi;
6352       edge latch_e;
6353       tree loop_arg;
6354
6355       if (ncopies > 1)
6356         {
6357           if (dump_enabled_p ())
6358             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6359                              "multiple types in nested loop.\n");
6360           return false;
6361         }
6362
6363       exit_phi = NULL;
6364       latch_e = loop_latch_edge (loop->inner);
6365       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6366       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6367         {
6368           gimple *use_stmt = USE_STMT (use_p);
6369           if (is_gimple_debug (use_stmt))
6370             continue;
6371
6372           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6373             {
6374               exit_phi = use_stmt;
6375               break;
6376             }
6377         }
6378       if (exit_phi)
6379         {
6380           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6381           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6382                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6383             {
6384               if (dump_enabled_p ())
6385                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6386                                  "inner-loop induction only used outside "
6387                                  "of the outer vectorized loop.\n");
6388               return false;
6389             }
6390         }
6391     }
6392
6393   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6394     return false;
6395
6396   /* FORNOW: SLP not supported.  */
6397   if (STMT_SLP_TYPE (stmt_info))
6398     return false;
6399
6400   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
6401
6402   if (gimple_code (phi) != GIMPLE_PHI)
6403     return false;
6404
6405   if (!vec_stmt) /* transformation not required.  */
6406     {
6407       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6408       if (dump_enabled_p ())
6409         dump_printf_loc (MSG_NOTE, vect_location,
6410                          "=== vectorizable_induction ===\n");
6411       vect_model_induction_cost (stmt_info, ncopies);
6412       return true;
6413     }
6414
6415   /** Transform.  **/
6416
6417   if (dump_enabled_p ())
6418     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6419
6420   vec_def = get_initial_def_for_induction (phi);
6421   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
6422   return true;
6423 }
6424
6425 /* Function vectorizable_live_operation.
6426
6427    STMT computes a value that is used outside the loop.  Check if
6428    it can be supported.  */
6429
6430 bool
6431 vectorizable_live_operation (gimple *stmt,
6432                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6433                              slp_tree slp_node, int slp_index,
6434                              gimple **vec_stmt)
6435 {
6436   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6437   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6438   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6439   imm_use_iterator imm_iter;
6440   tree lhs, lhs_type, bitsize, vec_bitsize;
6441   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6442   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6443   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6444   gimple *use_stmt;
6445   auto_vec<tree> vec_oprnds;
6446
6447   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6448
6449   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6450     return false;
6451
6452   /* FORNOW.  CHECKME.  */
6453   if (nested_in_vect_loop_p (loop, stmt))
6454     return false;
6455
6456   /* If STMT is not relevant and it is a simple assignment and its inputs are
6457      invariant then it can remain in place, unvectorized.  The original last
6458      scalar value that it computes will be used.  */
6459   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6460     {
6461       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
6462       if (dump_enabled_p ())
6463         dump_printf_loc (MSG_NOTE, vect_location,
6464                          "statement is simple and uses invariant.  Leaving in "
6465                          "place.\n");
6466       return true;
6467     }
6468
6469   if (!vec_stmt)
6470     /* No transformation required.  */
6471     return true;
6472
6473   /* If stmt has a related stmt, then use that for getting the lhs.  */
6474   if (is_pattern_stmt_p (stmt_info))
6475     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
6476
6477   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
6478         : gimple_get_lhs (stmt);
6479   lhs_type = TREE_TYPE (lhs);
6480
6481   /* Find all uses of STMT outside the loop - there should be at least one.  */
6482   auto_vec<gimple *, 4> worklist;
6483   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
6484     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
6485         && !is_gimple_debug (use_stmt))
6486       worklist.safe_push (use_stmt);
6487   gcc_assert (worklist.length () >= 1);
6488
6489   bitsize = TYPE_SIZE (TREE_TYPE (vectype));
6490   vec_bitsize = TYPE_SIZE (vectype);
6491
6492   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
6493   tree vec_lhs, bitstart;
6494   if (slp_node)
6495     {
6496       gcc_assert (slp_index >= 0);
6497
6498       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6499       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6500
6501       /* Get the last occurrence of the scalar index from the concatenation of
6502          all the slp vectors. Calculate which slp vector it is and the index
6503          within.  */
6504       int pos = (num_vec * nunits) - num_scalar + slp_index;
6505       int vec_entry = pos / nunits;
6506       int vec_index = pos % nunits;
6507
6508       /* Get the correct slp vectorized stmt.  */
6509       vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
6510
6511       /* Get entry to use.  */
6512       bitstart = build_int_cst (unsigned_type_node, vec_index);
6513       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
6514     }
6515   else
6516     {
6517       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
6518       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
6519
6520       /* For multiple copies, get the last copy.  */
6521       for (int i = 1; i < ncopies; ++i)
6522         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
6523                                                   vec_lhs);
6524
6525       /* Get the last lane in the vector.  */
6526       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
6527     }
6528
6529   /* Create a new vectorized stmt for the uses of STMT and insert outside the
6530      loop.  */
6531   gimple_seq stmts = NULL;
6532   tree new_tree = build3 (BIT_FIELD_REF, TREE_TYPE (vectype), vec_lhs, bitsize,
6533                           bitstart);
6534   new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
6535                                    true, NULL_TREE);
6536   if (stmts)
6537     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
6538
6539   /* Replace all uses of the USE_STMT in the worklist with the newly inserted
6540      statement.  */
6541   while (!worklist.is_empty ())
6542     {
6543       use_stmt = worklist.pop ();
6544       replace_uses_by (gimple_phi_result (use_stmt), new_tree);
6545       update_stmt (use_stmt);
6546     }
6547
6548   return true;
6549 }
6550
6551 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
6552
6553 static void
6554 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
6555 {
6556   ssa_op_iter op_iter;
6557   imm_use_iterator imm_iter;
6558   def_operand_p def_p;
6559   gimple *ustmt;
6560
6561   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
6562     {
6563       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
6564         {
6565           basic_block bb;
6566
6567           if (!is_gimple_debug (ustmt))
6568             continue;
6569
6570           bb = gimple_bb (ustmt);
6571
6572           if (!flow_bb_inside_loop_p (loop, bb))
6573             {
6574               if (gimple_debug_bind_p (ustmt))
6575                 {
6576                   if (dump_enabled_p ())
6577                     dump_printf_loc (MSG_NOTE, vect_location,
6578                                      "killing debug use\n");
6579
6580                   gimple_debug_bind_reset_value (ustmt);
6581                   update_stmt (ustmt);
6582                 }
6583               else
6584                 gcc_unreachable ();
6585             }
6586         }
6587     }
6588 }
6589
6590
6591 /* This function builds ni_name = number of iterations.  Statements
6592    are emitted on the loop preheader edge.  */
6593
6594 static tree
6595 vect_build_loop_niters (loop_vec_info loop_vinfo)
6596 {
6597   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6598   if (TREE_CODE (ni) == INTEGER_CST)
6599     return ni;
6600   else
6601     {
6602       tree ni_name, var;
6603       gimple_seq stmts = NULL;
6604       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6605
6606       var = create_tmp_var (TREE_TYPE (ni), "niters");
6607       ni_name = force_gimple_operand (ni, &stmts, false, var);
6608       if (stmts)
6609         gsi_insert_seq_on_edge_immediate (pe, stmts);
6610
6611       return ni_name;
6612     }
6613 }
6614
6615
6616 /* This function generates the following statements:
6617
6618    ni_name = number of iterations loop executes
6619    ratio = ni_name / vf
6620    ratio_mult_vf_name = ratio * vf
6621
6622    and places them on the loop preheader edge.  */
6623
6624 static void
6625 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6626                                  tree ni_name,
6627                                  tree *ratio_mult_vf_name_ptr,
6628                                  tree *ratio_name_ptr)
6629 {
6630   tree ni_minus_gap_name;
6631   tree var;
6632   tree ratio_name;
6633   tree ratio_mult_vf_name;
6634   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6635   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6636   tree log_vf;
6637
6638   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
6639
6640   /* If epilogue loop is required because of data accesses with gaps, we
6641      subtract one iteration from the total number of iterations here for
6642      correct calculation of RATIO.  */
6643   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6644     {
6645       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6646                                        ni_name,
6647                                        build_one_cst (TREE_TYPE (ni_name)));
6648       if (!is_gimple_val (ni_minus_gap_name))
6649         {
6650           var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
6651           gimple *stmts = NULL;
6652           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
6653                                                     true, var);
6654           gsi_insert_seq_on_edge_immediate (pe, stmts);
6655         }
6656     }
6657   else
6658     ni_minus_gap_name = ni_name;
6659
6660   /* Create: ratio = ni >> log2(vf) */
6661   /* ???  As we have ni == number of latch executions + 1, ni could
6662      have overflown to zero.  So avoid computing ratio based on ni
6663      but compute it using the fact that we know ratio will be at least
6664      one, thus via (ni - vf) >> log2(vf) + 1.  */
6665   ratio_name
6666     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
6667                    fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
6668                                 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6669                                              ni_minus_gap_name,
6670                                              build_int_cst
6671                                                (TREE_TYPE (ni_name), vf)),
6672                                 log_vf),
6673                    build_int_cst (TREE_TYPE (ni_name), 1));
6674   if (!is_gimple_val (ratio_name))
6675     {
6676       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
6677       gimple *stmts = NULL;
6678       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6679       gsi_insert_seq_on_edge_immediate (pe, stmts);
6680     }
6681   *ratio_name_ptr = ratio_name;
6682
6683   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
6684
6685   if (ratio_mult_vf_name_ptr)
6686     {
6687       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6688                                         ratio_name, log_vf);
6689       if (!is_gimple_val (ratio_mult_vf_name))
6690         {
6691           var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
6692           gimple *stmts = NULL;
6693           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6694                                                      true, var);
6695           gsi_insert_seq_on_edge_immediate (pe, stmts);
6696         }
6697       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6698     }
6699
6700   return;
6701 }
6702
6703
6704 /* Function vect_transform_loop.
6705
6706    The analysis phase has determined that the loop is vectorizable.
6707    Vectorize the loop - created vectorized stmts to replace the scalar
6708    stmts in the loop, and update the loop exit condition.  */
6709
6710 void
6711 vect_transform_loop (loop_vec_info loop_vinfo)
6712 {
6713   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6714   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6715   int nbbs = loop->num_nodes;
6716   int i;
6717   tree ratio = NULL;
6718   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6719   bool grouped_store;
6720   bool slp_scheduled = false;
6721   gimple *stmt, *pattern_stmt;
6722   gimple_seq pattern_def_seq = NULL;
6723   gimple_stmt_iterator pattern_def_si = gsi_none ();
6724   bool transform_pattern_stmt = false;
6725   bool check_profitability = false;
6726   int th;
6727   /* Record number of iterations before we started tampering with the profile. */
6728   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6729
6730   if (dump_enabled_p ())
6731     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6732
6733   /* If profile is inprecise, we have chance to fix it up.  */
6734   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6735     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6736
6737   /* Use the more conservative vectorization threshold.  If the number
6738      of iterations is constant assume the cost check has been performed
6739      by our caller.  If the threshold makes all loops profitable that
6740      run at least the vectorization factor number of times checking
6741      is pointless, too.  */
6742   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6743   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6744       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6745     {
6746       if (dump_enabled_p ())
6747         dump_printf_loc (MSG_NOTE, vect_location,
6748                          "Profitability threshold is %d loop iterations.\n",
6749                          th);
6750       check_profitability = true;
6751     }
6752
6753   /* Make sure there exists a single-predecessor exit bb.  Do this before
6754      versioning.   */
6755   edge e = single_exit (loop);
6756   if (! single_pred_p (e->dest))
6757     {
6758       split_loop_exit_edge (e);
6759       if (dump_enabled_p ())
6760         dump_printf (MSG_NOTE, "split exit edge\n");
6761     }
6762
6763   /* Version the loop first, if required, so the profitability check
6764      comes first.  */
6765
6766   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
6767     {
6768       vect_loop_versioning (loop_vinfo, th, check_profitability);
6769       check_profitability = false;
6770     }
6771
6772   /* Make sure there exists a single-predecessor exit bb also on the
6773      scalar loop copy.  Do this after versioning but before peeling
6774      so CFG structure is fine for both scalar and if-converted loop
6775      to make slpeel_duplicate_current_defs_from_edges face matched
6776      loop closed PHI nodes on the exit.  */
6777   if (LOOP_VINFO_SCALAR_LOOP (loop_vinfo))
6778     {
6779       e = single_exit (LOOP_VINFO_SCALAR_LOOP (loop_vinfo));
6780       if (! single_pred_p (e->dest))
6781         {
6782           split_loop_exit_edge (e);
6783           if (dump_enabled_p ())
6784             dump_printf (MSG_NOTE, "split exit edge of scalar loop\n");
6785         }
6786     }
6787
6788   tree ni_name = vect_build_loop_niters (loop_vinfo);
6789   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
6790
6791   /* Peel the loop if there are data refs with unknown alignment.
6792      Only one data ref with unknown store is allowed.  */
6793
6794   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
6795     {
6796       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
6797                                      th, check_profitability);
6798       check_profitability = false;
6799       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
6800          be re-computed.  */
6801       ni_name = NULL_TREE;
6802     }
6803
6804   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6805      compile time constant), or it is a constant that doesn't divide by the
6806      vectorization factor, then an epilog loop needs to be created.
6807      We therefore duplicate the loop: the original loop will be vectorized,
6808      and will compute the first (n/VF) iterations.  The second copy of the loop
6809      will remain scalar and will compute the remaining (n%VF) iterations.
6810      (VF is the vectorization factor).  */
6811
6812   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6813       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6814     {
6815       tree ratio_mult_vf;
6816       if (!ni_name)
6817         ni_name = vect_build_loop_niters (loop_vinfo);
6818       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6819                                        &ratio);
6820       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6821                                       th, check_profitability);
6822     }
6823   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6824     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6825                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6826   else
6827     {
6828       if (!ni_name)
6829         ni_name = vect_build_loop_niters (loop_vinfo);
6830       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6831     }
6832
6833   /* 1) Make sure the loop header has exactly two entries
6834      2) Make sure we have a preheader basic block.  */
6835
6836   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6837
6838   split_edge (loop_preheader_edge (loop));
6839
6840   /* FORNOW: the vectorizer supports only loops which body consist
6841      of one basic block (header + empty latch). When the vectorizer will
6842      support more involved loop forms, the order by which the BBs are
6843      traversed need to be reconsidered.  */
6844
6845   for (i = 0; i < nbbs; i++)
6846     {
6847       basic_block bb = bbs[i];
6848       stmt_vec_info stmt_info;
6849
6850       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6851            gsi_next (&si))
6852         {
6853           gphi *phi = si.phi ();
6854           if (dump_enabled_p ())
6855             {
6856               dump_printf_loc (MSG_NOTE, vect_location,
6857                                "------>vectorizing phi: ");
6858               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6859             }
6860           stmt_info = vinfo_for_stmt (phi);
6861           if (!stmt_info)
6862             continue;
6863
6864           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6865             vect_loop_kill_debug_uses (loop, phi);
6866
6867           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6868               && !STMT_VINFO_LIVE_P (stmt_info))
6869             continue;
6870
6871           if (STMT_VINFO_VECTYPE (stmt_info)
6872               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6873                   != (unsigned HOST_WIDE_INT) vectorization_factor)
6874               && dump_enabled_p ())
6875             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6876
6877           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6878             {
6879               if (dump_enabled_p ())
6880                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6881               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6882             }
6883         }
6884
6885       pattern_stmt = NULL;
6886       for (gimple_stmt_iterator si = gsi_start_bb (bb);
6887            !gsi_end_p (si) || transform_pattern_stmt;)
6888         {
6889           bool is_store;
6890
6891           if (transform_pattern_stmt)
6892             stmt = pattern_stmt;
6893           else
6894             {
6895               stmt = gsi_stmt (si);
6896               /* During vectorization remove existing clobber stmts.  */
6897               if (gimple_clobber_p (stmt))
6898                 {
6899                   unlink_stmt_vdef (stmt);
6900                   gsi_remove (&si, true);
6901                   release_defs (stmt);
6902                   continue;
6903                 }
6904             }
6905
6906           if (dump_enabled_p ())
6907             {
6908               dump_printf_loc (MSG_NOTE, vect_location,
6909                                "------>vectorizing statement: ");
6910               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6911             }
6912
6913           stmt_info = vinfo_for_stmt (stmt);
6914
6915           /* vector stmts created in the outer-loop during vectorization of
6916              stmts in an inner-loop may not have a stmt_info, and do not
6917              need to be vectorized.  */
6918           if (!stmt_info)
6919             {
6920               gsi_next (&si);
6921               continue;
6922             }
6923
6924           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6925             vect_loop_kill_debug_uses (loop, stmt);
6926
6927           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6928               && !STMT_VINFO_LIVE_P (stmt_info))
6929             {
6930               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6931                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6932                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6933                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6934                 {
6935                   stmt = pattern_stmt;
6936                   stmt_info = vinfo_for_stmt (stmt);
6937                 }
6938               else
6939                 {
6940                   gsi_next (&si);
6941                   continue;
6942                 }
6943             }
6944           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6945                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6946                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6947                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6948             transform_pattern_stmt = true;
6949
6950           /* If pattern statement has def stmts, vectorize them too.  */
6951           if (is_pattern_stmt_p (stmt_info))
6952             {
6953               if (pattern_def_seq == NULL)
6954                 {
6955                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6956                   pattern_def_si = gsi_start (pattern_def_seq);
6957                 }
6958               else if (!gsi_end_p (pattern_def_si))
6959                 gsi_next (&pattern_def_si);
6960               if (pattern_def_seq != NULL)
6961                 {
6962                   gimple *pattern_def_stmt = NULL;
6963                   stmt_vec_info pattern_def_stmt_info = NULL;
6964
6965                   while (!gsi_end_p (pattern_def_si))
6966                     {
6967                       pattern_def_stmt = gsi_stmt (pattern_def_si);
6968                       pattern_def_stmt_info
6969                         = vinfo_for_stmt (pattern_def_stmt);
6970                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6971                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6972                         break;
6973                       gsi_next (&pattern_def_si);
6974                     }
6975
6976                   if (!gsi_end_p (pattern_def_si))
6977                     {
6978                       if (dump_enabled_p ())
6979                         {
6980                           dump_printf_loc (MSG_NOTE, vect_location,
6981                                            "==> vectorizing pattern def "
6982                                            "stmt: ");
6983                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6984                                             pattern_def_stmt, 0);
6985                         }
6986
6987                       stmt = pattern_def_stmt;
6988                       stmt_info = pattern_def_stmt_info;
6989                     }
6990                   else
6991                     {
6992                       pattern_def_si = gsi_none ();
6993                       transform_pattern_stmt = false;
6994                     }
6995                 }
6996               else
6997                 transform_pattern_stmt = false;
6998             }
6999
7000           if (STMT_VINFO_VECTYPE (stmt_info))
7001             {
7002               unsigned int nunits
7003                 = (unsigned int)
7004                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
7005               if (!STMT_SLP_TYPE (stmt_info)
7006                   && nunits != (unsigned int) vectorization_factor
7007                   && dump_enabled_p ())
7008                   /* For SLP VF is set according to unrolling factor, and not
7009                      to vector size, hence for SLP this print is not valid.  */
7010                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
7011             }
7012
7013           /* SLP. Schedule all the SLP instances when the first SLP stmt is
7014              reached.  */
7015           if (STMT_SLP_TYPE (stmt_info))
7016             {
7017               if (!slp_scheduled)
7018                 {
7019                   slp_scheduled = true;
7020
7021                   if (dump_enabled_p ())
7022                     dump_printf_loc (MSG_NOTE, vect_location,
7023                                      "=== scheduling SLP instances ===\n");
7024
7025                   vect_schedule_slp (loop_vinfo);
7026                 }
7027
7028               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
7029               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
7030                 {
7031                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7032                     {
7033                       pattern_def_seq = NULL;
7034                       gsi_next (&si);
7035                     }
7036                   continue;
7037                 }
7038             }
7039
7040           /* -------- vectorize statement ------------ */
7041           if (dump_enabled_p ())
7042             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
7043
7044           grouped_store = false;
7045           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
7046           if (is_store)
7047             {
7048               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
7049                 {
7050                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
7051                      interleaving chain was completed - free all the stores in
7052                      the chain.  */
7053                   gsi_next (&si);
7054                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
7055                 }
7056               else
7057                 {
7058                   /* Free the attached stmt_vec_info and remove the stmt.  */
7059                   gimple *store = gsi_stmt (si);
7060                   free_stmt_vec_info (store);
7061                   unlink_stmt_vdef (store);
7062                   gsi_remove (&si, true);
7063                   release_defs (store);
7064                 }
7065
7066               /* Stores can only appear at the end of pattern statements.  */
7067               gcc_assert (!transform_pattern_stmt);
7068               pattern_def_seq = NULL;
7069             }
7070           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7071             {
7072               pattern_def_seq = NULL;
7073               gsi_next (&si);
7074             }
7075         }                       /* stmts in BB */
7076     }                           /* BBs in loop */
7077
7078   slpeel_make_loop_iterate_ntimes (loop, ratio);
7079
7080   /* Reduce loop iterations by the vectorization factor.  */
7081   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
7082                       expected_iterations / vectorization_factor);
7083   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
7084     {
7085       if (loop->nb_iterations_upper_bound != 0)
7086         loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
7087       if (loop->nb_iterations_likely_upper_bound != 0)
7088         loop->nb_iterations_likely_upper_bound
7089            = loop->nb_iterations_likely_upper_bound - 1;
7090     }
7091   loop->nb_iterations_upper_bound
7092     = wi::udiv_floor (loop->nb_iterations_upper_bound + 1,
7093                       vectorization_factor) - 1;
7094   loop->nb_iterations_likely_upper_bound
7095     = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + 1,
7096                       vectorization_factor) - 1;
7097
7098   if (loop->any_estimate)
7099     {
7100       loop->nb_iterations_estimate
7101         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
7102        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
7103            && loop->nb_iterations_estimate != 0)
7104          loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
7105     }
7106
7107   if (dump_enabled_p ())
7108     {
7109       dump_printf_loc (MSG_NOTE, vect_location,
7110                        "LOOP VECTORIZED\n");
7111       if (loop->inner)
7112         dump_printf_loc (MSG_NOTE, vect_location,
7113                          "OUTER LOOP VECTORIZED\n");
7114       dump_printf (MSG_NOTE, "\n");
7115     }
7116
7117   /* Free SLP instances here because otherwise stmt reference counting
7118      won't work.  */
7119   slp_instance instance;
7120   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7121     vect_free_slp_instance (instance);
7122   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7123   /* Clear-up safelen field since its value is invalid after vectorization
7124      since vectorized loop can have loop-carried dependencies.  */
7125   loop->safelen = 0;
7126 }
7127
7128 /* The code below is trying to perform simple optimization - revert
7129    if-conversion for masked stores, i.e. if the mask of a store is zero
7130    do not perform it and all stored value producers also if possible.
7131    For example,
7132      for (i=0; i<n; i++)
7133        if (c[i])
7134         {
7135           p1[i] += 1;
7136           p2[i] = p3[i] +2;
7137         }
7138    this transformation will produce the following semi-hammock:
7139
7140    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7141      {
7142        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7143        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7144        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7145        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7146        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7147        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7148      }
7149 */
7150
7151 void
7152 optimize_mask_stores (struct loop *loop)
7153 {
7154   basic_block *bbs = get_loop_body (loop);
7155   unsigned nbbs = loop->num_nodes;
7156   unsigned i;
7157   basic_block bb;
7158   gimple_stmt_iterator gsi;
7159   gimple *stmt;
7160   auto_vec<gimple *> worklist;
7161
7162   vect_location = find_loop_location (loop);
7163   /* Pick up all masked stores in loop if any.  */
7164   for (i = 0; i < nbbs; i++)
7165     {
7166       bb = bbs[i];
7167       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7168            gsi_next (&gsi))
7169         {
7170           stmt = gsi_stmt (gsi);
7171           if (is_gimple_call (stmt)
7172               && gimple_call_internal_p (stmt)
7173               && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
7174             worklist.safe_push (stmt);
7175         }
7176     }
7177
7178   free (bbs);
7179   if (worklist.is_empty ())
7180     return;
7181
7182   /* Loop has masked stores.  */
7183   while (!worklist.is_empty ())
7184     {
7185       gimple *last, *last_store;
7186       edge e, efalse;
7187       tree mask;
7188       basic_block store_bb, join_bb;
7189       gimple_stmt_iterator gsi_to;
7190       tree vdef, new_vdef;
7191       gphi *phi;
7192       tree vectype;
7193       tree zero;
7194
7195       last = worklist.pop ();
7196       mask = gimple_call_arg (last, 2);
7197       bb = gimple_bb (last);
7198       /* Create new bb.  */
7199       e = split_block (bb, last);
7200       join_bb = e->dest;
7201       store_bb = create_empty_bb (bb);
7202       add_bb_to_loop (store_bb, loop);
7203       e->flags = EDGE_TRUE_VALUE;
7204       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7205       /* Put STORE_BB to likely part.  */
7206       efalse->probability = PROB_UNLIKELY;
7207       store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
7208       make_edge (store_bb, join_bb, EDGE_FALLTHRU);
7209       if (dom_info_available_p (CDI_DOMINATORS))
7210         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7211       if (dump_enabled_p ())
7212         dump_printf_loc (MSG_NOTE, vect_location,
7213                          "Create new block %d to sink mask stores.",
7214                          store_bb->index);
7215       /* Create vector comparison with boolean result.  */
7216       vectype = TREE_TYPE (mask);
7217       zero = build_zero_cst (vectype);
7218       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7219       gsi = gsi_last_bb (bb);
7220       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7221       /* Create new PHI node for vdef of the last masked store:
7222          .MEM_2 = VDEF <.MEM_1>
7223          will be converted to
7224          .MEM.3 = VDEF <.MEM_1>
7225          and new PHI node will be created in join bb
7226          .MEM_2 = PHI <.MEM_1, .MEM_3>
7227       */
7228       vdef = gimple_vdef (last);
7229       new_vdef = make_ssa_name (gimple_vop (cfun), last);
7230       gimple_set_vdef (last, new_vdef);
7231       phi = create_phi_node (vdef, join_bb);
7232       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7233
7234       /* Put all masked stores with the same mask to STORE_BB if possible.  */
7235       while (true)
7236         {
7237           gimple_stmt_iterator gsi_from;
7238           gimple *stmt1 = NULL;
7239
7240           /* Move masked store to STORE_BB.  */
7241           last_store = last;
7242           gsi = gsi_for_stmt (last);
7243           gsi_from = gsi;
7244           /* Shift GSI to the previous stmt for further traversal.  */
7245           gsi_prev (&gsi);
7246           gsi_to = gsi_start_bb (store_bb);
7247           gsi_move_before (&gsi_from, &gsi_to);
7248           /* Setup GSI_TO to the non-empty block start.  */
7249           gsi_to = gsi_start_bb (store_bb);
7250           if (dump_enabled_p ())
7251             {
7252               dump_printf_loc (MSG_NOTE, vect_location,
7253                                "Move stmt to created bb\n");
7254               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7255             }
7256           /* Move all stored value producers if possible.  */
7257           while (!gsi_end_p (gsi))
7258             {
7259               tree lhs;
7260               imm_use_iterator imm_iter;
7261               use_operand_p use_p;
7262               bool res;
7263
7264               /* Skip debug statements.  */
7265               if (is_gimple_debug (gsi_stmt (gsi)))
7266                 {
7267                   gsi_prev (&gsi);
7268                   continue;
7269                 }
7270               stmt1 = gsi_stmt (gsi);
7271               /* Do not consider statements writing to memory or having
7272                  volatile operand.  */
7273               if (gimple_vdef (stmt1)
7274                   || gimple_has_volatile_ops (stmt1))
7275                 break;
7276               gsi_from = gsi;
7277               gsi_prev (&gsi);
7278               lhs = gimple_get_lhs (stmt1);
7279               if (!lhs)
7280                 break;
7281
7282               /* LHS of vectorized stmt must be SSA_NAME.  */
7283               if (TREE_CODE (lhs) != SSA_NAME)
7284                 break;
7285
7286               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7287                 {
7288                   /* Remove dead scalar statement.  */
7289                   if (has_zero_uses (lhs))
7290                     {
7291                       gsi_remove (&gsi_from, true);
7292                       continue;
7293                     }
7294                 }
7295
7296               /* Check that LHS does not have uses outside of STORE_BB.  */
7297               res = true;
7298               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7299                 {
7300                   gimple *use_stmt;
7301                   use_stmt = USE_STMT (use_p);
7302                   if (is_gimple_debug (use_stmt))
7303                     continue;
7304                   if (gimple_bb (use_stmt) != store_bb)
7305                     {
7306                       res = false;
7307                       break;
7308                     }
7309                 }
7310               if (!res)
7311                 break;
7312
7313               if (gimple_vuse (stmt1)
7314                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
7315                 break;
7316
7317               /* Can move STMT1 to STORE_BB.  */
7318               if (dump_enabled_p ())
7319                 {
7320                   dump_printf_loc (MSG_NOTE, vect_location,
7321                                    "Move stmt to created bb\n");
7322                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7323                 }
7324               gsi_move_before (&gsi_from, &gsi_to);
7325               /* Shift GSI_TO for further insertion.  */
7326               gsi_prev (&gsi_to);
7327             }
7328           /* Put other masked stores with the same mask to STORE_BB.  */
7329           if (worklist.is_empty ()
7330               || gimple_call_arg (worklist.last (), 2) != mask
7331               || worklist.last () != stmt1)
7332             break;
7333           last = worklist.pop ();
7334         }
7335       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
7336     }
7337 }