gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2016 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "backend.h"
  26 #include "target.h"
  27 #include "rtl.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "cfghooks.h"
  31 #include "tree-pass.h"
  32 #include "ssa.h"
  33 #include "optabs-tree.h"
  34 #include "diagnostic-core.h"
  35 #include "fold-const.h"
  36 #include "stor-layout.h"
  37 #include "cfganal.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-ssa-loop.h"
  45 #include "cfgloop.h"
  46 #include "params.h"
  47 #include "tree-scalar-evolution.h"
  48 #include "tree-vectorizer.h"
  49 #include "gimple-fold.h"
  50 #include "cgraph.h"
  51 #include "tree-cfg.h"
  52
  53 /* Loop Vectorization Pass.
  54
  55    This pass tries to vectorize loops.
  56
  57    For example, the vectorizer transforms the following simple loop:
  58
  59         short a[N]; short b[N]; short c[N]; int i;
  60
  61         for (i=0; i<N; i++){
  62           a[i] = b[i] + c[i];
  63         }
  64
  65    as if it was manually vectorized by rewriting the source code into:
  66
  67         typedef int __attribute__((mode(V8HI))) v8hi;
  68         short a[N];  short b[N]; short c[N];   int i;
  69         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  70         v8hi va, vb, vc;
  71
  72         for (i=0; i<N/8; i++){
  73           vb = pb[i];
  74           vc = pc[i];
  75           va = vb + vc;
  76           pa[i] = va;
  77         }
  78
  79         The main entry to this pass is vectorize_loops(), in which
  80    the vectorizer applies a set of analyses on a given set of loops,
  81    followed by the actual vectorization transformation for the loops that
  82    had successfully passed the analysis phase.
  83         Throughout this pass we make a distinction between two types of
  84    data: scalars (which are represented by SSA_NAMES), and memory references
  85    ("data-refs").  These two types of data require different handling both
  86    during analysis and transformation.  The types of data-refs that the
  87    vectorizer currently supports are ARRAY_REFS whose base is an array DECL
  88    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  89    accesses are required to have a simple (consecutive) access pattern.
  90
  91    Analysis phase:
  92    ===============
  93         The driver for the analysis phase is vect_analyze_loop().
  94    It applies a set of analyses, some of which rely on the scalar evolution
  95    analyzer (scev) developed by Sebastian Pop.
  96
  97         During the analysis phase the vectorizer records some information
  98    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
  99    loop, as well as general information about the loop as a whole, which is
 100    recorded in a "loop_vec_info" struct attached to each loop.
 101
 102    Transformation phase:
 103    =====================
 104         The loop transformation phase scans all the stmts in the loop, and
 105    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 106    the loop that needs to be vectorized.  It inserts the vector code sequence
 107    just before the scalar stmt S, and records a pointer to the vector code
 108    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 109    attached to S).  This pointer will be used for the vectorization of following
 110    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 111    otherwise, we rely on dead code elimination for removing it.
 112
 113         For example, say stmt S1 was vectorized into stmt VS1:
 114
 115    VS1: vb = px[i];
 116    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 117    S2:  a = b;
 118
 119    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 120    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 121    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 122    resulting sequence would be:
 123
 124    VS1: vb = px[i];
 125    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 126    VS2: va = vb;
 127    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 128
 129         Operands that are not SSA_NAMEs are data-refs that appear in
 130    load/store operations (like 'x[i]' in S1), and are handled differently.
 131
 132    Target modeling:
 133    =================
 134         Currently the only target specific information that is used is the
 135    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 136    Targets that can support different sizes of vectors, for now will need
 137    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 138    flexibility will be added in the future.
 139
 140         Since we only vectorize operations whose vector form can be
 141    expressed using existing tree codes, to verify that an operation is
 142    supported, the vectorizer checks the relevant optab at the relevant
 143    machine_mode (e.g., optab_handler (add_optab, V8HImode)).  If
 144    the value found is CODE_FOR_nothing, then there's no target support, and
 145    we can't vectorize the stmt.
 146
 147    For additional information on this project see:
 148    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 149 */
 150
 151 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 152
 153 /* Function vect_determine_vectorization_factor
 154
 155    Determine the vectorization factor (VF).  VF is the number of data elements
 156    that are operated upon in parallel in a single iteration of the vectorized
 157    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 158    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 159    elements can fit in a single vector register.
 160
 161    Every relevant or live phi and stmt in the loop of LOOP_VINFO is examined;
 162    VF becomes the largest number of vector elements found and is stored in
 163    LOOP_VINFO_VECT_FACTOR.  Return false if the loop cannot be handled, e.g. a
 164    scalar type has no vectype, a stmt mixes vector sizes, or VF ends up <= 1.
 165
 166    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 167    original loop:
 168         for (i=0; i<N; i++){
 169           a[i] = b[i] + c[i];
 170         }
 171
 172    vectorized loop:
 173         for (i=0; i<N; i+=VF){
 174           a[i:VF] = b[i:VF] + c[i:VF];
 175         }
 176 */
 177
 178 static bool
 179 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 180 {
 181   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 182   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 183   unsigned nbbs = loop->num_nodes;
 184   unsigned int vectorization_factor = 0;
 185   tree scalar_type;
 186   gphi *phi;
 187   tree vectype;
 188   unsigned int nunits;
 189   stmt_vec_info stmt_info;
 190   unsigned i;
 191   HOST_WIDE_INT dummy;
 192   gimple *stmt, *pattern_stmt = NULL;
 193   gimple_seq pattern_def_seq = NULL;
 194   gimple_stmt_iterator pattern_def_si = gsi_none ();
 195   bool analyze_pattern_stmt = false;
 196   bool bool_result;
 197   auto_vec<stmt_vec_info> mask_producers;
 198
 199   if (dump_enabled_p ())
 200     dump_printf_loc (MSG_NOTE, vect_location,
 201                      "=== vect_determine_vectorization_factor ===\n");
 202
       /* Walk the blocks of the loop; in each block look at the phis first and
          then at the stmts.  */
 203   for (i = 0; i < nbbs; i++)
 204     {
 205       basic_block bb = bbs[i];
 206
 207       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 208            gsi_next (&si))
 209         {
 210           phi = si.phi ();
 211           stmt_info = vinfo_for_stmt (phi);
 212           if (dump_enabled_p ())
 213             {
 214               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 215               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 216             }
 217
 218           gcc_assert (stmt_info);
 219
 220           if (STMT_VINFO_RELEVANT_P (stmt_info)
 221               || STMT_VINFO_LIVE_P (stmt_info))
 222             {
 223               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 224               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 225
 226               if (dump_enabled_p ())
 227                 {
 228                   dump_printf_loc (MSG_NOTE, vect_location,
 229                                    "get vectype for scalar type:  ");
 230                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 231                   dump_printf (MSG_NOTE, "\n");
 232                 }
 233
 234               vectype = get_vectype_for_scalar_type (scalar_type);
 235               if (!vectype)
 236                 {
 237                   if (dump_enabled_p ())
 238                     {
 239                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 240                                        "not vectorized: unsupported "
 241                                        "data-type ");
 242                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 243                                          scalar_type);
 244                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 245                     }
 246                   return false;
 247                 }
 248               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 249
 250               if (dump_enabled_p ())
 251                 {
 252                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 253                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 254                   dump_printf (MSG_NOTE, "\n");
 255                 }
 256
 257               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 258               if (dump_enabled_p ())
 259                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 260                                  nunits);
 261
 262               if (!vectorization_factor
 263                   || (nunits > vectorization_factor))
 264                 vectorization_factor = nunits;
 265             }
 266         }
 267
           /* SI is advanced explicitly in the body: the pattern stmt related to
              a stmt and the relevant stmts of its pattern def sequence are
              examined before SI moves on to the next stmt.  */
 268       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 269            !gsi_end_p (si) || analyze_pattern_stmt;)
 270         {
 271           tree vf_vectype;
 272
 273           if (analyze_pattern_stmt)
 274             stmt = pattern_stmt;
 275           else
 276             stmt = gsi_stmt (si);
 277
 278           stmt_info = vinfo_for_stmt (stmt);
 279
 280           if (dump_enabled_p ())
 281             {
 282               dump_printf_loc (MSG_NOTE, vect_location,
 283                                "==> examining statement: ");
 284               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 285             }
 286
 287           gcc_assert (stmt_info);
 288
 289           /* Skip stmts which do not need to be vectorized.  */
 290           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 291                && !STMT_VINFO_LIVE_P (stmt_info))
 292               || gimple_clobber_p (stmt))
 293             {
 294               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 295                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 296                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 297                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 298                 {
 299                   stmt = pattern_stmt;
 300                   stmt_info = vinfo_for_stmt (pattern_stmt);
 301                   if (dump_enabled_p ())
 302                     {
 303                       dump_printf_loc (MSG_NOTE, vect_location,
 304                                        "==> examining pattern statement: ");
 305                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 306                     }
 307                 }
 308               else
 309                 {
 310                   if (dump_enabled_p ())
 311                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 312                   gsi_next (&si);
 313                   continue;
 314                 }
 315             }
 316           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 317                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 318                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 319                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 320             analyze_pattern_stmt = true;
 321
 322           /* If a pattern statement has def stmts, analyze them too.  */
 323           if (is_pattern_stmt_p (stmt_info))
 324             {
 325               if (pattern_def_seq == NULL)
 326                 {
 327                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 328                   pattern_def_si = gsi_start (pattern_def_seq);
 329                 }
 330               else if (!gsi_end_p (pattern_def_si))
 331                 gsi_next (&pattern_def_si);
 332               if (pattern_def_seq != NULL)
 333                 {
 334                   gimple *pattern_def_stmt = NULL;
 335                   stmt_vec_info pattern_def_stmt_info = NULL;
 336
 337                   while (!gsi_end_p (pattern_def_si))
 338                     {
 339                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 340                       pattern_def_stmt_info
 341                         = vinfo_for_stmt (pattern_def_stmt);
 342                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 343                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 344                         break;
 345                       gsi_next (&pattern_def_si);
 346                     }
 347
 348                   if (!gsi_end_p (pattern_def_si))
 349                     {
 350                       if (dump_enabled_p ())
 351                         {
 352                           dump_printf_loc (MSG_NOTE, vect_location,
 353                                            "==> examining pattern def stmt: ");
 354                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 355                                             pattern_def_stmt, 0);
 356                         }
 357
 358                       stmt = pattern_def_stmt;
 359                       stmt_info = pattern_def_stmt_info;
 360                     }
 361                   else
 362                     {
 363                       pattern_def_si = gsi_none ();
 364                       analyze_pattern_stmt = false;
 365                     }
 366                 }
 367               else
 368                 analyze_pattern_stmt = false;
 369             }
 370
 371           if (gimple_get_lhs (stmt) == NULL_TREE
 372               /* MASK_STORE has no lhs, but is ok.  */
 373               && (!is_gimple_call (stmt)
 374                   || !gimple_call_internal_p (stmt)
 375                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 376             {
 377               if (is_gimple_call (stmt))
 378                 {
 379                   /* Ignore calls with no lhs.  These must be calls to
 380                      #pragma omp simd functions, and what vectorization factor
 381                      it really needs can't be determined until
 382                      vectorizable_simd_clone_call.  */
 383                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 384                     {
 385                       pattern_def_seq = NULL;
 386                       gsi_next (&si);
 387                     }
 388                   continue;
 389                 }
 390               if (dump_enabled_p ())
 391                 {
 392                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 393                                    "not vectorized: irregular stmt.");
 394                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 395                                     0);
 396                 }
 397               return false;
 398             }
 399
 400           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 401             {
 402               if (dump_enabled_p ())
 403                 {
 404                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 405                                    "not vectorized: vector stmt in loop:");
 406                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 407                 }
 408               return false;
 409             }
 410
 411           bool_result = false;
 412
 413           if (STMT_VINFO_VECTYPE (stmt_info))
 414             {
 415               /* The only case when a vectype had been already set is for stmts
 416                  that contain a dataref, or for "pattern-stmts" (stmts
 417                  generated by the vectorizer to represent/replace a certain
 418                  idiom), including stmts of a pattern def sequence.  */
 419               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 420                           || is_pattern_stmt_p (stmt_info)
 421                           || !gsi_end_p (pattern_def_si));
 422               vectype = STMT_VINFO_VECTYPE (stmt_info);
 423             }
 424           else
 425             {
 426               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 427               if (is_gimple_call (stmt)
 428                   && gimple_call_internal_p (stmt)
 429                   && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
 430                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 431               else
 432                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 433
 434               /* Bool ops don't participate in vectorization factor
 435                  computation.  For comparison use compared types to
 436                  compute a factor.  */
 437               if (TREE_CODE (scalar_type) == BOOLEAN_TYPE
 438                   && is_gimple_assign (stmt)
 439                   && gimple_assign_rhs_code (stmt) != COND_EXPR)
 440                 {
 441                   if (STMT_VINFO_RELEVANT_P (stmt_info)
 442                       || STMT_VINFO_LIVE_P (stmt_info))
 443                     mask_producers.safe_push (stmt_info);
 444                   bool_result = true;
 445
 446                   if (gimple_code (stmt) == GIMPLE_ASSIGN
 447                       && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt))
 448                          == tcc_comparison
 449                       && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt)))
 450                          != BOOLEAN_TYPE)
 451                     scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 452                   else
 453                     {
 454                       if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 455                         {
 456                           pattern_def_seq = NULL;
 457                           gsi_next (&si);
 458                         }
 459                       continue;
 460                     }
 461                 }
 462
 463               if (dump_enabled_p ())
 464                 {
 465                   dump_printf_loc (MSG_NOTE, vect_location,
 466                                    "get vectype for scalar type:  ");
 467                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 468                   dump_printf (MSG_NOTE, "\n");
 469                 }
 470               vectype = get_vectype_for_scalar_type (scalar_type);
 471               if (!vectype)
 472                 {
 473                   if (dump_enabled_p ())
 474                     {
 475                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 476                                        "not vectorized: unsupported "
 477                                        "data-type ");
 478                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 479                                          scalar_type);
 480                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 481                     }
 482                   return false;
 483                 }
 484
                   /* For a boolean result VECTYPE is that of the compared
                      operands; the stmt's own vectype is only set from
                      mask_producers below.  */
 485               if (!bool_result)
 486                 STMT_VINFO_VECTYPE (stmt_info) = vectype;
 487
 488               if (dump_enabled_p ())
 489                 {
 490                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 491                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 492                   dump_printf (MSG_NOTE, "\n");
 493                 }
 494             }
 495
 496           /* Don't try to compute VF out of scalar types if the stmt
 497              produces a boolean vector.  Use the result vectype instead.  */
 498           if (VECTOR_BOOLEAN_TYPE_P (vectype))
 499             vf_vectype = vectype;
 500           else
 501             {
 502               /* The vectorization factor is according to the smallest
 503                  scalar type (or the largest vector size, but we only
 504                  support one vector size per loop).  */
 505               if (!bool_result)
 506                 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 507                                                              &dummy);
 508               if (dump_enabled_p ())
 509                 {
 510                   dump_printf_loc (MSG_NOTE, vect_location,
 511                                    "get vectype for scalar type:  ");
 512                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 513                   dump_printf (MSG_NOTE, "\n");
 514                 }
 515               vf_vectype = get_vectype_for_scalar_type (scalar_type);
 516             }
 517           if (!vf_vectype)
 518             {
 519               if (dump_enabled_p ())
 520                 {
 521                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 522                                    "not vectorized: unsupported data-type ");
 523                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 524                                      scalar_type);
 525                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 526                 }
 527               return false;
 528             }
 529
 530           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 531                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 532             {
 533               if (dump_enabled_p ())
 534                 {
 535                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 536                                    "not vectorized: different sized vector "
 537                                    "types in statement, ");
 538                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 539                                      vectype);
 540                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 541                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 542                                      vf_vectype);
 543                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 544                 }
 545               return false;
 546             }
 547
 548           if (dump_enabled_p ())
 549             {
 550               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 551               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 552               dump_printf (MSG_NOTE, "\n");
 553             }
 554
 555           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 556           if (dump_enabled_p ())
 557             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 558           if (!vectorization_factor
 559               || (nunits > vectorization_factor))
 560             vectorization_factor = nunits;
 561
               /* Move on only when no pattern stmt or pattern def stmt is still
                  pending for the current stmt.  */
 562           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 563             {
 564               pattern_def_seq = NULL;
 565               gsi_next (&si);
 566             }
 567         }
 568     }
 569
 570   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 571   if (dump_enabled_p ())
 572     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 573                      vectorization_factor);
 574   if (vectorization_factor <= 1)
 575     {
 576       if (dump_enabled_p ())
 577         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 578                          "not vectorized: unsupported data-type\n");
 579       return false;
 580     }
 581   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 582
       /* Finally give each recorded boolean-producing stmt its vectype: a mask
          type for the compared scalar type if the stmt compares non-boolean
          operands, otherwise a type deduced from the vectypes of its SSA
          operands.  */
 583   for (i = 0; i < mask_producers.length (); i++)
 584     {
 585       tree mask_type = NULL;
 586
 587       stmt = STMT_VINFO_STMT (mask_producers[i]);
 588
 589       if (gimple_code (stmt) == GIMPLE_ASSIGN
 590           && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison
 591           && TREE_CODE (TREE_TYPE (gimple_assign_rhs1 (stmt))) != BOOLEAN_TYPE)
 592         {
 593           scalar_type = TREE_TYPE (gimple_assign_rhs1 (stmt));
 594           mask_type = get_mask_type_for_scalar_type (scalar_type);
 595
 596           if (!mask_type)
 597             {
 598               if (dump_enabled_p ())
 599                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 600                                  "not vectorized: unsupported mask\n");
 601               return false;
 602             }
 603         }
 604       else
 605         {
 606           tree rhs;
 607           ssa_op_iter iter;
 608           gimple *def_stmt;
 609           enum vect_def_type dt;
 610
 611           FOR_EACH_SSA_TREE_OPERAND (rhs, stmt, iter, SSA_OP_USE)
 612             {
 613               if (!vect_is_simple_use (rhs, mask_producers[i]->vinfo,
 614                                        &def_stmt, &dt, &vectype))
 615                 {
 616                   if (dump_enabled_p ())
 617                     {
 618                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 619                                        "not vectorized: can't compute mask type "
 620                                        "for statement, ");
 621                       dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 622                                         0);
 623                     }
 624                   return false;
 625                 }
 626
 627               /* No vectype probably means external definition.
 628                  Allow it in case there is another operand which
 629                  allows determining the mask type.  */
 630               if (!vectype)
 631                 continue;
 632
 633               if (!mask_type)
 634                 mask_type = vectype;
 635               else if (TYPE_VECTOR_SUBPARTS (mask_type)
 636                        != TYPE_VECTOR_SUBPARTS (vectype))
 637                 {
 638                   if (dump_enabled_p ())
 639                     {
 640                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 641                                        "not vectorized: different sized masks "
 642                                        "types in statement, ");
 643                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 644                                          mask_type);
 645                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 646                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 647                                          vectype);
 648                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 649                     }
 650                   return false;
 651                 }
 652               else if (VECTOR_BOOLEAN_TYPE_P (mask_type)
 653                        != VECTOR_BOOLEAN_TYPE_P (vectype))
 654                 {
 655                   if (dump_enabled_p ())
 656                     {
 657                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 658                                        "not vectorized: mixed mask and "
 659                                        "nonmask vector types in statement, ");
 660                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 661                                          mask_type);
 662                       dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 663                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 664                                          vectype);
 665                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 666                     }
 667                   return false;
 668                 }
 669             }
 670
 671           /* We may compare boolean values loaded as vectors of integers.
 672              Fix mask_type in such a case.  */
 673           if (mask_type
 674               && !VECTOR_BOOLEAN_TYPE_P (mask_type)
 675               && gimple_code (stmt) == GIMPLE_ASSIGN
 676               && TREE_CODE_CLASS (gimple_assign_rhs_code (stmt)) == tcc_comparison)
 677             mask_type = build_same_sized_truth_vector_type (mask_type);
 678         }
 679
 680       /* No mask_type should mean loop invariant predicate.
 681          This is probably a subject for optimization in
 682          if-conversion.  */
 683       if (!mask_type)
 684         {
 685           if (dump_enabled_p ())
 686             {
 687               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 688                                "not vectorized: can't compute mask type "
 689                                "for statement, ");
 690               dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 691                                 0);
 692             }
 693           return false;
 694         }
 695
 696       STMT_VINFO_VECTYPE (mask_producers[i]) = mask_type;
 697     }
 698
 699   return true;
 700 }
 701
 702
 703 /* Function vect_is_simple_iv_evolution.
 704
 705    Decide whether ACCESS_FN has a "simple" evolution in loop number LOOP_NB.
 706    *INIT and *STEP receive the initial condition and the step as soon as an
 707    evolution part that is not itself a chrec has been found, i.e. even if the
 708    step is rejected afterwards.  The step is accepted when it is an
 709    INTEGER_CST, a REAL_CST with flag_associative_math set, or an SSA_NAME
 710    whose defining stmt is not in a block inside the loop and whose type is
 711    integral (or scalar float with flag_associative_math set).  */
 712
 713 static bool
 714 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 715                              tree * step)
 716 {
 717   tree stride = evolution_part_in_loop_num (access_fn, loop_nb);
 718
 719   /* Neither a missing evolution nor one that is itself a chrec (a polynomial
 720      of degree >= 2) counts as "simple".  */
 721   if (stride == NULL_TREE || tree_is_chrec (stride))
 722     return false;
 723
 724   tree start = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 725
 726   if (dump_enabled_p ())
 727     {
 728       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 729       dump_generic_expr (MSG_NOTE, TDF_SLIM, stride);
 730       dump_printf (MSG_NOTE, ",  init: ");
 731       dump_generic_expr (MSG_NOTE, TDF_SLIM, start);
 732       dump_printf (MSG_NOTE, "\n");
 733     }
 734
 735   *init = start;
 736   *step = stride;
 737
 738   bool step_known;
 739   if (TREE_CODE (stride) == SSA_NAME)
 740     {
 741       /* A symbolic step must not be defined in a block inside the loop and
 742          must have a type we can handle.  */
 743       basic_block def_bb = gimple_bb (SSA_NAME_DEF_STMT (stride));
 744       tree step_type = TREE_TYPE (stride);
 745       step_known
 746         = (!(def_bb
 747              && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), def_bb))
 748            && (INTEGRAL_TYPE_P (step_type)
 749                || (SCALAR_FLOAT_TYPE_P (step_type) && flag_associative_math)));
 750     }
 751   else
 752     step_known = (TREE_CODE (stride) == INTEGER_CST
 753                   || (TREE_CODE (stride) == REAL_CST && flag_associative_math));
 754
 755   if (!step_known && dump_enabled_p ())
 756     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 757                      "step unknown.\n");
 758   return step_known;
 759 }
 760
 761 /* Function vect_analyze_scalar_cycles_1.
 762
 763    Examine the cross iteration def-use cycles of scalar variables
 764    in LOOP.  LOOP_VINFO represents the loop that is now being
 765    considered for vectorization (can be LOOP, or an outer-loop
 766    enclosing LOOP).  */
 767
 768 static void
 769 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 770 {
 771   basic_block bb = loop->header;
 772   tree init, step;
 773   auto_vec<gimple *, 64> worklist;
 774   gphi_iterator gsi;
 775   bool double_reduc;
 776
 777   if (dump_enabled_p ())
 778     dump_printf_loc (MSG_NOTE, vect_location,
 779                      "=== vect_analyze_scalar_cycles ===\n");
 780
 781   /* First - identify all inductions.  Reduction detection assumes that all the
 782      inductions have been identified, therefore, this order must not be
 783      changed.  */
 784   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 785     {
 786       gphi *phi = gsi.phi ();
 787       tree access_fn = NULL;
 788       tree def = PHI_RESULT (phi);
 789       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 790
 791       if (dump_enabled_p ())
 792         {
 793           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 794           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 795         }
 796
 797       /* Skip virtual phi's.  The data dependences that are associated with
 798          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 799       if (virtual_operand_p (def))
 800         continue;
 801
 802       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 803
 804       /* Analyze the evolution function.  */
 805       access_fn = analyze_scalar_evolution (loop, def);
 806       if (access_fn)
 807         {
 808           STRIP_NOPS (access_fn);
 809           if (dump_enabled_p ())
 810             {
 811               dump_printf_loc (MSG_NOTE, vect_location,
 812                                "Access function of PHI: ");
 813               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 814               dump_printf (MSG_NOTE, "\n");
 815             }
 816           STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 817             = initial_condition_in_loop_num (access_fn, loop->num);
 818           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 819             = evolution_part_in_loop_num (access_fn, loop->num);
 820         }
 821
 822       if (!access_fn
 823           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 824           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 825               && TREE_CODE (step) != INTEGER_CST))
 826         {
 827           worklist.safe_push (phi);
 828           continue;
 829         }
 830
 831       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo)
 832                   != NULL_TREE);
 833       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 834
 835       if (dump_enabled_p ())
 836         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 837       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 838     }
 839
 840
 841   /* Second - identify all reductions and nested cycles.  */
 842   while (worklist.length () > 0)
 843     {
 844       gimple *phi = worklist.pop ();
 845       tree def = PHI_RESULT (phi);
 846       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 847       gimple *reduc_stmt;
 848       bool nested_cycle;
 849
 850       if (dump_enabled_p ())
 851         {
 852           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 853           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 854         }
 855
 856       gcc_assert (!virtual_operand_p (def)
 857                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 858
 859       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 860       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 861                                                 &double_reduc, false);
 862       if (reduc_stmt)
 863         {
 864           if (double_reduc)
 865             {
 866               if (dump_enabled_p ())
 867                 dump_printf_loc (MSG_NOTE, vect_location,
 868                                  "Detected double reduction.\n");
 869
 870               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 871               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 872                                                     vect_double_reduction_def;
 873             }
 874           else
 875             {
 876               if (nested_cycle)
 877                 {
 878                   if (dump_enabled_p ())
 879                     dump_printf_loc (MSG_NOTE, vect_location,
 880                                      "Detected vectorizable nested cycle.\n");
 881
 882                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 883                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 884                                                              vect_nested_cycle;
 885                 }
 886               else
 887                 {
 888                   if (dump_enabled_p ())
 889                     dump_printf_loc (MSG_NOTE, vect_location,
 890                                      "Detected reduction.\n");
 891
 892                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 893                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 894                                                            vect_reduction_def;
 895                   /* Store the reduction cycles for possible vectorization in
 896                      loop-aware SLP.  */
 897                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 898                 }
 899             }
 900         }
 901       else
 902         if (dump_enabled_p ())
 903           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 904                            "Unknown def-use cycle pattern.\n");
 905     }
 906 }
 907
 908
 909 /* Function vect_analyze_scalar_cycles.
 910
 911    Examine the cross iteration def-use cycles of scalar variables, by
 912    analyzing the loop-header PHIs of scalar variables.  Classify each
 913    cycle as one of the following: invariant, induction, reduction, unknown.
 914    We do that for the loop represented by LOOP_VINFO, and also to its
 915    inner-loop, if exists.
 916    Examples for scalar cycles:
 917
 918    Example1: reduction:
 919
 920               loop1:
 921               for (i=0; i<N; i++)
 922                  sum += a[i];
 923
 924    Example2: induction:
 925
 926               loop2:
 927               for (i=0; i<N; i++)
 928                  a[i] = i;  */
 929
 930 static void
 931 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 932 {
 933   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 934
 935   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 936
 937   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 938      Reductions in such inner-loop therefore have different properties than
 939      the reductions in the nest that gets vectorized:
 940      1. When vectorized, they are executed in the same order as in the original
 941         scalar loop, so we can't change the order of computation when
 942         vectorizing them.
 943      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 944         current checks are too strict.  */
 945
 946   if (loop->inner)
 947     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 948 }
 949
 950 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 951
 952 static void
 953 vect_fixup_reduc_chain (gimple *stmt)
 954 {
 955   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 956   gimple *stmtp;
 957   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 958               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 959   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 960   do
 961     {
 962       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 963       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 964       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 965       if (stmt)
 966         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 967           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 968     }
 969   while (stmt);
 970   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 971 }
 972
 973 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 974
 975 static void
 976 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 977 {
 978   gimple *first;
 979   unsigned i;
 980
 981   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 982     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 983       {
 984         gimple *next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (first));
 985         while (next)
 986           {
 987             if (! STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (next)))
 988               break;
 989             next = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next));
 990           }
 991         /* If not all stmt in the chain are patterns try to handle
 992            the chain without patterns.  */
 993         if (! next)
 994           {
 995             vect_fixup_reduc_chain (first);
 996             LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 997               = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 998           }
 999       }
1000 }
1001
1002 /* Function vect_get_loop_niters.
1003
1004    Determine how many iterations the loop is executed and place it
1005    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
1006    in NUMBER_OF_ITERATIONSM1.  Place the condition under which the
1007    niter information holds in ASSUMPTIONS.
1008
1009    Return the loop exit condition.  */
1010
1011
1012 static gcond *
1013 vect_get_loop_niters (struct loop *loop, tree *assumptions,
1014                       tree *number_of_iterations, tree *number_of_iterationsm1)
1015 {
1016   edge exit = single_exit (loop);
1017   struct tree_niter_desc niter_desc;
1018   tree niter_assumptions, niter, may_be_zero;
1019   gcond *cond = get_loop_exit_condition (loop);
1020
1021   *assumptions = boolean_true_node;
1022   *number_of_iterationsm1 = chrec_dont_know;
1023   *number_of_iterations = chrec_dont_know;
1024   if (dump_enabled_p ())
1025     dump_printf_loc (MSG_NOTE, vect_location,
1026                      "=== get_loop_niters ===\n");
1027
1028   if (!exit)
1029     return cond;
1030
1031   niter = chrec_dont_know;
1032   may_be_zero = NULL_TREE;
1033   niter_assumptions = boolean_true_node;
1034   if (!number_of_iterations_exit_assumptions (loop, exit, &niter_desc, NULL)
1035       || chrec_contains_undetermined (niter_desc.niter))
1036     return cond;
1037
1038   niter_assumptions = niter_desc.assumptions;
1039   may_be_zero = niter_desc.may_be_zero;
1040   niter = niter_desc.niter;
1041
1042   if (may_be_zero && integer_zerop (may_be_zero))
1043     may_be_zero = NULL_TREE;
1044
1045   if (may_be_zero)
1046     {
1047       if (COMPARISON_CLASS_P (may_be_zero))
1048         {
1049           /* Try to combine may_be_zero with assumptions, this can simplify
1050              computation of niter expression.  */
1051           if (niter_assumptions && !integer_nonzerop (niter_assumptions))
1052             niter_assumptions = fold_build2 (TRUTH_AND_EXPR, boolean_type_node,
1053                                              niter_assumptions,
1054                                              fold_build1 (TRUTH_NOT_EXPR,
1055                                                           boolean_type_node,
1056                                                           may_be_zero));
1057           else
1058             niter = fold_build3 (COND_EXPR, TREE_TYPE (niter), may_be_zero,
1059                                  build_int_cst (TREE_TYPE (niter), 0), niter);
1060
1061           may_be_zero = NULL_TREE;
1062         }
1063       else if (integer_nonzerop (may_be_zero))
1064         {
1065           *number_of_iterationsm1 = build_int_cst (TREE_TYPE (niter), 0);
1066           *number_of_iterations = build_int_cst (TREE_TYPE (niter), 1);
1067           return cond;
1068         }
1069       else
1070         return cond;
1071     }
1072
1073   *assumptions = niter_assumptions;
1074   *number_of_iterationsm1 = niter;
1075
1076   /* We want the number of loop header executions which is the number
1077      of latch executions plus one.
1078      ???  For UINT_MAX latch executions this number overflows to zero
1079      for loops like do { n++; } while (n != 0);  */
1080   if (niter && !chrec_contains_undetermined (niter))
1081     niter = fold_build2 (PLUS_EXPR, TREE_TYPE (niter), unshare_expr (niter),
1082                           build_int_cst (TREE_TYPE (niter), 1));
1083   *number_of_iterations = niter;
1084
1085   return cond;
1086 }
1087
1088 /* Function bb_in_loop_p
1089
1090    Used as predicate for dfs order traversal of the loop bbs.  */
1091
1092 static bool
1093 bb_in_loop_p (const_basic_block bb, const void *data)
1094 {
1095   const struct loop *const loop = (const struct loop *)data;
1096   if (flow_bb_inside_loop_p (loop, bb))
1097     return true;
1098   return false;
1099 }
1100
1101
1102 /* Function new_loop_vec_info.
1103
1104    Create and initialize a new loop_vec_info struct for LOOP, as well as
1105    stmt_vec_info structs for all the stmts in LOOP.  */
1106
1107 static loop_vec_info
1108 new_loop_vec_info (struct loop *loop)
1109 {
1110   loop_vec_info res;
1111   basic_block *bbs;
1112   gimple_stmt_iterator si;
1113   unsigned int i, nbbs;
1114
1115   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
1116   res->kind = vec_info::loop;
1117   LOOP_VINFO_LOOP (res) = loop;
1118
1119   bbs = get_loop_body (loop);
1120
1121   /* Create/Update stmt_info for all stmts in the loop.  */
1122   for (i = 0; i < loop->num_nodes; i++)
1123     {
1124       basic_block bb = bbs[i];
1125
1126       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1127         {
1128           gimple *phi = gsi_stmt (si);
1129           gimple_set_uid (phi, 0);
1130           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
1131         }
1132
1133       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1134         {
1135           gimple *stmt = gsi_stmt (si);
1136           gimple_set_uid (stmt, 0);
1137           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
1138         }
1139     }
1140
1141   /* CHECKME: We want to visit all BBs before their successors (except for
1142      latch blocks, for which this assertion wouldn't hold).  In the simple
1143      case of the loop forms we allow, a dfs order of the BBs would the same
1144      as reversed postorder traversal, so we are safe.  */
1145
1146    free (bbs);
1147    bbs = XCNEWVEC (basic_block, loop->num_nodes);
1148    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
1149                               bbs, loop->num_nodes, loop);
1150    gcc_assert (nbbs == loop->num_nodes);
1151
1152   LOOP_VINFO_BBS (res) = bbs;
1153   LOOP_VINFO_NITERSM1 (res) = NULL;
1154   LOOP_VINFO_NITERS (res) = NULL;
1155   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
1156   LOOP_VINFO_NITERS_ASSUMPTIONS (res) = NULL;
1157   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
1158   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
1159   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
1160   LOOP_VINFO_VECT_FACTOR (res) = 0;
1161   LOOP_VINFO_LOOP_NEST (res) = vNULL;
1162   LOOP_VINFO_DATAREFS (res) = vNULL;
1163   LOOP_VINFO_DDRS (res) = vNULL;
1164   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
1165   LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
1166   LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
1167   LOOP_VINFO_GROUPED_STORES (res) = vNULL;
1168   LOOP_VINFO_REDUCTIONS (res) = vNULL;
1169   LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
1170   LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
1171   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1172   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1173   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1174   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1175   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1176
1177   return res;
1178 }
1179
1180
1181 /* Function destroy_loop_vec_info.
1182
1183    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1184    stmts in the loop.  */
1185
1186 void
1187 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1188 {
1189   struct loop *loop;
1190   basic_block *bbs;
1191   int nbbs;
1192   gimple_stmt_iterator si;
1193   int j;
1194   vec<slp_instance> slp_instances;
1195   slp_instance instance;
1196   bool swapped;
1197
1198   if (!loop_vinfo)
1199     return;
1200
1201   loop = LOOP_VINFO_LOOP (loop_vinfo);
1202
1203   bbs = LOOP_VINFO_BBS (loop_vinfo);
1204   nbbs = clean_stmts ? loop->num_nodes : 0;
1205   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1206
1207   for (j = 0; j < nbbs; j++)
1208     {
1209       basic_block bb = bbs[j];
1210       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1211         free_stmt_vec_info (gsi_stmt (si));
1212
1213       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1214         {
1215           gimple *stmt = gsi_stmt (si);
1216
1217           /* We may have broken canonical form by moving a constant
1218              into RHS1 of a commutative op.  Fix such occurrences.  */
1219           if (swapped && is_gimple_assign (stmt))
1220             {
1221               enum tree_code code = gimple_assign_rhs_code (stmt);
1222
1223               if ((code == PLUS_EXPR
1224                    || code == POINTER_PLUS_EXPR
1225                    || code == MULT_EXPR)
1226                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1227                 swap_ssa_operands (stmt,
1228                                    gimple_assign_rhs1_ptr (stmt),
1229                                    gimple_assign_rhs2_ptr (stmt));
1230             }
1231
1232           /* Free stmt_vec_info.  */
1233           free_stmt_vec_info (stmt);
1234           gsi_next (&si);
1235         }
1236     }
1237
1238   free (LOOP_VINFO_BBS (loop_vinfo));
1239   vect_destroy_datarefs (loop_vinfo);
1240   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1241   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1242   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1243   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
1244   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1245   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1246   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1247     vect_free_slp_instance (instance);
1248
1249   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1250   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1251   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1252   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1253
1254   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1255   loop_vinfo->scalar_cost_vec.release ();
1256
1257   free (loop_vinfo);
1258   loop->aux = NULL;
1259 }
1260
1261
1262 /* Calculate the cost of one scalar iteration of the loop.  */
1263 static void
1264 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1265 {
1266   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1267   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1268   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1269   int innerloop_iters, i;
1270
1271   /* Count statements in scalar loop.  Using this as scalar cost for a single
1272      iteration for now.
1273
1274      TODO: Add outer loop support.
1275
1276      TODO: Consider assigning different costs to different scalar
1277      statements.  */
1278
1279   /* FORNOW.  */
1280   innerloop_iters = 1;
1281   if (loop->inner)
1282     innerloop_iters = 50; /* FIXME */
1283
1284   for (i = 0; i < nbbs; i++)
1285     {
1286       gimple_stmt_iterator si;
1287       basic_block bb = bbs[i];
1288
1289       if (bb->loop_father == loop->inner)
1290         factor = innerloop_iters;
1291       else
1292         factor = 1;
1293
1294       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1295         {
1296           gimple *stmt = gsi_stmt (si);
1297           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1298
1299           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1300             continue;
1301
1302           /* Skip stmts that are not vectorized inside the loop.  */
1303           if (stmt_info
1304               && !STMT_VINFO_RELEVANT_P (stmt_info)
1305               && (!STMT_VINFO_LIVE_P (stmt_info)
1306                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1307               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1308             continue;
1309
1310           vect_cost_for_stmt kind;
1311           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1312             {
1313               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1314                kind = scalar_load;
1315              else
1316                kind = scalar_store;
1317             }
1318           else
1319             kind = scalar_stmt;
1320
1321           scalar_single_iter_cost
1322             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1323                                  factor, kind, NULL, 0, vect_prologue);
1324         }
1325     }
1326   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1327     = scalar_single_iter_cost;
1328 }
1329
1330
1331 /* Function vect_analyze_loop_form_1.
1332
1333    Verify that certain CFG restrictions hold, including:
1334    - the loop has a pre-header
1335    - the loop has a single entry and exit
1336    - the loop exit condition is simple enough
1337    - the number of iterations can be analyzed, i.e, a countable loop.  The
1338      niter could be analyzed under some assumptions.  */
1339
1340 bool
1341 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1342                           tree *assumptions, tree *number_of_iterationsm1,
1343                           tree *number_of_iterations, gcond **inner_loop_cond)
1344 {
1345   if (dump_enabled_p ())
1346     dump_printf_loc (MSG_NOTE, vect_location,
1347                      "=== vect_analyze_loop_form ===\n");
1348
1349   /* Different restrictions apply when we are considering an inner-most loop,
1350      vs. an outer (nested) loop.
1351      (FORNOW. May want to relax some of these restrictions in the future).  */
1352
1353   if (!loop->inner)
1354     {
1355       /* Inner-most loop.  We currently require that the number of BBs is
1356          exactly 2 (the header and latch).  Vectorizable inner-most loops
1357          look like this:
1358
1359                         (pre-header)
1360                            |
1361                           header <--------+
1362                            | |            |
1363                            | +--> latch --+
1364                            |
1365                         (exit-bb)  */
1366
1367       if (loop->num_nodes != 2)
1368         {
1369           if (dump_enabled_p ())
1370             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1371                              "not vectorized: control flow in loop.\n");
1372           return false;
1373         }
1374
1375       if (empty_block_p (loop->header))
1376         {
1377           if (dump_enabled_p ())
1378             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1379                              "not vectorized: empty loop.\n");
1380           return false;
1381         }
1382     }
1383   else
1384     {
1385       struct loop *innerloop = loop->inner;
1386       edge entryedge;
1387
1388       /* Nested loop. We currently require that the loop is doubly-nested,
1389          contains a single inner loop, and the number of BBs is exactly 5.
1390          Vectorizable outer-loops look like this:
1391
1392                         (pre-header)
1393                            |
1394                           header <---+
1395                            |         |
1396                           inner-loop |
1397                            |         |
1398                           tail ------+
1399                            |
1400                         (exit-bb)
1401
1402          The inner-loop has the properties expected of inner-most loops
1403          as described above.  */
1404
1405       if ((loop->inner)->inner || (loop->inner)->next)
1406         {
1407           if (dump_enabled_p ())
1408             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1409                              "not vectorized: multiple nested loops.\n");
1410           return false;
1411         }
1412
1413       if (loop->num_nodes != 5)
1414         {
1415           if (dump_enabled_p ())
1416             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1417                              "not vectorized: control flow in loop.\n");
1418           return false;
1419         }
1420
1421       entryedge = loop_preheader_edge (innerloop);
1422       if (entryedge->src != loop->header
1423           || !single_exit (innerloop)
1424           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1425         {
1426           if (dump_enabled_p ())
1427             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1428                              "not vectorized: unsupported outerloop form.\n");
1429           return false;
1430         }
1431
1432       /* Analyze the inner-loop.  */
1433       tree inner_niterm1, inner_niter, inner_assumptions;
1434       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1435                                       &inner_assumptions, &inner_niterm1,
1436                                       &inner_niter, NULL)
1437           /* Don't support analyzing niter under assumptions for inner
1438              loop.  */
1439           || !integer_onep (inner_assumptions))
1440         {
1441           if (dump_enabled_p ())
1442             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1443                              "not vectorized: Bad inner loop.\n");
1444           return false;
1445         }
1446
1447       if (!expr_invariant_in_loop_p (loop, inner_niter))
1448         {
1449           if (dump_enabled_p ())
1450             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1451                              "not vectorized: inner-loop count not"
1452                              " invariant.\n");
1453           return false;
1454         }
1455
1456       if (dump_enabled_p ())
1457         dump_printf_loc (MSG_NOTE, vect_location,
1458                          "Considering outer-loop vectorization.\n");
1459     }
1460
1461   if (!single_exit (loop)
1462       || EDGE_COUNT (loop->header->preds) != 2)
1463     {
1464       if (dump_enabled_p ())
1465         {
1466           if (!single_exit (loop))
1467             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1468                              "not vectorized: multiple exits.\n");
1469           else if (EDGE_COUNT (loop->header->preds) != 2)
1470             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1471                              "not vectorized: too many incoming edges.\n");
1472         }
1473       return false;
1474     }
1475
1476   /* We assume that the loop exit condition is at the end of the loop. i.e,
1477      that the loop is represented as a do-while (with a proper if-guard
1478      before the loop if needed), where the loop header contains all the
1479      executable statements, and the latch is empty.  */
1480   if (!empty_block_p (loop->latch)
1481       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1482     {
1483       if (dump_enabled_p ())
1484         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1485                          "not vectorized: latch block not empty.\n");
1486       return false;
1487     }
1488
1489   /* Make sure there exists a single-predecessor exit bb:  */
1490   if (!single_pred_p (single_exit (loop)->dest))
1491     {
1492       edge e = single_exit (loop);
1493       if (!(e->flags & EDGE_ABNORMAL))
1494         {
1495           split_loop_exit_edge (e);
1496           if (dump_enabled_p ())
1497             dump_printf (MSG_NOTE, "split exit edge.\n");
1498         }
1499       else
1500         {
1501           if (dump_enabled_p ())
1502             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1503                              "not vectorized: abnormal loop exit edge.\n");
1504           return false;
1505         }
1506     }
1507
1508   *loop_cond = vect_get_loop_niters (loop, assumptions, number_of_iterations,
1509                                      number_of_iterationsm1);
1510   if (!*loop_cond)
1511     {
1512       if (dump_enabled_p ())
1513         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1514                          "not vectorized: complicated exit condition.\n");
1515       return false;
1516     }
1517
1518   if (integer_zerop (*assumptions)
1519       || !*number_of_iterations
1520       || chrec_contains_undetermined (*number_of_iterations))
1521     {
1522       if (dump_enabled_p ())
1523         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1524                          "not vectorized: number of iterations cannot be "
1525                          "computed.\n");
1526       return false;
1527     }
1528
1529   if (integer_zerop (*number_of_iterations))
1530     {
1531       if (dump_enabled_p ())
1532         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533                          "not vectorized: number of iterations = 0.\n");
1534       return false;
1535     }
1536
1537   return true;
1538 }
1539
1540 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1541
1542 loop_vec_info
1543 vect_analyze_loop_form (struct loop *loop)
1544 {
1545   tree assumptions, number_of_iterations, number_of_iterationsm1;
1546   gcond *loop_cond, *inner_loop_cond = NULL;
1547
1548   if (! vect_analyze_loop_form_1 (loop, &loop_cond,
1549                                   &assumptions, &number_of_iterationsm1,
1550                                   &number_of_iterations, &inner_loop_cond))
1551     return NULL;
1552
1553   loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1554   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1555   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1556   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1557   if (!integer_onep (assumptions))
1558     {
1559       /* We consider to vectorize this loop by versioning it under
1560          some assumptions.  In order to do this, we need to clear
1561          existing information computed by scev and niter analyzer.  */
1562       scev_reset_htab ();
1563       free_numbers_of_iterations_estimates_loop (loop);
1564       /* Also set flag for this loop so that following scev and niter
1565          analysis are done under the assumptions.  */
1566       loop_constraint_set (loop, LOOP_C_FINITE);
1567       /* Also record the assumptions for versioning.  */
1568       LOOP_VINFO_NITERS_ASSUMPTIONS (loop_vinfo) = assumptions;
1569     }
1570
1571   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1572     {
1573       if (dump_enabled_p ())
1574         {
1575           dump_printf_loc (MSG_NOTE, vect_location,
1576                            "Symbolic number of iterations is ");
1577           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1578           dump_printf (MSG_NOTE, "\n");
1579         }
1580     }
1581
1582   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1583   if (inner_loop_cond)
1584     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1585       = loop_exit_ctrl_vec_info_type;
1586
1587   gcc_assert (!loop->aux);
1588   loop->aux = loop_vinfo;
1589   return loop_vinfo;
1590 }
1591
1592
1593
1594 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1595    statements update the vectorization factor.  */
1596
1597 static void
1598 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1599 {
1600   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1601   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1602   int nbbs = loop->num_nodes;
1603   unsigned int vectorization_factor;
1604   int i;
1605
1606   if (dump_enabled_p ())
1607     dump_printf_loc (MSG_NOTE, vect_location,
1608                      "=== vect_update_vf_for_slp ===\n");
1609
1610   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1611   gcc_assert (vectorization_factor != 0);
1612
1613   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1614      vectorization factor of the loop is the unrolling factor required by
1615      the SLP instances.  If that unrolling factor is 1, we say, that we
1616      perform pure SLP on loop - cross iteration parallelism is not
1617      exploited.  */
1618   bool only_slp_in_loop = true;
1619   for (i = 0; i < nbbs; i++)
1620     {
1621       basic_block bb = bbs[i];
1622       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1623            gsi_next (&si))
1624         {
1625           gimple *stmt = gsi_stmt (si);
1626           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1627           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1628               && STMT_VINFO_RELATED_STMT (stmt_info))
1629             {
1630               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1631               stmt_info = vinfo_for_stmt (stmt);
1632             }
1633           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1634                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1635               && !PURE_SLP_STMT (stmt_info))
1636             /* STMT needs both SLP and loop-based vectorization.  */
1637             only_slp_in_loop = false;
1638         }
1639     }
1640
1641   if (only_slp_in_loop)
1642     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1643   else
1644     vectorization_factor
1645       = least_common_multiple (vectorization_factor,
1646                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1647
1648   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1649   if (dump_enabled_p ())
1650     dump_printf_loc (MSG_NOTE, vect_location,
1651                      "Updating vectorization factor to %d\n",
1652                      vectorization_factor);
1653 }
1654
1655 /* Function vect_analyze_loop_operations.
1656
1657    Scan the loop stmts and make sure they are all vectorizable.

        For each basic block of the loop described by LOOP_VINFO the PHI
        nodes are checked first and then every non-clobber statement is
        passed to vect_analyze_stmt.  Return false as soon as one of them
        is rejected, or when nothing in the loop turned out to need
        vectorization; return true otherwise.  */
1658
1659 static bool
1660 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1661 {
1662   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1663   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1664   int nbbs = loop->num_nodes;
1665   int i;
1666   stmt_vec_info stmt_info;
1667   bool need_to_vectorize = false;
1668   bool ok;
1669
1670   if (dump_enabled_p ())
1671     dump_printf_loc (MSG_NOTE, vect_location,
1672                      "=== vect_analyze_loop_operations ===\n");
1673
1674   for (i = 0; i < nbbs; i++)
1675     {
1676       basic_block bb = bbs[i];
1677
           /* First pass over BB: its PHI nodes.  */
1678       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1679            gsi_next (&si))
1680         {
1681           gphi *phi = si.phi ();
1682           ok = true;
1683
1684           stmt_info = vinfo_for_stmt (phi);
1685           if (dump_enabled_p ())
1686             {
1687               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1688               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1689             }
               /* PHIs of virtual operands are not examined here.  */
1690           if (virtual_operand_p (gimple_phi_result (phi)))
1691             continue;
1692
1693           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1694              (i.e., a phi in the tail of the outer-loop).  */
1695           if (! is_loop_header_bb_p (bb))
1696             {
1697               /* FORNOW: we currently don't support the case that these phis
1698                  are not used in the outerloop (unless it is double reduction,
1699                  i.e., this phi is vect_reduction_def), cause this case
1700                  requires to actually do something here.  */
1701               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1702                    || STMT_VINFO_LIVE_P (stmt_info))
1703                   && STMT_VINFO_DEF_TYPE (stmt_info)
1704                      != vect_double_reduction_def)
1705                 {
1706                   if (dump_enabled_p ())
1707                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1708                                      "Unsupported loop-closed phi in "
1709                                      "outer-loop.\n");
1710                   return false;
1711                 }
1712
1713               /* If PHI is used in the outer loop, we check that its operand
1714                  is defined in the inner loop.  */
1715               if (STMT_VINFO_RELEVANT_P (stmt_info))
1716                 {
1717                   tree phi_op;
1718                   gimple *op_def_stmt;
1719
                       /* Only a PHI with exactly one argument, which must be
                          an SSA name, is accepted.  The rejections below do
                          not emit a dump message.  */
1720                   if (gimple_phi_num_args (phi) != 1)
1721                     return false;
1722
1723                   phi_op = PHI_ARG_DEF (phi, 0);
1724                   if (TREE_CODE (phi_op) != SSA_NAME)
1725                     return false;
1726
                       /* The argument must be defined by a real stmt that
                          lies inside LOOP and has a stmt_vec_info.  */
1727                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1728                   if (gimple_nop_p (op_def_stmt)
1729                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1730                       || !vinfo_for_stmt (op_def_stmt))
1731                     return false;
1732
                       /* That defining stmt must have been marked as used in
                          the outer loop, either directly or by a
                          reduction.  */
1733                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1734                         != vect_used_in_outer
1735                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1736                            != vect_used_in_outer_by_reduction)
1737                     return false;
1738                 }
1739
1740               continue;
1741             }
1742
               /* NOTE(review): STMT_INFO has already been dereferenced on the
                  loop-closed PHI path above by the time this assertion is
                  reached, so it only guards the loop-header PHI checks
                  below.  */
1743           gcc_assert (stmt_info);
1744
1745           if ((STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1746                || STMT_VINFO_LIVE_P (stmt_info))
1747               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1748             {
1749               /* A scalar-dependence cycle that we don't support.  */
1750               if (dump_enabled_p ())
1751                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1752                                  "not vectorized: scalar dependence cycle.\n");
1753               return false;
1754             }
1755
               /* A relevant PHI means the loop has something to vectorize;
                  if it is an induction, check that it can be vectorized.  */
1756           if (STMT_VINFO_RELEVANT_P (stmt_info))
1757             {
1758               need_to_vectorize = true;
1759               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1760                 ok = vectorizable_induction (phi, NULL, NULL);
1761             }
1762
               /* A live PHI must additionally pass the live-operation
                  check.  */
1763           if (ok && STMT_VINFO_LIVE_P (stmt_info))
1764             ok = vectorizable_live_operation (phi, NULL, NULL, -1, NULL);
1765
1766           if (!ok)
1767             {
1768               if (dump_enabled_p ())
1769                 {
1770                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1771                                    "not vectorized: relevant phi not "
1772                                    "supported: ");
1773                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1774                 }
1775               return false;
1776             }
1777         }
1778
           /* Second pass over BB: every stmt that is not a clobber is handed
              to vect_analyze_stmt, which is given the address of
              NEED_TO_VECTORIZE.  */
1779       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1780            gsi_next (&si))
1781         {
1782           gimple *stmt = gsi_stmt (si);
1783           if (!gimple_clobber_p (stmt)
1784               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1785             return false;
1786         }
1787     } /* bbs */
1788
1789   /* All operations in the loop are either irrelevant (deal with loop
1790      control, or dead), or only used outside the loop and can be moved
1791      out of the loop (e.g. invariants, inductions).  The loop can be
1792      optimized away by scalar optimizations.  We're better off not
1793      touching this loop.  */
1794   if (!need_to_vectorize)
1795     {
1796       if (dump_enabled_p ())
1797         dump_printf_loc (MSG_NOTE, vect_location,
1798                          "All the computation can be taken out of the loop.\n");
1799       if (dump_enabled_p ())
1800         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1801                          "not vectorized: redundant loop. no profit to "
1802                          "vectorize.\n");
1803       return false;
1804     }
1805
1806   return true;
1807 }
1808
1809
1810 /* Function vect_analyze_loop_2.
1811
1812    Apply a set of analyses on the loop described by LOOP_VINFO.  The
1813    different analyses will record information in the loop_vec_info
1814    struct.

        Return true if the loop can be vectorized.  FATAL is set to true
        while the vector-size-independent checks run and to false once they
        have passed; vect_analyze_loop uses it to decide whether re-trying
        with another vector size makes sense.

        When SLP was attempted and a later check fails, the analysis is
        restarted once from the start_over label with SLP disabled (see the
        again label at the end of the function).  */
1815 static bool
1816 vect_analyze_loop_2 (loop_vec_info loop_vinfo, bool &fatal)
1817 {
1818   bool ok;
1819   int max_vf = MAX_VECTORIZATION_FACTOR;
1820   int min_vf = 2;
1821   unsigned int n_stmts = 0;
1822
1823   /* The first group of checks is independent of the vector size.  */
1824   fatal = true;
1825
1826   /* Find all data references in the loop (which correspond to vdefs/vuses)
1827      and analyze their evolution in the loop.  */
1828
1829   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1830
1831   loop_p loop = LOOP_VINFO_LOOP (loop_vinfo);
1832   if (!find_loop_nest (loop, &LOOP_VINFO_LOOP_NEST (loop_vinfo)))
1833     {
1834       if (dump_enabled_p ())
1835         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1836                          "not vectorized: loop nest containing two "
1837                          "or more consecutive inner loops cannot be "
1838                          "vectorized\n");
1839       return false;
1840     }
1841
       /* Walk every stmt of the loop, counting the non-debug ones in N_STMTS
          (passed to vect_analyze_slp below) and collecting their data
          references.  */
1842   for (unsigned i = 0; i < loop->num_nodes; i++)
1843     for (gimple_stmt_iterator gsi = gsi_start_bb (bbs[i]);
1844          !gsi_end_p (gsi); gsi_next (&gsi))
1845       {
1846         gimple *stmt = gsi_stmt (gsi);
1847         if (is_gimple_debug (stmt))
1848           continue;
1849         ++n_stmts;
1850         if (!find_data_references_in_stmt (loop, stmt,
1851                                            &LOOP_VINFO_DATAREFS (loop_vinfo)))
1852           {
                 /* The data references of STMT could not be analyzed.  This
                    is tolerated only for a call, in a loop with a nonzero
                    safelen, to a function that has SIMD clones and whose
                    arguments and lhs are neither decls nor references with
                    a base address.  */
1853             if (is_gimple_call (stmt) && loop->safelen)
1854               {
1855                 tree fndecl = gimple_call_fndecl (stmt), op;
1856                 if (fndecl != NULL_TREE)
1857                   {
1858                     cgraph_node *node = cgraph_node::get (fndecl);
1859                     if (node != NULL && node->simd_clones != NULL)
1860                       {
1861                         unsigned int j, n = gimple_call_num_args (stmt);
                             /* J stops short of N if some argument is a decl
                                or a reference with a base address.  */
1862                         for (j = 0; j < n; j++)
1863                           {
1864                             op = gimple_call_arg (stmt, j);
1865                             if (DECL_P (op)
1866                                 || (REFERENCE_CLASS_P (op)
1867                                     && get_base_address (op)))
1868                               break;
1869                           }
1870                         op = gimple_call_lhs (stmt);
1871                         /* Ignore #pragma omp declare simd functions
1872                            if they don't have data references in the
1873                            call stmt itself.  */
1874                         if (j == n
1875                             && !(op
1876                                  && (DECL_P (op)
1877                                      || (REFERENCE_CLASS_P (op)
1878                                          && get_base_address (op)))))
1879                           continue;
1880                       }
1881                   }
1882               }
1883             if (dump_enabled_p ())
1884               dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1885                                "not vectorized: loop contains function "
1886                                "calls or data references that cannot "
1887                                "be analyzed\n");
1888             return false;
1889           }
1890       }
1891
1892   /* Analyze the data references and also adjust the minimal
1893      vectorization factor according to the loads and stores.  */
1894
1895   ok = vect_analyze_data_refs (loop_vinfo, &min_vf);
1896   if (!ok)
1897     {
1898       if (dump_enabled_p ())
1899         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1900                          "bad data references.\n");
1901       return false;
1902     }
1903
1904   /* Classify all cross-iteration scalar data-flow cycles.
1905      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1906   vect_analyze_scalar_cycles (loop_vinfo);
1907
1908   vect_pattern_recog (loop_vinfo);
1909
1910   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1911
1912   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1913      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1914
1915   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1916   if (!ok)
1917     {
1918       if (dump_enabled_p ())
1919         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1920                          "bad data access.\n");
1921       return false;
1922     }
1923
1924   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1925
1926   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1927   if (!ok)
1928     {
1929       if (dump_enabled_p ())
1930         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1931                          "unexpected pattern.\n");
1932       return false;
1933     }
1934
1935   /* While the rest of the analysis below depends on the vector size in
          some way, so a failure from here on is not reported as fatal.  */
1936   fatal = false;
1937
1938   /* Analyze data dependences between the data-refs in the loop
1939      and adjust the maximum vectorization factor according to
1940      the dependences.
1941      FORNOW: fail at the first data dependence that we encounter.  */
1942
1943   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1944   if (!ok
1945       || max_vf < min_vf)
1946     {
1947       if (dump_enabled_p ())
1948             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1949                              "bad data dependence.\n");
1950       return false;
1951     }
1952
1953   ok = vect_determine_vectorization_factor (loop_vinfo);
1954   if (!ok)
1955     {
1956       if (dump_enabled_p ())
1957         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1958                          "can't determine vectorization factor.\n");
1959       return false;
1960     }
       /* The chosen factor must not exceed the maximum allowed by the
          dependence analysis.  */
1961   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1962     {
1963       if (dump_enabled_p ())
1964         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1965                          "bad data dependence.\n");
1966       return false;
1967     }
1968
1969   /* Compute the scalar iteration cost.  */
1970   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1971
       /* Remember the factor computed so far; the retry path at the again
          label restores it when SLP is abandoned.  */
1972   int saved_vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1973   HOST_WIDE_INT estimated_niter;
1974   unsigned th;
1975   int min_scalar_loop_bound;
1976
1977   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1978   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1979   if (!ok)
1980     return false;
1981
1982   /* If there are any SLP instances mark them as pure_slp.  */
1983   bool slp = vect_make_slp_decision (loop_vinfo);
1984   if (slp)
1985     {
1986       /* Find stmts that need to be both vectorized and SLPed.  */
1987       vect_detect_hybrid_slp (loop_vinfo);
1988
1989       /* Update the vectorization factor based on the SLP decision.  */
1990       vect_update_vf_for_slp (loop_vinfo);
1991     }
1992
1993   /* This is the point where we can re-start analysis with SLP forced off.  */
1994 start_over:
1995
1996   /* Now the vectorization factor is final.  */
1997   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1998   gcc_assert (vectorization_factor != 0);
1999
       /* NOTE(review): VECTORIZATION_FACTOR is unsigned but is printed with
          %d below.  */
2000   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
2001     dump_printf_loc (MSG_NOTE, vect_location,
2002                      "vectorization_factor = %d, niters = "
2003                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
2004                      LOOP_VINFO_INT_NITERS (loop_vinfo));
2005
       /* Give up if the known iteration count, or the likely upper bound on
          it when one is available (not -1), is smaller than the
          vectorization factor.  */
2006   HOST_WIDE_INT max_niter
2007     = likely_max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2008   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2009        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
2010       || (max_niter != -1
2011           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
2012     {
2013       if (dump_enabled_p ())
2014         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2015                          "not vectorized: iteration count smaller than "
2016                          "vectorization factor.\n");
2017       return false;
2018     }
2019
2020   /* Analyze the alignment of the data-refs in the loop.
2021      Fail if a data reference is found that cannot be vectorized.  */
2022
2023   ok = vect_analyze_data_refs_alignment (loop_vinfo);
2024   if (!ok)
2025     {
2026       if (dump_enabled_p ())
2027         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2028                          "bad data alignment.\n");
2029       return false;
2030     }
2031
2032   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
2033      It is important to call pruning after vect_analyze_data_ref_accesses,
2034      since we use grouping information gathered by interleaving analysis.  */
2035   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
2036   if (!ok)
2037     return false;
2038
2039   /* This pass will decide on using loop versioning and/or loop peeling in
2040      order to enhance the alignment of data references in the loop.  */
2041   ok = vect_enhance_data_refs_alignment (loop_vinfo);
2042   if (!ok)
2043     {
2044       if (dump_enabled_p ())
2045         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2046                          "bad data alignment.\n");
2047       return false;
2048     }
2049
2050   if (slp)
2051     {
2052       /* Analyze operations in the SLP instances.  Note this may
2053          remove unsupported SLP instances which makes the above
2054          SLP kind detection invalid.  */
2055       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
2056       vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
2057                                    LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
           /* A changed instance count means some instance was dropped, so
              take the retry path.  */
2058       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
2059         goto again;
2060     }
2061
2062   /* Scan all the remaining operations in the loop that are not subject
2063      to SLP and make sure they are vectorizable.  */
2064   ok = vect_analyze_loop_operations (loop_vinfo);
2065   if (!ok)
2066     {
2067       if (dump_enabled_p ())
2068         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2069                          "bad operation or unsupported loop bound.\n");
2070       return false;
2071     }
2072
2073   /* Analyze cost.  Decide if worth while to vectorize.  */
2074   int min_profitable_estimate, min_profitable_iters;
2075   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
2076                                       &min_profitable_estimate);
2077
       /* A negative result is treated as "never profitable".  */
2078   if (min_profitable_iters < 0)
2079     {
2080       if (dump_enabled_p ())
2081         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2082                          "not vectorized: vectorization not profitable.\n");
2083       if (dump_enabled_p ())
2084         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2085                          "not vectorized: vector version will never be "
2086                          "profitable.\n");
2087       goto again;
2088     }
2089
2090   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
2091                             * vectorization_factor) - 1);
2092
2093   /* Use the cost model only if it is more conservative than user specified
2094      threshold.  */
2095   th = (unsigned) min_scalar_loop_bound;
2096   if (min_profitable_iters
2097       && (!min_scalar_loop_bound
2098           || min_profitable_iters > min_scalar_loop_bound))
2099     th = (unsigned) min_profitable_iters;
2100
2101   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
2102
       /* With a known iteration count, reject the loop if the count does not
          exceed the threshold.  */
2103   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2104       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
2105     {
2106       if (dump_enabled_p ())
2107         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2108                          "not vectorized: vectorization not profitable.\n");
2109       if (dump_enabled_p ())
2110         dump_printf_loc (MSG_NOTE, vect_location,
2111                          "not vectorized: iteration count smaller than user "
2112                          "specified loop bound parameter or minimum profitable "
2113                          "iterations (whichever is more conservative).\n");
2114       goto again;
2115     }
2116
       /* Apply the same test to the estimated iteration count, falling back
          to MAX_NITER when no estimate is available (-1).  */
2117   estimated_niter
2118     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
2119   if (estimated_niter == -1)
2120     estimated_niter = max_niter;
2121   if (estimated_niter != -1
2122       && ((unsigned HOST_WIDE_INT) estimated_niter
2123           <= MAX (th, (unsigned)min_profitable_estimate)))
2124     {
2125       if (dump_enabled_p ())
2126         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2127                          "not vectorized: estimated iteration count too "
2128                          "small.\n");
2129       if (dump_enabled_p ())
2130         dump_printf_loc (MSG_NOTE, vect_location,
2131                          "not vectorized: estimated iteration count smaller "
2132                          "than specified loop bound parameter or minimum "
2133                          "profitable iterations (whichever is more "
2134                          "conservative).\n");
2135       goto again;
2136     }
2137
2138   /* Decide whether we need to create an epilogue loop to handle
2139      remaining scalar iterations.  */
       /* TH becomes the cost-model threshold plus one, rounded down to a
          multiple of the vectorization factor.  */
2140   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
2141         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2142        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2143
       /* With a known iteration count and a positive peeling amount, peeling
          for niters is requested when the count left after alignment peeling
          has fewer trailing zero bits than log2 of the vectorization factor
          (presumably a power of two here -- TODO confirm).  */
2144   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2145       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
2146     {
2147       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
2148                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
2149           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
2150         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2151     }
2152   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
2153            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
2154                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
2155                /* In case of versioning, check if the maximum number of
2156                   iterations is greater than th.  If they are identical,
2157                   the epilogue is unnecessary.  */
2158                && (!LOOP_REQUIRES_VERSIONING (loop_vinfo)
2159                    || (unsigned HOST_WIDE_INT) max_niter > th)))
2160     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
2161
2162   /* If an epilogue loop is required make sure we can create one.  */
2163   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
2164       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
2165     {
2166       if (dump_enabled_p ())
2167         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
2168       if (!vect_can_advance_ivs_p (loop_vinfo)
2169           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
2170                                            single_exit (LOOP_VINFO_LOOP
2171                                                          (loop_vinfo))))
2172         {
2173           if (dump_enabled_p ())
2174             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2175                              "not vectorized: can't create required "
2176                              "epilog loop\n");
2177           goto again;
2178         }
2179     }
2180
       /* The factor read at start_over must still be the one recorded in
          LOOP_VINFO.  */
2181   gcc_assert (vectorization_factor
2182               == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
2183
2184   /* Ok to vectorize!  */
2185   return true;
2186
2187 again:
2188   /* Try again with SLP forced off but if we didn't do any SLP there is
2189      no point in re-trying.  */
2190   if (!slp)
2191     return false;
2192
2193   /* If there are reduction chains re-trying will fail anyway.  */
2194   if (! LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).is_empty ())
2195     return false;
2196
2197   /* Likewise if the grouped loads or stores in the SLP cannot be handled
2198      via interleaving or lane instructions.  */
2199   slp_instance instance;
2200   slp_tree node;
2201   unsigned i, j;
2202   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
2203     {
2204       stmt_vec_info vinfo;
2205       vinfo = vinfo_for_stmt
2206           (SLP_TREE_SCALAR_STMTS (SLP_INSTANCE_TREE (instance))[0]);
           /* Only instances whose root stmt is a grouped access are
              checked.  */
2207       if (! STMT_VINFO_GROUPED_ACCESS (vinfo))
2208         continue;
2209       vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2210       unsigned int size = STMT_VINFO_GROUP_SIZE (vinfo);
2211       tree vectype = STMT_VINFO_VECTYPE (vinfo);
2212       if (! vect_store_lanes_supported (vectype, size)
2213           && ! vect_grouped_store_supported (vectype, size))
2214         return false;
           /* Apply the corresponding check to every load node of the
              instance.  */
2215       FOR_EACH_VEC_ELT (SLP_INSTANCE_LOADS (instance), j, node)
2216         {
2217           vinfo = vinfo_for_stmt (SLP_TREE_SCALAR_STMTS (node)[0]);
2218           vinfo = vinfo_for_stmt (STMT_VINFO_GROUP_FIRST_ELEMENT (vinfo));
2219           bool single_element_p = !STMT_VINFO_GROUP_NEXT_ELEMENT (vinfo);
2220           size = STMT_VINFO_GROUP_SIZE (vinfo);
2221           vectype = STMT_VINFO_VECTYPE (vinfo);
2222           if (! vect_load_lanes_supported (vectype, size)
2223               && ! vect_grouped_load_supported (vectype, single_element_p,
2224                                                 size))
2225             return false;
2226         }
2227     }
2228
2229   if (dump_enabled_p ())
2230     dump_printf_loc (MSG_NOTE, vect_location,
2231                      "re-trying with SLP disabled\n");
2232
2233   /* Roll back state appropriately.  No SLP this time.  */
2234   slp = false;
2235   /* Restore vectorization factor as it were without SLP.  */
2236   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = saved_vectorization_factor;
2237   /* Free the SLP instances.  */
2238   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), j, instance)
2239     vect_free_slp_instance (instance);
2240   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
2241   /* Reset SLP type to loop_vect on all stmts.  */
2242   for (i = 0; i < LOOP_VINFO_LOOP (loop_vinfo)->num_nodes; ++i)
2243     {
2244       basic_block bb = LOOP_VINFO_BBS (loop_vinfo)[i];
2245       for (gimple_stmt_iterator si = gsi_start_bb (bb);
2246            !gsi_end_p (si); gsi_next (&si))
2247         {
2248           stmt_vec_info stmt_info = vinfo_for_stmt (gsi_stmt (si));
2249           STMT_SLP_TYPE (stmt_info) = loop_vect;
               /* For a stmt that is part of a pattern, also reset the
                  related stmt and every stmt in that related stmt's pattern
                  def sequence.  */
2250           if (STMT_VINFO_IN_PATTERN_P (stmt_info))
2251             {
2252               stmt_info = vinfo_for_stmt (STMT_VINFO_RELATED_STMT (stmt_info));
2253               STMT_SLP_TYPE (stmt_info) = loop_vect;
2254               for (gimple_stmt_iterator pi
2255                      = gsi_start (STMT_VINFO_PATTERN_DEF_SEQ (stmt_info));
2256                    !gsi_end_p (pi); gsi_next (&pi))
2257                 {
2258                   gimple *pstmt = gsi_stmt (pi);
2259                   STMT_SLP_TYPE (vinfo_for_stmt (pstmt)) = loop_vect;
2260                 }
2261             }
2262         }
2263     }
2264   /* Free optimized alias test DDRS.  */
2265   LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).release ();
2266   /* Reset target cost data.  */
2267   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
2268   LOOP_VINFO_TARGET_COST_DATA (loop_vinfo)
2269     = init_cost (LOOP_VINFO_LOOP (loop_vinfo));
2270   /* Reset assorted flags.  */
2271   LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = false;
2272   LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) = false;
2273   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = 0;
2274
2275   goto start_over;
2276 }
2277
2278 /* Function vect_analyze_loop.
2279
2280    Run the vectorizer analyses on LOOP and build a loop_vec_info struct
2281    in which they record what they find.  Return that struct on success
2282    and NULL when LOOP is rejected.  */
2283 loop_vec_info
2284 vect_analyze_loop (struct loop *loop)
2285 {
2286   struct loop *outer = loop_outer (loop);
2287   unsigned int sizes_left;
2288
2289   /* Begin by autodetecting the first vector size we try.  */
2290   current_vector_size = 0;
2291   sizes_left = targetm.vectorize.autovectorize_vector_sizes ();
2292
2293   if (dump_enabled_p ())
2294     dump_printf_loc (MSG_NOTE, vect_location,
2295                      "===== analyze_loop_nest =====\n");
2296
2297   /* Nothing to do if the enclosing loop is already marked vectorizable.  */
2298   if (outer
2299       && loop_vec_info_for_loop (outer)
2300       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (outer)))
2301     {
2302       if (dump_enabled_p ())
2303         dump_printf_loc (MSG_NOTE, vect_location,
2304                          "outer-loop already vectorized.\n");
2305       return NULL;
2306     }
2307
2308   for (;;)
2309     {
2310       bool fatal = false;
2311
2312       /* The loop's CFG (nesting, entry/exit) has to be of a handled form.  */
2313       loop_vec_info loop_vinfo = vect_analyze_loop_form (loop);
2314       if (loop_vinfo == NULL)
2315         {
2316           if (dump_enabled_p ())
2317             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2318                              "bad loop form.\n");
2319           return NULL;
2320         }
2321
2322       if (vect_analyze_loop_2 (loop_vinfo, fatal))
2323         {
2324           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2325           return loop_vinfo;
2326         }
2327
2328       /* Discard the failed attempt and the vector size it was made with.  */
2329       destroy_loop_vec_info (loop_vinfo, true);
2330       sizes_left &= ~current_vector_size;
2331
2332       if (fatal || sizes_left == 0 || current_vector_size == 0)
2333         return NULL;
2334
2335       /* Retry with the biggest vector size that is left.  */
2336       current_vector_size = 1 << floor_log2 (sizes_left);
2337       if (dump_enabled_p ())
2338         dump_printf_loc (MSG_NOTE, vect_location,
2339                          "***** Re-trying analysis with "
2340                          "vector size %d\n", current_vector_size);
2341     }
2342 }
2343
2344
2345 /* Function reduction_code_for_scalar_code
2346
2347    Input:
2348    CODE - tree_code of a reduction operation.
2349
2350    Output:
2351    REDUC_CODE - the tree-code that reduces the vector of partial results
2352       into a single scalar result, or ERROR_MARK if CODE is a supported
2353       reduction that has no such tree-code.  Not written on failure.
2354
2355    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2356
2357 static bool
2358 reduction_code_for_scalar_code (enum tree_code code,
2359                                 enum tree_code *reduc_code)
2360 {
2361   enum tree_code result;
2362
2363   switch (code)
2364     {
2365     case MAX_EXPR:
2366       result = REDUC_MAX_EXPR;
2367       break;
2368     case MIN_EXPR:
2369       result = REDUC_MIN_EXPR;
2370       break;
2371     case PLUS_EXPR:
2372       result = REDUC_PLUS_EXPR;
2373       break;
2374     /* Supported reductions without a reduction tree-code of their own.  */
2375     case MULT_EXPR:
2376     case MINUS_EXPR:
2377     case BIT_IOR_EXPR:
2378     case BIT_XOR_EXPR:
2379     case BIT_AND_EXPR:
2380       result = ERROR_MARK;
2381       break;
2382     default:
2383       return false;
2384     }
2385   *reduc_code = result;
2386   return true;
2387 }
2388
2389
2390 /* Error reporting helper for vect_is_simple_reduction below.  Dumps MSG
2391    followed by GIMPLE statement STMT (TDF_SLIM) to the MSG_TYPE dumps.  */
2392
2393 static void
2394 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2395 {
2396   dump_printf_loc (msg_type, vect_location, "%s", msg);
2397   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2398 }
2399
2400
2401 /* Detect SLP reduction of the form:
2402
2403    #a1 = phi <a5, a0>
2404    a2 = operation (a1)
2405    a3 = operation (a2)
2406    a4 = operation (a3)
2407    a5 = operation (a4)
2408
2409    #a = phi <a5>
2410
2411    LOOP_INFO describes the loop being vectorized; PHI is the reduction phi
2412    node (#a1 = phi <a5, a0> above).  FIRST_STMT is the first reduction stmt
2413    in the chain (a2 = operation (a1)); only its rhs code is read here.
2414    On success the chain is pushed to LOOP_VINFO_REDUCTION_CHAINS.
2415    Return TRUE if a reduction chain was detected.  */
2416
2417 static bool
2418 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2419                        gimple *first_stmt)
2420 {
2421   struct loop *loop = (gimple_bb (phi))->loop_father;
2422   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2423   enum tree_code code;
2424   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2425   stmt_vec_info use_stmt_info, current_stmt_info;
2426   tree lhs;
2427   imm_use_iterator imm_iter;
2428   use_operand_p use_p;
2429   int nloop_uses, size = 0, n_out_of_loop_uses;
2430   bool found = false;
2431   /* Only handle phis of the loop being vectorized.  */
2432   if (loop != vect_loop)
2433     return false;
2434   /* Follow the uses of the PHI result from statement to statement.  */
2435   lhs = PHI_RESULT (phi);
2436   code = gimple_assign_rhs_code (first_stmt);
2437   while (1)
2438     {
2439       nloop_uses = 0;
2440       n_out_of_loop_uses = 0;
2441       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2442         {
2443           gimple *use_stmt = USE_STMT (use_p);
2444           if (is_gimple_debug (use_stmt))
2445             continue;
2446
2447           /* Check if we got back to the reduction phi.  */
2448           if (use_stmt == phi)
2449             {
2450               loop_use_stmt = use_stmt;
2451               found = true;
2452               break;
2453             }
2454
2455           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2456             {
2457               loop_use_stmt = use_stmt;
2458               nloop_uses++;
2459             }
2460            else
2461              n_out_of_loop_uses++;
2462
2463            /* There can be either a single use in the loop or two uses in
2464               phi nodes.  */
2465            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2466              return false;
2467         }
2468
2469       if (found)
2470         break;
2471
2472       /* We reached a statement with no loop uses.  */
2473       if (nloop_uses == 0)
2474         return false;
2475
2476       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2477       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2478         return false;
2479
2480       if (!is_gimple_assign (loop_use_stmt)
2481           || code != gimple_assign_rhs_code (loop_use_stmt)
2482           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2483         return false;
2484
2485       /* Insert LOOP_USE_STMT into the reduction chain.  */
2486       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2487       if (current_stmt)
2488         {
2489           current_stmt_info = vinfo_for_stmt (current_stmt);
2490           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2491           GROUP_FIRST_ELEMENT (use_stmt_info)
2492             = GROUP_FIRST_ELEMENT (current_stmt_info);
2493         }
2494       else
2495         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2496
2497       lhs = gimple_assign_lhs (loop_use_stmt);
2498       current_stmt = loop_use_stmt;
2499       size++;
2500    }
2501   /* NOTE(review): GROUP_* links set above are not reset on failure returns.  */
2502   if (!found || loop_use_stmt != phi || size < 2)
2503     return false;
2504
2505   /* Swap the operands, if needed, to make the reduction operand be the second
2506      operand.  */
2507   lhs = PHI_RESULT (phi);
2508   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2509   while (next_stmt)
2510     {
2511       if (gimple_assign_rhs2 (next_stmt) == lhs)
2512         {
2513           tree op = gimple_assign_rhs1 (next_stmt);
2514           gimple *def_stmt = NULL;
2515
2516           if (TREE_CODE (op) == SSA_NAME)
2517             def_stmt = SSA_NAME_DEF_STMT (op);
2518
2519           /* Check that the other def is either defined in the loop
2520              ("vect_internal_def"), or it's an induction (defined by a
2521              loop-header phi-node).  */
2522           if (def_stmt
2523               && gimple_bb (def_stmt)
2524               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2525               && (is_gimple_assign (def_stmt)
2526                   || is_gimple_call (def_stmt)
2527                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2528                            == vect_induction_def
2529                   || (gimple_code (def_stmt) == GIMPLE_PHI
2530                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2531                                   == vect_internal_def
2532                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2533             {
2534               lhs = gimple_assign_lhs (next_stmt);
2535               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2536               continue;
2537             }
2538
2539           return false;
2540         }
2541       else
2542         {
2543           tree op = gimple_assign_rhs2 (next_stmt);
2544           gimple *def_stmt = NULL;
2545
2546           if (TREE_CODE (op) == SSA_NAME)
2547             def_stmt = SSA_NAME_DEF_STMT (op);
2548
2549           /* Check that the other def is either defined in the loop
2550              ("vect_internal_def"), or it's an induction (defined by a
2551              loop-header phi-node).  */
2552           if (def_stmt
2553               && gimple_bb (def_stmt)
2554               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2555               && (is_gimple_assign (def_stmt)
2556                   || is_gimple_call (def_stmt)
2557                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2558                               == vect_induction_def
2559                   || (gimple_code (def_stmt) == GIMPLE_PHI
2560                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2561                                   == vect_internal_def
2562                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2563             {
2564               if (dump_enabled_p ())
2565                 {
2566                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2567                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2568                 }
2569
2570               swap_ssa_operands (next_stmt,
2571                                  gimple_assign_rhs1_ptr (next_stmt),
2572                                  gimple_assign_rhs2_ptr (next_stmt));
2573               update_stmt (next_stmt);
2574               /* Record that the swap moved a constant into the first operand.  */
2575               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2576                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2577             }
2578           else
2579             return false;
2580         }
2581
2582       lhs = gimple_assign_lhs (next_stmt);
2583       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2584     }
2585
2586   /* Save the chain for further analysis in SLP detection.  */
2587   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2588   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2589   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2590
2591   return true;
2592 }
2593
2594
2595 /* Function vect_is_simple_reduction
2596
2597    (1) Detect a cross-iteration def-use cycle that represents a simple
2598    reduction computation.  We look for the following pattern:
2599
2600    loop_header:
2601      a1 = phi < a0, a2 >
2602      a3 = ...
2603      a2 = operation (a3, a1)
2604
2605    or
2606
2607    a3 = ...
2608    loop_header:
2609      a1 = phi < a0, a2 >
2610      a2 = operation (a3, a1)
2611
2612    such that:
2613    1. operation is commutative and associative and it is safe to
2614       change the order of the computation (if CHECK_REDUCTION is true)
2615    2. no uses for a2 in the loop (a2 is used out of the loop)
2616    3. no uses of a1 in the loop besides the reduction operation
2617    4. no uses of a1 outside the loop.
2618
2619    Conditions 1,4 are tested here.
2620    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2621
2622    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2623    nested cycles, if CHECK_REDUCTION is false.
2624
2625    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2626    reductions:
2627
2628      a1 = phi < a0, a2 >
2629      inner loop (def of a3)
2630      a2 = phi < a3 >
2631
2632    (4) Detect condition expressions, ie:
2633      for (int i = 0; i < N; i++)
2634        if (a[i] < val)
2635         ret_val = a[i];
2636    Returns the detected reduction stmt, or NULL if there is none.
2637    *DOUBLE_REDUC and *V_REDUC_TYPE are always written.  */
2638
2639 static gimple *
2640 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2641                           bool check_reduction, bool *double_reduc,
2642                           bool need_wrapping_integral_overflow,
2643                           enum vect_reduction_type *v_reduc_type)
2644 {
2645   struct loop *loop = (gimple_bb (phi))->loop_father;
2646   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2647   edge latch_e = loop_latch_edge (loop);
2648   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2649   gimple *def_stmt, *def1 = NULL, *def2 = NULL, *phi_use_stmt = NULL;
2650   enum tree_code orig_code, code;
2651   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2652   tree type;
2653   int nloop_uses;
2654   tree name;
2655   imm_use_iterator imm_iter;
2656   use_operand_p use_p;
2657   bool phi_def;
2658   /* Defaults for the out-parameters.  */
2659   *double_reduc = false;
2660   *v_reduc_type = TREE_CODE_REDUCTION;
2661
2662   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2663      otherwise, we assume outer loop vectorization.  */
2664   gcc_assert ((check_reduction && loop == vect_loop)
2665               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2666
2667   name = PHI_RESULT (phi);
2668   /* ???  If there are no uses of the PHI result the inner loop reduction
2669      won't be detected as possibly double-reduction by vectorizable_reduction
2670      because that tries to walk the PHI arg from the preheader edge which
2671      can be constant.  See PR60382.  */
2672   if (has_zero_uses (name))
2673     return NULL;
2674   nloop_uses = 0;
2675   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2676     {
2677       gimple *use_stmt = USE_STMT (use_p);
2678       if (is_gimple_debug (use_stmt))
2679         continue;
2680
2681       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2682         {
2683           if (dump_enabled_p ())
2684             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2685                              "intermediate value used outside loop.\n");
2686
2687           return NULL;
2688         }
2689
2690       nloop_uses++;
2691       if (nloop_uses > 1)
2692         {
2693           if (dump_enabled_p ())
2694             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2695                              "reduction used in loop.\n");
2696           return NULL;
2697         }
2698
2699       phi_use_stmt = use_stmt;
2700     }
2701   /* The value coming in over the latch edge must be an SSA name.  */
2702   if (TREE_CODE (loop_arg) != SSA_NAME)
2703     {
2704       if (dump_enabled_p ())
2705         {
2706           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2707                            "reduction: not ssa_name: ");
2708           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2709           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2710         }
2711       return NULL;
2712     }
2713
2714   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2715   if (!def_stmt)
2716     {
2717       if (dump_enabled_p ())
2718         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2719                          "reduction: no def_stmt.\n");
2720       return NULL;
2721     }
2722   /* Its definition has to be an assignment or a phi.  */
2723   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2724     {
2725       if (dump_enabled_p ())
2726         dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2727       return NULL;
2728     }
2729
2730   if (is_gimple_assign (def_stmt))
2731     {
2732       name = gimple_assign_lhs (def_stmt);
2733       phi_def = false;
2734     }
2735   else
2736     {
2737       name = PHI_RESULT (def_stmt);
2738       phi_def = true;
2739     }
2740   /* That definition may have at most one non-debug use inside LOOP.  */
2741   nloop_uses = 0;
2742   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2743     {
2744       gimple *use_stmt = USE_STMT (use_p);
2745       if (is_gimple_debug (use_stmt))
2746         continue;
2747       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2748         nloop_uses++;
2749       if (nloop_uses > 1)
2750         {
2751           if (dump_enabled_p ())
2752             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2753                              "reduction used in loop.\n");
2754           return NULL;
2755         }
2756     }
2757
2758   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2759      defined in the inner loop.  */
2760   if (phi_def)
2761     {
2762       op1 = PHI_ARG_DEF (def_stmt, 0);
2763
2764       if (gimple_phi_num_args (def_stmt) != 1
2765           || TREE_CODE (op1) != SSA_NAME)
2766         {
2767           if (dump_enabled_p ())
2768             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2769                              "unsupported phi node definition.\n");
2770
2771           return NULL;
2772         }
2773
2774       def1 = SSA_NAME_DEF_STMT (op1);
2775       if (gimple_bb (def1)
2776           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2777           && loop->inner
2778           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2779           && is_gimple_assign (def1)
2780           && flow_bb_inside_loop_p (loop->inner, gimple_bb (phi_use_stmt)))
2781         {
2782           if (dump_enabled_p ())
2783             report_vect_op (MSG_NOTE, def_stmt,
2784                             "detected double reduction: ");
2785
2786           *double_reduc = true;
2787           return def_stmt;
2788         }
2789
2790       return NULL;
2791     }
2792
2793   code = orig_code = gimple_assign_rhs_code (def_stmt);
2794
2795   /* We can handle "res -= x[i]", which is non-associative by
2796      simply rewriting this into "res += -x[i]".  Avoid changing
2797      gimple instruction for the first simple tests and only do this
2798      if we're allowed to change code at all.  */
2799   if (code == MINUS_EXPR
2800       && (op1 = gimple_assign_rhs1 (def_stmt))
2801       && TREE_CODE (op1) == SSA_NAME
2802       && SSA_NAME_DEF_STMT (op1) == phi)
2803     code = PLUS_EXPR;
2804   /* COND_EXPR passes; other codes must be commutative and associative.  */
2805   if (code == COND_EXPR)
2806     {
2807       if (check_reduction)
2808         *v_reduc_type = COND_REDUCTION;
2809     }
2810   else if (!commutative_tree_code (code) || !associative_tree_code (code))
2811     {
2812       if (dump_enabled_p ())
2813         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2814                         "reduction: not commutative/associative: ");
2815       return NULL;
2816     }
2817   /* Collect the operands; a COND_EXPR also yields its condition in OP3/OP4.  */
2818   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2819     {
2820       if (code != COND_EXPR)
2821         {
2822           if (dump_enabled_p ())
2823             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2824                             "reduction: not binary operation: ");
2825
2826           return NULL;
2827         }
2828
2829       op3 = gimple_assign_rhs1 (def_stmt);
2830       if (COMPARISON_CLASS_P (op3))
2831         {
2832           op4 = TREE_OPERAND (op3, 1);
2833           op3 = TREE_OPERAND (op3, 0);
2834         }
2835
2836       op1 = gimple_assign_rhs2 (def_stmt);
2837       op2 = gimple_assign_rhs3 (def_stmt);
2838
2839       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2840         {
2841           if (dump_enabled_p ())
2842             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2843                             "reduction: uses not ssa_names: ");
2844
2845           return NULL;
2846         }
2847     }
2848   else
2849     {
2850       op1 = gimple_assign_rhs1 (def_stmt);
2851       op2 = gimple_assign_rhs2 (def_stmt);
2852
2853       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2854         {
2855           if (dump_enabled_p ())
2856             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2857                             "reduction: uses not ssa_names: ");
2858
2859           return NULL;
2860         }
2861    }
2862   /* Every SSA_NAME operand must be type-compatible with the result.  */
2863   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2864   if ((TREE_CODE (op1) == SSA_NAME
2865        && !types_compatible_p (type,TREE_TYPE (op1)))
2866       || (TREE_CODE (op2) == SSA_NAME
2867           && !types_compatible_p (type, TREE_TYPE (op2)))
2868       || (op3 && TREE_CODE (op3) == SSA_NAME
2869           && !types_compatible_p (type, TREE_TYPE (op3)))
2870       || (op4 && TREE_CODE (op4) == SSA_NAME
2871           && !types_compatible_p (type, TREE_TYPE (op4))))
2872     {
2873       if (dump_enabled_p ())
2874         {
2875           dump_printf_loc (MSG_NOTE, vect_location,
2876                            "reduction: multiple types: operation type: ");
2877           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2878           dump_printf (MSG_NOTE, ", operands types: ");
2879           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2880                              TREE_TYPE (op1));
2881           dump_printf (MSG_NOTE, ",");
2882           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2883                              TREE_TYPE (op2));
2884           if (op3)
2885             {
2886               dump_printf (MSG_NOTE, ",");
2887               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2888                                  TREE_TYPE (op3));
2889             }
2890
2891           if (op4)
2892             {
2893               dump_printf (MSG_NOTE, ",");
2894               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2895                                  TREE_TYPE (op4));
2896             }
2897           dump_printf (MSG_NOTE, "\n");
2898         }
2899
2900       return NULL;
2901     }
2902
2903   /* Check that it's ok to change the order of the computation.
2904      Generally, when vectorizing a reduction we change the order of the
2905      computation.  This may change the behavior of the program in some
2906      cases, so we need to check that this is ok.  One exception is when
2907      vectorizing an outer-loop: the inner-loop is executed sequentially,
2908      and therefore vectorizing reductions in the inner-loop during
2909      outer-loop vectorization is safe.  */
2910
2911   if (*v_reduc_type != COND_REDUCTION
2912       && check_reduction)
2913     {
2914       /* CHECKME: check for !flag_finite_math_only too?  */
2915       if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math)
2916         {
2917           /* Changing the order of operations changes the semantics.  */
2918           if (dump_enabled_p ())
2919             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2920                         "reduction: unsafe fp math optimization: ");
2921           return NULL;
2922         }
2923       else if (INTEGRAL_TYPE_P (type))
2924         {
2925           if (!operation_no_trapping_overflow (type, code))
2926             {
2927               /* Changing the order of operations changes the semantics.  */
2928               if (dump_enabled_p ())
2929                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2930                                 "reduction: unsafe int math optimization"
2931                                 " (overflow traps): ");
2932               return NULL;
2933             }
2934           if (need_wrapping_integral_overflow
2935               && !TYPE_OVERFLOW_WRAPS (type)
2936               && operation_can_overflow (code))
2937             {
2938               /* Changing the order of operations changes the semantics.  */
2939               if (dump_enabled_p ())
2940                 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2941                                 "reduction: unsafe int math optimization"
2942                                 " (overflow doesn't wrap): ");
2943               return NULL;
2944             }
2945         }
2946       else if (SAT_FIXED_POINT_TYPE_P (type))
2947         {
2948           /* Changing the order of operations changes the semantics.  */
2949           if (dump_enabled_p ())
2950           report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2951                           "reduction: unsafe fixed-point math optimization: ");
2952           return NULL;
2953         }
2954     }
2955
2956   /* Reduction is safe. We're dealing with one of the following:
2957      1) integer arithmetic and no trapv
2958      2) floating point arithmetic, and special flags permit this optimization
2959      3) nested cycle (i.e., outer loop vectorization).  */
2960   if (TREE_CODE (op1) == SSA_NAME)
2961     def1 = SSA_NAME_DEF_STMT (op1);
2962
2963   if (TREE_CODE (op2) == SSA_NAME)
2964     def2 = SSA_NAME_DEF_STMT (op2);
2965
2966   if (code != COND_EXPR
2967       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2968     {
2969       if (dump_enabled_p ())
2970         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2971       return NULL;
2972     }
2973
2974   /* Check that one def is the reduction def, defined by PHI,
2975      the other def is either defined in the loop ("vect_internal_def"),
2976      or it's an induction (defined by a loop-header phi-node).  */
2977
2978   if (def2 && def2 == phi
2979       && (code == COND_EXPR
2980           || !def1 || gimple_nop_p (def1)
2981           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2982           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2983               && (is_gimple_assign (def1)
2984                   || is_gimple_call (def1)
2985                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2986                       == vect_induction_def
2987                   || (gimple_code (def1) == GIMPLE_PHI
2988                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2989                           == vect_internal_def
2990                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2991     {
2992       if (dump_enabled_p ())
2993         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2994       return def_stmt;
2995     }
2996
2997   if (def1 && def1 == phi
2998       && (code == COND_EXPR
2999           || !def2 || gimple_nop_p (def2)
3000           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
3001           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
3002               && (is_gimple_assign (def2)
3003                   || is_gimple_call (def2)
3004                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3005                       == vect_induction_def
3006                   || (gimple_code (def2) == GIMPLE_PHI
3007                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
3008                           == vect_internal_def
3009                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
3010     {
3011       if (check_reduction
3012           && orig_code != MINUS_EXPR)
3013         {
3014           if (code == COND_EXPR)
3015             {
3016               /* No current known use where this case would be useful.  */
3017               if (dump_enabled_p ())
3018                 report_vect_op (MSG_NOTE, def_stmt,
3019                                 "detected reduction: cannot currently swap "
3020                                 "operands for cond_expr");
3021               return NULL;
3022             }
3023
3024           /* Swap operands (just for simplicity - so that the rest of the code
3025              can assume that the reduction variable is always the last (second)
3026              argument).  */
3027           if (dump_enabled_p ())
3028             report_vect_op (MSG_NOTE, def_stmt,
3029                             "detected reduction: need to swap operands: ");
3030
3031           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
3032                              gimple_assign_rhs2_ptr (def_stmt));
3033
3034           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
3035             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
3036         }
3037       else
3038         {
3039           if (dump_enabled_p ())
3040             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
3041         }
3042
3043       return def_stmt;
3044     }
3045
3046   /* Try to find SLP reduction chain.  */
3047   if (check_reduction && code != COND_EXPR
3048       && vect_is_slp_reduction (loop_info, phi, def_stmt))
3049     {
3050       if (dump_enabled_p ())
3051         report_vect_op (MSG_NOTE, def_stmt,
3052                         "reduction: detected reduction chain: ");
3053
3054       return def_stmt;
3055     }
3056
3057   if (dump_enabled_p ())
3058     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
3059                     "reduction: unknown pattern: ");
3060
3061   return NULL;
3062 }
3063
3064 /* Public wrapper around vect_is_simple_reduction, which will modify code
3065    in-place if it enables detection of more reductions.  Arguments and
3066    return value as there; the detected reduction type is dropped.  */
3067 gimple *
3068 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
3069                              bool check_reduction, bool *double_reduc,
3070                              bool need_wrapping_integral_overflow)
3071 {
3072   enum vect_reduction_type ignored_reduc_type;
3073   gimple *reduc_stmt
3074     = vect_is_simple_reduction (loop_info, phi, check_reduction,
3075                                 double_reduc,
3076                                 need_wrapping_integral_overflow,
3077                                 &ignored_reduc_type);
3078   return reduc_stmt;
3079 }
3079
3080 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  Sets
3081    *PEEL_ITERS_EPILOGUE, records the prologue/epilogue statement costs in
3082    PROLOGUE_COST_VEC and EPILOGUE_COST_VEC and returns their sum.  */
3083 int
3084 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
3085                              int *peel_iters_epilogue,
3086                              stmt_vector_for_cost *scalar_cost_vec,
3087                              stmt_vector_for_cost *prologue_cost_vec,
3088                              stmt_vector_for_cost *epilogue_cost_vec)
3089 {
3090   int retval = 0;
3091   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3092
3093   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
3094     {
3095       *peel_iters_epilogue = vf/2;
3096       if (dump_enabled_p ())
3097         dump_printf_loc (MSG_NOTE, vect_location,
3098                          "cost model: epilogue peel iters set to vf/2 "
3099                          "because loop iterations are unknown .\n");
3100
3101       /* The number of scalar iterations is unknown, so count one taken
3102          branch for the peeled prologue and one for the peeled epilogue.  */
3103       retval += record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
3104                                   NULL, 0, vect_prologue);
3105       retval += record_stmt_cost (epilogue_cost_vec, 1, cond_branch_taken,
3106                                   NULL, 0, vect_epilogue);
3107     }
3108   else
3109     {
3110       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
3111       peel_iters_prologue = niters < peel_iters_prologue ?
3112                             niters : peel_iters_prologue;
3113       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
3114       /* If we need to peel for gaps, but no peeling is required, we have to
3115          peel VF iterations.  */
3116       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
3117         *peel_iters_epilogue = vf;
3118     }
3119
3120   stmt_info_for_cost *si;
3121   int j;
3122   if (peel_iters_prologue)
3123     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3124       retval += record_stmt_cost (prologue_cost_vec,
3125                                   si->count * peel_iters_prologue, si->kind,
3126                                   NULL, si->misalign, vect_prologue);
3127   if (*peel_iters_epilogue)
3128     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
3129       retval += record_stmt_cost (epilogue_cost_vec,
3130                                   si->count * *peel_iters_epilogue, si->kind,
3131                                   NULL, si->misalign, vect_epilogue);
3132
3133   return retval;
3134 }
3135
3136 /* Function vect_estimate_min_profitable_iters
3137
3138    Return the number of iterations required for the vector version of the
3139    loop to be profitable relative to the cost of the scalar version of the
3140    loop.
3141
3142    *RET_MIN_PROFITABLE_NITERS is a cost model profitability threshold
3143    of iterations for vectorization.  -1 value means loop vectorization
3144    is not profitable.  This returned value may be used for dynamic
3145    profitability check.
3146
3147    *RET_MIN_PROFITABLE_ESTIMATE is a profitability threshold to be used
3148    for static check against estimated number of iterations.  */
3149
3150 static void
3151 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
3152                                     int *ret_min_profitable_niters,
3153                                     int *ret_min_profitable_estimate)
3154 {
3155   int min_profitable_iters;
3156   int min_profitable_estimate;
3157   int peel_iters_prologue;
3158   int peel_iters_epilogue;
3159   unsigned vec_inside_cost = 0;
3160   int vec_outside_cost = 0;
3161   unsigned vec_prologue_cost = 0;
3162   unsigned vec_epilogue_cost = 0;
3163   int scalar_single_iter_cost = 0;
3164   int scalar_outside_cost = 0;
3165   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3166   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
3167   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3168
3169   /* Cost model disabled.  */
3170   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
3171     {
3172       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
3173       *ret_min_profitable_niters = 0;
3174       *ret_min_profitable_estimate = 0;
3175       return;
3176     }
3177
3178   /* Requires loop versioning tests to handle misalignment.  */
3179   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
3180     {
3181       /*  FIXME: Make cost depend on complexity of individual check.  */
3182       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
3183       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3184                             vect_prologue);
3185       dump_printf (MSG_NOTE,
3186                    "cost model: Adding cost of checks for loop "
3187                    "versioning to treat misalignment.\n");
3188     }
3189
3190   /* Requires loop versioning with alias checks.  */
3191   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3192     {
3193       /*  FIXME: Make cost depend on complexity of individual check.  */
3194       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
3195       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
3196                             vect_prologue);
3197       dump_printf (MSG_NOTE,
3198                    "cost model: Adding cost of checks for loop "
3199                    "versioning aliasing.\n");
3200     }
3201
3202   /* Requires loop versioning with niter checks.  */
3203   if (LOOP_REQUIRES_VERSIONING_FOR_NITERS (loop_vinfo))
3204     {
3205       /*  FIXME: Make cost depend on complexity of individual check.  */
3206       (void) add_stmt_cost (target_cost_data, 1, vector_stmt, NULL, 0,
3207                             vect_prologue);
3208       dump_printf (MSG_NOTE,
3209                    "cost model: Adding cost of checks for loop "
3210                    "versioning niters.\n");
3211     }
3212
3213   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3214     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
3215                           vect_prologue);
3216
3217   /* Count statements in scalar loop.  Using this as scalar cost for a single
3218      iteration for now.
3219
3220      TODO: Add outer loop support.
3221
3222      TODO: Consider assigning different costs to different scalar
3223      statements.  */
3224
3225   scalar_single_iter_cost
3226     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
3227
3228   /* Add additional cost for the peeled instructions in prologue and epilogue
3229      loop.
3230
3231      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
3232      at compile-time - we assume it's vf/2 (the worst would be vf-1).
3233
3234      TODO: Build an expression that represents peel_iters for prologue and
3235      epilogue to be used in a run-time test.  */
3236
3237   if (npeel  < 0)
3238     {
3239       peel_iters_prologue = vf/2;
3240       dump_printf (MSG_NOTE, "cost model: "
3241                    "prologue peel iters set to vf/2.\n");
3242
3243       /* If peeling for alignment is unknown, loop bound of main loop becomes
3244          unknown.  */
3245       peel_iters_epilogue = vf/2;
3246       dump_printf (MSG_NOTE, "cost model: "
3247                    "epilogue peel iters set to vf/2 because "
3248                    "peeling for alignment is unknown.\n");
3249
3250       /* If peeled iterations are unknown, count a taken branch and a not taken
3251          branch per peeled loop. Even if scalar loop iterations are known,
3252          vector iterations are not known since peeled prologue iterations are
3253          not known. Hence guards remain the same.  */
3254       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3255                             NULL, 0, vect_prologue);
3256       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3257                             NULL, 0, vect_prologue);
3258       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
3259                             NULL, 0, vect_epilogue);
3260       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
3261                             NULL, 0, vect_epilogue);
3262       stmt_info_for_cost *si;
3263       int j;
3264       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
3265         {
3266           struct _stmt_vec_info *stmt_info
3267             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3268           (void) add_stmt_cost (target_cost_data,
3269                                 si->count * peel_iters_prologue,
3270                                 si->kind, stmt_info, si->misalign,
3271                                 vect_prologue);
3272           (void) add_stmt_cost (target_cost_data,
3273                                 si->count * peel_iters_epilogue,
3274                                 si->kind, stmt_info, si->misalign,
3275                                 vect_epilogue);
3276         }
3277     }
3278   else
3279     {
3280       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
3281       stmt_info_for_cost *si;
3282       int j;
3283       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3284
3285       prologue_cost_vec.create (2);
3286       epilogue_cost_vec.create (2);
3287       peel_iters_prologue = npeel;
3288
3289       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
3290                                           &peel_iters_epilogue,
3291                                           &LOOP_VINFO_SCALAR_ITERATION_COST
3292                                             (loop_vinfo),
3293                                           &prologue_cost_vec,
3294                                           &epilogue_cost_vec);
3295
3296       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
3297         {
3298           struct _stmt_vec_info *stmt_info
3299             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3300           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3301                                 si->misalign, vect_prologue);
3302         }
3303
3304       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
3305         {
3306           struct _stmt_vec_info *stmt_info
3307             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
3308           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
3309                                 si->misalign, vect_epilogue);
3310         }
3311
3312       prologue_cost_vec.release ();
3313       epilogue_cost_vec.release ();
3314     }
3315
3316   /* FORNOW: The scalar outside cost is incremented in one of the
3317      following ways:
3318
3319      1. The vectorizer checks for alignment and aliasing and generates
3320      a condition that allows dynamic vectorization.  A cost model
3321      check is ANDED with the versioning condition.  Hence scalar code
3322      path now has the added cost of the versioning check.
3323
3324        if (cost > th & versioning_check)
3325          jmp to vector code
3326
3327      Hence run-time scalar is incremented by not-taken branch cost.
3328
3329      2. The vectorizer then checks if a prologue is required.  If the
3330      cost model check was not done before during versioning, it has to
3331      be done before the prologue check.
3332
3333        if (cost <= th)
3334          prologue = scalar_iters
3335        if (prologue == 0)
3336          jmp to vector code
3337        else
3338          execute prologue
3339        if (prologue == num_iters)
3340          go to exit
3341
3342      Hence the run-time scalar cost is incremented by a taken branch,
3343      plus a not-taken branch, plus a taken branch cost.
3344
3345      3. The vectorizer then checks if an epilogue is required.  If the
3346      cost model check was not done before during prologue check, it
3347      has to be done with the epilogue check.
3348
3349        if (prologue == 0)
3350          jmp to vector code
3351        else
3352          execute prologue
3353        if (prologue == num_iters)
3354          go to exit
3355        vector code:
3356          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3357            jmp to epilogue
3358
3359      Hence the run-time scalar cost should be incremented by 2 taken
3360      branches.
3361
3362      TODO: The back end may reorder the BBS's differently and reverse
3363      conditions/branch directions.  Change the estimates below to
3364      something more reasonable.  */
3365
3366   /* If the number of iterations is known and we do not do versioning, we can
3367      decide whether to vectorize at compile time.  Hence the scalar version
3368      do not carry cost model guard costs.  */
3369   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3370       || LOOP_REQUIRES_VERSIONING (loop_vinfo))
3371     {
3372       /* Cost model check occurs at versioning.  */
3373       if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
3374         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3375       else
3376         {
3377           /* Cost model check occurs at prologue generation.  */
3378           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3379             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3380               + vect_get_stmt_cost (cond_branch_not_taken);
3381           /* Cost model check occurs at epilogue generation.  */
3382           else
3383             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3384         }
3385     }
3386
3387   /* Complete the target-specific cost calculations.  */
3388   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3389                &vec_inside_cost, &vec_epilogue_cost);
3390
3391   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3392
3393   if (dump_enabled_p ())
3394     {
3395       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3396       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3397                    vec_inside_cost);
3398       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3399                    vec_prologue_cost);
3400       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3401                    vec_epilogue_cost);
3402       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3403                    scalar_single_iter_cost);
3404       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3405                    scalar_outside_cost);
3406       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3407                    vec_outside_cost);
3408       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3409                    peel_iters_prologue);
3410       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3411                    peel_iters_epilogue);
3412     }
3413
3414   /* Calculate number of iterations required to make the vector version
3415      profitable, relative to the loop bodies only.  The following condition
3416      must hold true:
3417      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3418      where
3419      SIC = scalar iteration cost, VIC = vector iteration cost,
3420      VOC = vector outside cost, VF = vectorization factor,
3421      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3422      SOC = scalar outside cost for run time cost model check.  */
3423
3424   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3425     {
3426       if (vec_outside_cost <= 0)
3427         min_profitable_iters = 1;
3428       else
3429         {
3430           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3431                                   - vec_inside_cost * peel_iters_prologue
3432                                   - vec_inside_cost * peel_iters_epilogue)
3433                                  / ((scalar_single_iter_cost * vf)
3434                                     - vec_inside_cost);
3435
3436           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3437               <= (((int) vec_inside_cost * min_profitable_iters)
3438                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3439             min_profitable_iters++;
3440         }
3441     }
3442   /* vector version will never be profitable.  */
3443   else
3444     {
3445       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3446         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3447                     "did not happen for a simd loop");
3448
3449       if (dump_enabled_p ())
3450         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3451                          "cost model: the vector iteration cost = %d "
3452                          "divided by the scalar iteration cost = %d "
3453                          "is greater or equal to the vectorization factor = %d"
3454                          ".\n",
3455                          vec_inside_cost, scalar_single_iter_cost, vf);
3456       *ret_min_profitable_niters = -1;
3457       *ret_min_profitable_estimate = -1;
3458       return;
3459     }
3460
3461   dump_printf (MSG_NOTE,
3462                "  Calculated minimum iters for profitability: %d\n",
3463                min_profitable_iters);
3464
3465   min_profitable_iters =
3466         min_profitable_iters < vf ? vf : min_profitable_iters;
3467
3468   /* Because the condition we create is:
3469      if (niters <= min_profitable_iters)
3470        then skip the vectorized loop.  */
3471   min_profitable_iters--;
3472
3473   if (dump_enabled_p ())
3474     dump_printf_loc (MSG_NOTE, vect_location,
3475                      "  Runtime profitability threshold = %d\n",
3476                      min_profitable_iters);
3477
3478   *ret_min_profitable_niters = min_profitable_iters;
3479
3480   /* Calculate number of iterations required to make the vector version
3481      profitable, relative to the loop bodies only.
3482
3483      Non-vectorized variant is SIC * niters and it must win over vector
3484      variant on the expected loop trip count.  The following condition must hold true:
3485      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3486
3487   if (vec_outside_cost <= 0)
3488     min_profitable_estimate = 1;
3489   else
3490     {
3491       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3492                                  - vec_inside_cost * peel_iters_prologue
3493                                  - vec_inside_cost * peel_iters_epilogue)
3494                                  / ((scalar_single_iter_cost * vf)
3495                                    - vec_inside_cost);
3496     }
3497   min_profitable_estimate --;
3498   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3499   if (dump_enabled_p ())
3500     dump_printf_loc (MSG_NOTE, vect_location,
3501                      "  Static estimate profitability threshold = %d\n",
3502                      min_profitable_estimate);
3503
3504   *ret_min_profitable_estimate = min_profitable_estimate;
3505 }
3506
3507 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3508    vector elements (not bits) for a vector of mode MODE.  */
3509 static void
3510 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3511                               unsigned char *sel)
3512 {
3513   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3514
3515   for (i = 0; i < nelt; i++)
3516     sel[i] = (i + offset) & (2*nelt - 1);
3517 }
3518
3519 /* Checks whether the target supports whole-vector shifts for vectors of mode
3520    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3521    it supports vec_perm_const with masks for all necessary shift amounts.  */
3522 static bool
3523 have_whole_vector_shift (enum machine_mode mode)
3524 {
3525   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3526     return true;
3527
3528   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3529     return false;
3530
3531   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3532   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3533
3534   for (i = nelt/2; i >= 1; i/=2)
3535     {
3536       calc_vec_perm_mask_for_shift (mode, i, sel);
3537       if (!can_vec_perm_p (mode, false, sel))
3538         return false;
3539     }
3540   return true;
3541 }
3542
3543 /* Return the reduction operand (with index REDUC_INDEX) of STMT.  */
3544
3545 static tree
3546 get_reduction_op (gimple *stmt, int reduc_index)
3547 {
3548   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3549     {
3550     case GIMPLE_SINGLE_RHS:
3551       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3552                   == ternary_op);
3553       return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3554     case GIMPLE_UNARY_RHS:
3555       return gimple_assign_rhs1 (stmt);
3556     case GIMPLE_BINARY_RHS:
3557       return (reduc_index
3558               ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3559     case GIMPLE_TERNARY_RHS:
3560       return gimple_op (stmt, reduc_index + 1);
3561     default:
3562       gcc_unreachable ();
3563     }
3564 }
3565
3566 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3567    functions. Design better to avoid maintenance issues.  */
3568
3569 /* Function vect_model_reduction_cost.
3570
3571    Models cost for a reduction operation, including the vector ops
3572    generated within the strip-mine loop, the initial definition before
3573    the loop, and the epilogue code that must be generated.  */
3574
3575 static bool
3576 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3577                            int ncopies, int reduc_index)
3578 {
3579   int prologue_cost = 0, epilogue_cost = 0;
3580   enum tree_code code;
3581   optab optab;
3582   tree vectype;
3583   gimple *stmt, *orig_stmt;
3584   tree reduction_op;
3585   machine_mode mode;
3586   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3587   struct loop *loop = NULL;
3588   void *target_cost_data;
3589
3590   if (loop_vinfo)
3591     {
3592       loop = LOOP_VINFO_LOOP (loop_vinfo);
3593       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3594     }
3595   else
3596     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3597
3598   /* Condition reductions generate two reductions in the loop.  */
3599   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3600     ncopies *= 2;
3601
3602   /* Cost of reduction op inside loop.  */
3603   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3604                                         stmt_info, 0, vect_body);
3605   stmt = STMT_VINFO_STMT (stmt_info);
3606
3607   reduction_op = get_reduction_op (stmt, reduc_index);
3608
3609   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3610   if (!vectype)
3611     {
3612       if (dump_enabled_p ())
3613         {
3614           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3615                            "unsupported data-type ");
3616           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3617                              TREE_TYPE (reduction_op));
3618           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3619         }
3620       return false;
3621    }
3622
3623   mode = TYPE_MODE (vectype);
3624   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3625
3626   if (!orig_stmt)
3627     orig_stmt = STMT_VINFO_STMT (stmt_info);
3628
3629   code = gimple_assign_rhs_code (orig_stmt);
3630
3631   /* Add in cost for initial definition.
3632      For cond reduction we have four vectors: initial index, step, initial
3633      result of the data reduction, initial value of the index reduction.  */
3634   int prologue_stmts = STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
3635                        == COND_REDUCTION ? 4 : 1;
3636   prologue_cost += add_stmt_cost (target_cost_data, prologue_stmts,
3637                                   scalar_to_vec, stmt_info, 0,
3638                                   vect_prologue);
3639
3640   /* Determine cost of epilogue code.
3641
3642      We have a reduction operator that will reduce the vector in one statement.
3643      Also requires scalar extract.  */
3644
3645   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3646     {
3647       if (reduc_code != ERROR_MARK)
3648         {
3649           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
3650             {
3651               /* An EQ stmt and an COND_EXPR stmt.  */
3652               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3653                                               vector_stmt, stmt_info, 0,
3654                                               vect_epilogue);
3655               /* Reduction of the max index and a reduction of the found
3656                  values.  */
3657               epilogue_cost += add_stmt_cost (target_cost_data, 2,
3658                                               vec_to_scalar, stmt_info, 0,
3659                                               vect_epilogue);
3660               /* A broadcast of the max value.  */
3661               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3662                                               scalar_to_vec, stmt_info, 0,
3663                                               vect_epilogue);
3664             }
3665           else
3666             {
3667               epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3668                                               stmt_info, 0, vect_epilogue);
3669               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3670                                               vec_to_scalar, stmt_info, 0,
3671                                               vect_epilogue);
3672             }
3673         }
3674       else
3675         {
3676           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3677           tree bitsize =
3678             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3679           int element_bitsize = tree_to_uhwi (bitsize);
3680           int nelements = vec_size_in_bits / element_bitsize;
3681
3682           optab = optab_for_tree_code (code, vectype, optab_default);
3683
3684           /* We have a whole vector shift available.  */
3685           if (VECTOR_MODE_P (mode)
3686               && optab_handler (optab, mode) != CODE_FOR_nothing
3687               && have_whole_vector_shift (mode))
3688             {
3689               /* Final reduction via vector shifts and the reduction operator.
3690                  Also requires scalar extract.  */
3691               epilogue_cost += add_stmt_cost (target_cost_data,
3692                                               exact_log2 (nelements) * 2,
3693                                               vector_stmt, stmt_info, 0,
3694                                               vect_epilogue);
3695               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3696                                               vec_to_scalar, stmt_info, 0,
3697                                               vect_epilogue);
3698             }
3699           else
3700             /* Use extracts and reduction op for final reduction.  For N
3701                elements, we have N extracts and N-1 reduction ops.  */
3702             epilogue_cost += add_stmt_cost (target_cost_data,
3703                                             nelements + nelements - 1,
3704                                             vector_stmt, stmt_info, 0,
3705                                             vect_epilogue);
3706         }
3707     }
3708
3709   if (dump_enabled_p ())
3710     dump_printf (MSG_NOTE,
3711                  "vect_model_reduction_cost: inside_cost = %d, "
3712                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3713                  prologue_cost, epilogue_cost);
3714
3715   return true;
3716 }
3717
3718
3719 /* Function vect_model_induction_cost.
3720
3721    Models cost for induction operations.  */
3722
3723 static void
3724 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3725 {
3726   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3727   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3728   unsigned inside_cost, prologue_cost;
3729
3730   /* loop cost for vec_loop.  */
3731   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3732                                stmt_info, 0, vect_body);
3733
3734   /* prologue cost for vec_init and vec_step.  */
3735   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3736                                  stmt_info, 0, vect_prologue);
3737
3738   if (dump_enabled_p ())
3739     dump_printf_loc (MSG_NOTE, vect_location,
3740                      "vect_model_induction_cost: inside_cost = %d, "
3741                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3742 }
3743
3744
3745 /* Function get_initial_def_for_induction
3746
3747    Input:
3748    STMT - a stmt that performs an induction operation in the loop.
3749    IV_PHI - the initial value of the induction variable
3750
3751    Output:
3752    Return a vector variable, initialized with the first VF values of
3753    the induction variable.  E.g., for an iv with IV_PHI='X' and
3754    evolution S, for a vector of 4 units, we want to return:
3755    [X, X + S, X + 2*S, X + 3*S].  */
3756
3757 static tree
3758 get_initial_def_for_induction (gimple *iv_phi)
3759 {
3760   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3761   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3762   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3763   tree vectype;
3764   int nunits;
3765   edge pe = loop_preheader_edge (loop);
3766   struct loop *iv_loop;
3767   basic_block new_bb;
3768   tree new_vec, vec_init, vec_step, t;
3769   tree new_name;
3770   gimple *new_stmt;
3771   gphi *induction_phi;
3772   tree induc_def, vec_def, vec_dest;
3773   tree init_expr, step_expr;
3774   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3775   int i;
3776   int ncopies;
3777   tree expr;
3778   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3779   bool nested_in_vect_loop = false;
3780   gimple_seq stmts;
3781   imm_use_iterator imm_iter;
3782   use_operand_p use_p;
3783   gimple *exit_phi;
3784   edge latch_e;
3785   tree loop_arg;
3786   gimple_stmt_iterator si;
3787   basic_block bb = gimple_bb (iv_phi);
3788   tree stepvectype;
3789   tree resvectype;
3790
3791   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3792   if (nested_in_vect_loop_p (loop, iv_phi))
3793     {
3794       nested_in_vect_loop = true;
3795       iv_loop = loop->inner;
3796     }
3797   else
3798     iv_loop = loop;
3799   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3800
3801   latch_e = loop_latch_edge (iv_loop);
3802   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3803
3804   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3805   gcc_assert (step_expr != NULL_TREE);
3806
3807   pe = loop_preheader_edge (iv_loop);
3808   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3809                                      loop_preheader_edge (iv_loop));
3810
3811   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3812   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3813   gcc_assert (vectype);
3814   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3815   ncopies = vf / nunits;
3816
3817   gcc_assert (phi_info);
3818   gcc_assert (ncopies >= 1);
3819
3820   /* Convert the step to the desired type.  */
3821   stmts = NULL;
3822   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3823   if (stmts)
3824     {
3825       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3826       gcc_assert (!new_bb);
3827     }
3828
3829   /* Find the first insertion point in the BB.  */
3830   si = gsi_after_labels (bb);
3831
3832   /* Create the vector that holds the initial_value of the induction.  */
3833   if (nested_in_vect_loop)
3834     {
3835       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3836          been created during vectorization of previous stmts.  We obtain it
3837          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3838       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3839       /* If the initial value is not of proper type, convert it.  */
3840       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3841         {
3842           new_stmt
3843             = gimple_build_assign (vect_get_new_ssa_name (vectype,
3844                                                           vect_simple_var,
3845                                                           "vec_iv_"),
3846                                    VIEW_CONVERT_EXPR,
3847                                    build1 (VIEW_CONVERT_EXPR, vectype,
3848                                            vec_init));
3849           vec_init = gimple_assign_lhs (new_stmt);
3850           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3851                                                  new_stmt);
3852           gcc_assert (!new_bb);
3853           set_vinfo_for_stmt (new_stmt,
3854                               new_stmt_vec_info (new_stmt, loop_vinfo));
3855         }
3856     }
3857   else
3858     {
3859       vec<constructor_elt, va_gc> *v;
3860
3861       /* iv_loop is the loop to be vectorized. Create:
3862          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3863       stmts = NULL;
3864       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3865
3866       vec_alloc (v, nunits);
3867       bool constant_p = is_gimple_min_invariant (new_name);
3868       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3869       for (i = 1; i < nunits; i++)
3870         {
3871           /* Create: new_name_i = new_name + step_expr  */
3872           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3873                                    new_name, step_expr);
3874           if (!is_gimple_min_invariant (new_name))
3875             constant_p = false;
3876           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3877         }
3878       if (stmts)
3879         {
3880           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3881           gcc_assert (!new_bb);
3882         }
3883
3884       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3885       if (constant_p)
3886         new_vec = build_vector_from_ctor (vectype, v);
3887       else
3888         new_vec = build_constructor (vectype, v);
3889       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3890     }
3891
3892
3893   /* Create the vector that holds the step of the induction.  */
3894   if (nested_in_vect_loop)
3895     /* iv_loop is nested in the loop to be vectorized. Generate:
3896        vec_step = [S, S, S, S]  */
3897     new_name = step_expr;
3898   else
3899     {
3900       /* iv_loop is the loop to be vectorized. Generate:
3901           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3902       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3903         {
3904           expr = build_int_cst (integer_type_node, vf);
3905           expr = fold_convert (TREE_TYPE (step_expr), expr);
3906         }
3907       else
3908         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3909       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3910                               expr, step_expr);
3911       if (TREE_CODE (step_expr) == SSA_NAME)
3912         new_name = vect_init_vector (iv_phi, new_name,
3913                                      TREE_TYPE (step_expr), NULL);
3914     }
3915
3916   t = unshare_expr (new_name);
3917   gcc_assert (CONSTANT_CLASS_P (new_name)
3918               || TREE_CODE (new_name) == SSA_NAME);
3919   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3920   gcc_assert (stepvectype);
3921   new_vec = build_vector_from_val (stepvectype, t);
3922   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3923
3924
3925   /* Create the following def-use cycle:
3926      loop prolog:
3927          vec_init = ...
3928          vec_step = ...
3929      loop:
3930          vec_iv = PHI <vec_init, vec_loop>
3931          ...
3932          STMT
3933          ...
3934          vec_loop = vec_iv + vec_step;  */
3935
3936   /* Create the induction-phi that defines the induction-operand.  */
3937   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3938   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3939   set_vinfo_for_stmt (induction_phi,
3940                       new_stmt_vec_info (induction_phi, loop_vinfo));
3941   induc_def = PHI_RESULT (induction_phi);
3942
3943   /* Create the iv update inside the loop  */
3944   new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3945   vec_def = make_ssa_name (vec_dest, new_stmt);
3946   gimple_assign_set_lhs (new_stmt, vec_def);
3947   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3948   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3949
3950   /* Set the arguments of the phi node:  */
3951   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3952   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3953                UNKNOWN_LOCATION);
3954
3955
3956   /* In case that vectorization factor (VF) is bigger than the number
3957      of elements that we can fit in a vectype (nunits), we have to generate
3958      more than one vector stmt - i.e - we need to "unroll" the
3959      vector stmt by a factor VF/nunits.  For more details see documentation
3960      in vectorizable_operation.  */
3961
3962   if (ncopies > 1)
3963     {
3964       stmt_vec_info prev_stmt_vinfo;
3965       /* FORNOW. This restriction should be relaxed.  */
3966       gcc_assert (!nested_in_vect_loop);
3967
3968       /* Create the vector that holds the step of the induction.  */
3969       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3970         {
3971           expr = build_int_cst (integer_type_node, nunits);
3972           expr = fold_convert (TREE_TYPE (step_expr), expr);
3973         }
3974       else
3975         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3976       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3977                               expr, step_expr);
3978       if (TREE_CODE (step_expr) == SSA_NAME)
3979         new_name = vect_init_vector (iv_phi, new_name,
3980                                      TREE_TYPE (step_expr), NULL);
3981       t = unshare_expr (new_name);
3982       gcc_assert (CONSTANT_CLASS_P (new_name)
3983                   || TREE_CODE (new_name) == SSA_NAME);
3984       new_vec = build_vector_from_val (stepvectype, t);
3985       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3986
3987       vec_def = induc_def;
3988       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3989       for (i = 1; i < ncopies; i++)
3990         {
3991           /* vec_i = vec_prev + vec_step  */
3992           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3993                                           vec_def, vec_step);
3994           vec_def = make_ssa_name (vec_dest, new_stmt);
3995           gimple_assign_set_lhs (new_stmt, vec_def);
3996
3997           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3998           if (!useless_type_conversion_p (resvectype, vectype))
3999             {
4000               new_stmt
4001                 = gimple_build_assign
4002                         (vect_get_new_vect_var (resvectype, vect_simple_var,
4003                                                 "vec_iv_"),
4004                          VIEW_CONVERT_EXPR,
4005                          build1 (VIEW_CONVERT_EXPR, resvectype,
4006                                  gimple_assign_lhs (new_stmt)));
4007               gimple_assign_set_lhs (new_stmt,
4008                                      make_ssa_name
4009                                        (gimple_assign_lhs (new_stmt), new_stmt));
4010               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
4011             }
4012           set_vinfo_for_stmt (new_stmt,
4013                               new_stmt_vec_info (new_stmt, loop_vinfo));
4014           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
4015           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
4016         }
4017     }
4018
4019   if (nested_in_vect_loop)
4020     {
4021       /* Find the loop-closed exit-phi of the induction, and record
4022          the final vector of induction results:  */
4023       exit_phi = NULL;
4024       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
4025         {
4026           gimple *use_stmt = USE_STMT (use_p);
4027           if (is_gimple_debug (use_stmt))
4028             continue;
4029
4030           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
4031             {
4032               exit_phi = use_stmt;
4033               break;
4034             }
4035         }
4036       if (exit_phi)
4037         {
4038           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
4039           /* FORNOW. Currently not supporting the case that an inner-loop induction
4040              is not used in the outer-loop (i.e. only outside the outer-loop).  */
4041           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
4042                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
4043
4044           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
4045           if (dump_enabled_p ())
4046             {
4047               dump_printf_loc (MSG_NOTE, vect_location,
4048                                "vector of inductions after inner-loop:");
4049               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
4050             }
4051         }
4052     }
4053
4054
4055   if (dump_enabled_p ())
4056     {
4057       dump_printf_loc (MSG_NOTE, vect_location,
4058                        "transform induction: created def-use cycle: ");
4059       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
4060       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
4061                         SSA_NAME_DEF_STMT (vec_def), 0);
4062     }
4063
4064   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
4065   if (!useless_type_conversion_p (resvectype, vectype))
4066     {
4067       new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
4068                                                              vect_simple_var,
4069                                                              "vec_iv_"),
4070                                       VIEW_CONVERT_EXPR,
4071                                       build1 (VIEW_CONVERT_EXPR, resvectype,
4072                                               induc_def));
4073       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
4074       gimple_assign_set_lhs (new_stmt, induc_def);
4075       si = gsi_after_labels (bb);
4076       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
4077       set_vinfo_for_stmt (new_stmt,
4078                           new_stmt_vec_info (new_stmt, loop_vinfo));
4079       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
4080         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
4081     }
4082
4083   return induc_def;
4084 }
4085
4086
4087 /* Function get_initial_def_for_reduction
4088
4089    Input:
4090    STMT - a stmt that performs a reduction operation in the loop.
4091    INIT_VAL - the initial value of the reduction variable
4092
4093    Output:
4094    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
4095         of the reduction (used for adjusting the epilog - see below).
4096    Return a vector variable, initialized according to the operation that STMT
4097         performs. This vector will be used as the initial value of the
4098         vector of partial results.
4099
4100    Option1 (adjust in epilog): Initialize the vector as follows:
4101      add/bit or/xor:    [0,0,...,0,0]
4102      mult/bit and:      [1,1,...,1,1]
4103      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
4104    and when necessary (e.g. add/mult case) let the caller know
4105    that it needs to adjust the result by init_val.
4106
4107    Option2: Initialize the vector as follows:
4108      add/bit or/xor:    [init_val,0,0,...,0]
4109      mult/bit and:      [init_val,1,1,...,1]
4110      min/max/cond_expr: [init_val,init_val,...,init_val]
4111    and no adjustments are needed.
4112
4113    For example, for the following code:
4114
4115    s = init_val;
4116    for (i=0;i<n;i++)
4117      s = s + a[i];
4118
4119    STMT is 's = s + a[i]', and the reduction variable is 's'.
4120    For a vector of 4 units, we want to return either [0,0,0,init_val],
4121    or [0,0,0,0] and let the caller know that it needs to adjust
4122    the result at the end by 'init_val'.
4123
4124    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
4125    initialization vector is simpler (same element in all entries), if
4126    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
4127
4128    A cost model should help decide between these two schemes.  */
4129
4130 tree
4131 get_initial_def_for_reduction (gimple *stmt, tree init_val,
4132                                tree *adjustment_def)
4133 {
4134   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
4135   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
4136   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4137   tree scalar_type = TREE_TYPE (init_val);
4138   tree vectype = get_vectype_for_scalar_type (scalar_type);
4139   int nunits;
4140   enum tree_code code = gimple_assign_rhs_code (stmt);
4141   tree def_for_init;
4142   tree init_def;
4143   tree *elts;
4144   int i;
4145   bool nested_in_vect_loop = false;
4146   REAL_VALUE_TYPE real_init_val = dconst0;
4147   int int_init_val = 0;
4148   gimple *def_stmt = NULL;
4149   gimple_seq stmts = NULL;
4150
4151   gcc_assert (vectype);
4152   nunits = TYPE_VECTOR_SUBPARTS (vectype);
4153
4154   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
4155               || SCALAR_FLOAT_TYPE_P (scalar_type));
4156
4157   if (nested_in_vect_loop_p (loop, stmt))
4158     nested_in_vect_loop = true;
4159   else
4160     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
4161
4162   /* In case of double reduction we only create a vector variable to be put
4163      in the reduction phi node.  The actual statement creation is done in
4164      vect_create_epilog_for_reduction.  */
4165   if (adjustment_def && nested_in_vect_loop
4166       && TREE_CODE (init_val) == SSA_NAME
4167       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
4168       && gimple_code (def_stmt) == GIMPLE_PHI
4169       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
4170       && vinfo_for_stmt (def_stmt)
4171       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
4172           == vect_double_reduction_def)
4173     {
4174       *adjustment_def = NULL;
4175       return vect_create_destination_var (init_val, vectype);
4176     }
4177
4178   /* In case of a nested reduction do not use an adjustment def as
4179      that case is not supported by the epilogue generation correctly
4180      if ncopies is not one.  */
4181   if (adjustment_def && nested_in_vect_loop)
4182     {
4183       *adjustment_def = NULL;
4184       return vect_get_vec_def_for_operand (init_val, stmt);
4185     }
4186
4187   switch (code)
4188     {
4189       case WIDEN_SUM_EXPR:
4190       case DOT_PROD_EXPR:
4191       case SAD_EXPR:
4192       case PLUS_EXPR:
4193       case MINUS_EXPR:
4194       case BIT_IOR_EXPR:
4195       case BIT_XOR_EXPR:
4196       case MULT_EXPR:
4197       case BIT_AND_EXPR:
4198         /* ADJUSMENT_DEF is NULL when called from
4199            vect_create_epilog_for_reduction to vectorize double reduction.  */
4200         if (adjustment_def)
4201           *adjustment_def = init_val;
4202
4203         if (code == MULT_EXPR)
4204           {
4205             real_init_val = dconst1;
4206             int_init_val = 1;
4207           }
4208
4209         if (code == BIT_AND_EXPR)
4210           int_init_val = -1;
4211
4212         if (SCALAR_FLOAT_TYPE_P (scalar_type))
4213           def_for_init = build_real (scalar_type, real_init_val);
4214         else
4215           def_for_init = build_int_cst (scalar_type, int_init_val);
4216
4217         /* Create a vector of '0' or '1' except the first element.  */
4218         elts = XALLOCAVEC (tree, nunits);
4219         for (i = nunits - 2; i >= 0; --i)
4220           elts[i + 1] = def_for_init;
4221
4222         /* Option1: the first element is '0' or '1' as well.  */
4223         if (adjustment_def)
4224           {
4225             elts[0] = def_for_init;
4226             init_def = build_vector (vectype, elts);
4227             break;
4228           }
4229
4230         /* Option2: the first element is INIT_VAL.  */
4231         elts[0] = init_val;
4232         if (TREE_CONSTANT (init_val))
4233           init_def = build_vector (vectype, elts);
4234         else
4235           {
4236             vec<constructor_elt, va_gc> *v;
4237             vec_alloc (v, nunits);
4238             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
4239             for (i = 1; i < nunits; ++i)
4240               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
4241             init_def = build_constructor (vectype, v);
4242           }
4243
4244         break;
4245
4246       case MIN_EXPR:
4247       case MAX_EXPR:
4248       case COND_EXPR:
4249         if (adjustment_def)
4250           {
4251             *adjustment_def = NULL_TREE;
4252             if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_vinfo) != COND_REDUCTION)
4253               {
4254                 init_def = vect_get_vec_def_for_operand (init_val, stmt);
4255                 break;
4256               }
4257           }
4258         init_val = gimple_convert (&stmts, TREE_TYPE (vectype), init_val);
4259         if (! gimple_seq_empty_p (stmts))
4260           gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4261         init_def = build_vector_from_val (vectype, init_val);
4262         break;
4263
4264       default:
4265         gcc_unreachable ();
4266     }
4267
4268   return init_def;
4269 }
4270
4271 /* Function vect_create_epilog_for_reduction
4272
4273    Create code at the loop-epilog to finalize the result of a reduction
4274    computation.
4275
4276    VECT_DEFS is a list of vectors of partial results, i.e., the lhs's of vector
4277      reduction statements.
4278    STMT is the scalar reduction stmt that is being vectorized.
4279    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
4280      number of elements that we can fit in a vectype (nunits).  In this case
4281      we have to generate more than one vector stmt - i.e - we need to "unroll"
4282      the vector stmt by a factor VF/nunits.  For more details see documentation
4283      in vectorizable_operation.
4284    REDUC_CODE is the tree-code for the epilog reduction.
4285    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
4286      computation.
4287    REDUC_INDEX is the index of the operand in the right hand side of the
4288      statement that is defined by REDUCTION_PHI.
4289    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
4290    SLP_NODE is an SLP node containing a group of reduction statements. The
4291      first one in this group is STMT.
4292    INDUCTION_INDEX is the index of the loop for condition reductions.
4293      Otherwise it is undefined.
4294
4295    This function:
4296    1. Creates the reduction def-use cycles: sets the arguments for
4297       REDUCTION_PHIS:
4298       The loop-entry argument is the vectorized initial-value of the reduction.
4299       The loop-latch argument is taken from VECT_DEFS - the vector of partial
4300       sums.
4301    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
4302       by applying the operation specified by REDUC_CODE if available, or by
4303       other means (whole-vector shifts or a scalar loop).
4304       The function also creates a new phi node at the loop exit to preserve
4305       loop-closed form, as illustrated below.
4306
4307      The flow at the entry to this function:
4308
4309         loop:
4310           vec_def = phi <null, null>            # REDUCTION_PHI
4311           VECT_DEF = vector_stmt                # vectorized form of STMT
4312           s_loop = scalar_stmt                  # (scalar) STMT
4313         loop_exit:
4314           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4315           use <s_out0>
4316           use <s_out0>
4317
4318      The above is transformed by this function into:
4319
4320         loop:
4321           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4322           VECT_DEF = vector_stmt                # vectorized form of STMT
4323           s_loop = scalar_stmt                  # (scalar) STMT
4324         loop_exit:
4325           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4326           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4327           v_out2 = reduce <v_out1>
4328           s_out3 = extract_field <v_out2, 0>
4329           s_out4 = adjust_result <s_out3>
4330           use <s_out4>
4331           use <s_out4>
4332 */
4333
4334 static void
4335 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
4336                                   int ncopies, enum tree_code reduc_code,
4337                                   vec<gimple *> reduction_phis,
4338                                   int reduc_index, bool double_reduc,
4339                                   slp_tree slp_node, tree induction_index)
4340 {
4341   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4342   stmt_vec_info prev_phi_info;
4343   tree vectype;
4344   machine_mode mode;
4345   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4346   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4347   basic_block exit_bb;
4348   tree scalar_dest;
4349   tree scalar_type;
4350   gimple *new_phi = NULL, *phi;
4351   gimple_stmt_iterator exit_gsi;
4352   tree vec_dest;
4353   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4354   gimple *epilog_stmt = NULL;
4355   enum tree_code code = gimple_assign_rhs_code (stmt);
4356   gimple *exit_phi;
4357   tree bitsize;
4358   tree adjustment_def = NULL;
4359   tree vec_initial_def = NULL;
4360   tree reduction_op, expr, def, initial_def = NULL;
4361   tree orig_name, scalar_result;
4362   imm_use_iterator imm_iter, phi_imm_iter;
4363   use_operand_p use_p, phi_use_p;
4364   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
4365   bool nested_in_vect_loop = false;
4366   auto_vec<gimple *> new_phis;
4367   auto_vec<gimple *> inner_phis;
4368   enum vect_def_type dt = vect_unknown_def_type;
4369   int j, i;
4370   auto_vec<tree> scalar_results;
4371   unsigned int group_size = 1, k, ratio;
4372   auto_vec<tree> vec_initial_defs;
4373   auto_vec<gimple *> phis;
4374   bool slp_reduc = false;
4375   tree new_phi_result;
4376   gimple *inner_phi = NULL;
4377
4378   if (slp_node)
4379     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4380
4381   if (nested_in_vect_loop_p (loop, stmt))
4382     {
4383       outer_loop = loop;
4384       loop = loop->inner;
4385       nested_in_vect_loop = true;
4386       gcc_assert (!slp_node);
4387     }
4388
4389   reduction_op = get_reduction_op (stmt, reduc_index);
4390
4391   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4392   gcc_assert (vectype);
4393   mode = TYPE_MODE (vectype);
4394
4395   /* 1. Create the reduction def-use cycle:
4396      Set the arguments of REDUCTION_PHIS, i.e., transform
4397
4398         loop:
4399           vec_def = phi <null, null>            # REDUCTION_PHI
4400           VECT_DEF = vector_stmt                # vectorized form of STMT
4401           ...
4402
4403      into:
4404
4405         loop:
4406           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4407           VECT_DEF = vector_stmt                # vectorized form of STMT
4408           ...
4409
4410      (in case of SLP, do it for all the phis). */
4411
4412   /* Get the loop-entry arguments.  */
4413   enum vect_def_type initial_def_dt = vect_unknown_def_type;
4414   if (slp_node)
4415     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4416                        NULL, slp_node, reduc_index);
4417   else
4418     {
4419       /* Get at the scalar def before the loop, that defines the initial value
4420          of the reduction variable.  */
4421       gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
4422       initial_def = PHI_ARG_DEF_FROM_EDGE (def_stmt,
4423                                            loop_preheader_edge (loop));
4424       vect_is_simple_use (initial_def, loop_vinfo, &def_stmt, &initial_def_dt);
4425       vec_initial_def = get_initial_def_for_reduction (stmt, initial_def,
4426                                                        &adjustment_def);
4427       vec_initial_defs.create (1);
4428       vec_initial_defs.quick_push (vec_initial_def);
4429     }
4430
4431   /* Set phi nodes arguments.  */
4432   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4433     {
4434       tree vec_init_def, def;
4435       gimple_seq stmts;
4436       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4437                                            true, NULL_TREE);
4438       if (stmts)
4439         gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4440
4441       def = vect_defs[i];
4442       for (j = 0; j < ncopies; j++)
4443         {
4444           if (j != 0)
4445             {
4446               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4447               if (nested_in_vect_loop)
4448                 vec_init_def
4449                   = vect_get_vec_def_for_stmt_copy (initial_def_dt,
4450                                                     vec_init_def);
4451             }
4452
4453           /* Set the loop-entry arg of the reduction-phi.  */
4454
4455           if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4456               == INTEGER_INDUC_COND_REDUCTION)
4457             {
4458               /* Initialise the reduction phi to zero.  This prevents initial
4459                  values of non-zero interferring with the reduction op.  */
4460               gcc_assert (ncopies == 1);
4461               gcc_assert (i == 0);
4462
4463               tree vec_init_def_type = TREE_TYPE (vec_init_def);
4464               tree zero_vec = build_zero_cst (vec_init_def_type);
4465
4466               add_phi_arg (as_a <gphi *> (phi), zero_vec,
4467                            loop_preheader_edge (loop), UNKNOWN_LOCATION);
4468             }
4469           else
4470             add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4471                          loop_preheader_edge (loop), UNKNOWN_LOCATION);
4472
4473           /* Set the loop-latch arg for the reduction-phi.  */
4474           if (j > 0)
4475             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4476
4477           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4478                        UNKNOWN_LOCATION);
4479
4480           if (dump_enabled_p ())
4481             {
4482               dump_printf_loc (MSG_NOTE, vect_location,
4483                                "transform reduction: created def-use cycle: ");
4484               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4485               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4486             }
4487         }
4488     }
4489
4490   /* 2. Create epilog code.
4491         The reduction epilog code operates across the elements of the vector
4492         of partial results computed by the vectorized loop.
4493         The reduction epilog code consists of:
4494
4495         step 1: compute the scalar result in a vector (v_out2)
4496         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4497         step 3: adjust the scalar result (s_out3) if needed.
4498
4499         Step 1 can be accomplished using one the following three schemes:
4500           (scheme 1) using reduc_code, if available.
4501           (scheme 2) using whole-vector shifts, if available.
4502           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4503                      combined.
4504
4505           The overall epilog code looks like this:
4506
4507           s_out0 = phi <s_loop>         # original EXIT_PHI
4508           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4509           v_out2 = reduce <v_out1>              # step 1
4510           s_out3 = extract_field <v_out2, 0>    # step 2
4511           s_out4 = adjust_result <s_out3>       # step 3
4512
4513           (step 3 is optional, and steps 1 and 2 may be combined).
4514           Lastly, the uses of s_out0 are replaced by s_out4.  */
4515
4516
4517   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4518          v_out1 = phi <VECT_DEF>
4519          Store them in NEW_PHIS.  */
4520
4521   exit_bb = single_exit (loop)->dest;
4522   prev_phi_info = NULL;
4523   new_phis.create (vect_defs.length ());
4524   FOR_EACH_VEC_ELT (vect_defs, i, def)
4525     {
4526       for (j = 0; j < ncopies; j++)
4527         {
4528           tree new_def = copy_ssa_name (def);
4529           phi = create_phi_node (new_def, exit_bb);
4530           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4531           if (j == 0)
4532             new_phis.quick_push (phi);
4533           else
4534             {
4535               def = vect_get_vec_def_for_stmt_copy (dt, def);
4536               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4537             }
4538
4539           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4540           prev_phi_info = vinfo_for_stmt (phi);
4541         }
4542     }
4543
4544   /* The epilogue is created for the outer-loop, i.e., for the loop being
4545      vectorized.  Create exit phis for the outer loop.  */
4546   if (double_reduc)
4547     {
4548       loop = outer_loop;
4549       exit_bb = single_exit (loop)->dest;
4550       inner_phis.create (vect_defs.length ());
4551       FOR_EACH_VEC_ELT (new_phis, i, phi)
4552         {
4553           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4554           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4555           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4556                            PHI_RESULT (phi));
4557           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4558                                                             loop_vinfo));
4559           inner_phis.quick_push (phi);
4560           new_phis[i] = outer_phi;
4561           prev_phi_info = vinfo_for_stmt (outer_phi);
4562           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4563             {
4564               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4565               new_result = copy_ssa_name (PHI_RESULT (phi));
4566               outer_phi = create_phi_node (new_result, exit_bb);
4567               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4568                                PHI_RESULT (phi));
4569               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4570                                                                 loop_vinfo));
4571               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4572               prev_phi_info = vinfo_for_stmt (outer_phi);
4573             }
4574         }
4575     }
4576
4577   exit_gsi = gsi_after_labels (exit_bb);
4578
4579   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4580          (i.e. when reduc_code is not available) and in the final adjustment
4581          code (if needed).  Also get the original scalar reduction variable as
4582          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4583          represents a reduction pattern), the tree-code and scalar-def are
4584          taken from the original stmt that the pattern-stmt (STMT) replaces.
4585          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4586          are taken from STMT.  */
4587
4588   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4589   if (!orig_stmt)
4590     {
4591       /* Regular reduction  */
4592       orig_stmt = stmt;
4593     }
4594   else
4595     {
4596       /* Reduction pattern  */
4597       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4598       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4599       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4600     }
4601
4602   code = gimple_assign_rhs_code (orig_stmt);
4603   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4604      partial results are added and not subtracted.  */
4605   if (code == MINUS_EXPR)
4606     code = PLUS_EXPR;
4607
4608   scalar_dest = gimple_assign_lhs (orig_stmt);
4609   scalar_type = TREE_TYPE (scalar_dest);
4610   scalar_results.create (group_size);
4611   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4612   bitsize = TYPE_SIZE (scalar_type);
4613
4614   /* In case this is a reduction in an inner-loop while vectorizing an outer
4615      loop - we don't need to extract a single scalar result at the end of the
4616      inner-loop (unless it is double reduction, i.e., the use of reduction is
4617      outside the outer-loop).  The final vector of partial results will be used
4618      in the vectorized outer-loop, or reduced to a scalar result at the end of
4619      the outer-loop.  */
4620   if (nested_in_vect_loop && !double_reduc)
4621     goto vect_finalize_reduction;
4622
4623   /* SLP reduction without reduction chain, e.g.,
4624      # a1 = phi <a2, a0>
4625      # b1 = phi <b2, b0>
4626      a2 = operation (a1)
4627      b2 = operation (b1)  */
4628   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4629
4630   /* In case of reduction chain, e.g.,
4631      # a1 = phi <a3, a0>
4632      a2 = operation (a1)
4633      a3 = operation (a2),
4634
4635      we may end up with more than one vector result.  Here we reduce them to
4636      one vector.  */
4637   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4638     {
4639       tree first_vect = PHI_RESULT (new_phis[0]);
4640       tree tmp;
4641       gassign *new_vec_stmt = NULL;
4642
4643       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4644       for (k = 1; k < new_phis.length (); k++)
4645         {
4646           gimple *next_phi = new_phis[k];
4647           tree second_vect = PHI_RESULT (next_phi);
4648
4649           tmp = build2 (code, vectype,  first_vect, second_vect);
4650           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4651           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4652           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4653           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4654         }
4655
4656       new_phi_result = first_vect;
4657       if (new_vec_stmt)
4658         {
4659           new_phis.truncate (0);
4660           new_phis.safe_push (new_vec_stmt);
4661         }
4662     }
4663   else
4664     new_phi_result = PHI_RESULT (new_phis[0]);
4665
4666   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
4667     {
4668       /* For condition reductions, we have a vector (NEW_PHI_RESULT) containing
4669          various data values where the condition matched and another vector
4670          (INDUCTION_INDEX) containing all the indexes of those matches.  We
4671          need to extract the last matching index (which will be the index with
4672          highest value) and use this to index into the data vector.
4673          For the case where there were no matches, the data vector will contain
4674          all default values and the index vector will be all zeros.  */
4675
4676       /* Get various versions of the type of the vector of indexes.  */
4677       tree index_vec_type = TREE_TYPE (induction_index);
4678       gcc_checking_assert (TYPE_UNSIGNED (index_vec_type));
4679       tree index_scalar_type = TREE_TYPE (index_vec_type);
4680       tree index_vec_cmp_type = build_same_sized_truth_vector_type
4681         (index_vec_type);
4682
4683       /* Get an unsigned integer version of the type of the data vector.  */
4684       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
4685       tree scalar_type_unsigned = make_unsigned_type (scalar_precision);
4686       tree vectype_unsigned = build_vector_type
4687         (scalar_type_unsigned, TYPE_VECTOR_SUBPARTS (vectype));
4688
4689       /* First we need to create a vector (ZERO_VEC) of zeros and another
4690          vector (MAX_INDEX_VEC) filled with the last matching index, which we
4691          can create using a MAX reduction and then expanding.
4692          In the case where the loop never made any matches, the max index will
4693          be zero.  */
4694
4695       /* Vector of {0, 0, 0,...}.  */
4696       tree zero_vec = make_ssa_name (vectype);
4697       tree zero_vec_rhs = build_zero_cst (vectype);
4698       gimple *zero_vec_stmt = gimple_build_assign (zero_vec, zero_vec_rhs);
4699       gsi_insert_before (&exit_gsi, zero_vec_stmt, GSI_SAME_STMT);
4700
4701       /* Find maximum value from the vector of found indexes.  */
4702       tree max_index = make_ssa_name (index_scalar_type);
4703       gimple *max_index_stmt = gimple_build_assign (max_index, REDUC_MAX_EXPR,
4704                                                     induction_index);
4705       gsi_insert_before (&exit_gsi, max_index_stmt, GSI_SAME_STMT);
4706
4707       /* Vector of {max_index, max_index, max_index,...}.  */
4708       tree max_index_vec = make_ssa_name (index_vec_type);
4709       tree max_index_vec_rhs = build_vector_from_val (index_vec_type,
4710                                                       max_index);
4711       gimple *max_index_vec_stmt = gimple_build_assign (max_index_vec,
4712                                                         max_index_vec_rhs);
4713       gsi_insert_before (&exit_gsi, max_index_vec_stmt, GSI_SAME_STMT);
4714
4715       /* Next we compare the new vector (MAX_INDEX_VEC) full of max indexes
4716          with the vector (INDUCTION_INDEX) of found indexes, choosing values
4717          from the data vector (NEW_PHI_RESULT) for matches, 0 (ZERO_VEC)
4718          otherwise.  Only one value should match, resulting in a vector
4719          (VEC_COND) with one data value and the rest zeros.
4720          In the case where the loop never made any matches, every index will
4721          match, resulting in a vector with all data values (which will all be
4722          the default value).  */
4723
4724       /* Compare the max index vector to the vector of found indexes to find
4725          the position of the max value.  */
4726       tree vec_compare = make_ssa_name (index_vec_cmp_type);
4727       gimple *vec_compare_stmt = gimple_build_assign (vec_compare, EQ_EXPR,
4728                                                       induction_index,
4729                                                       max_index_vec);
4730       gsi_insert_before (&exit_gsi, vec_compare_stmt, GSI_SAME_STMT);
4731
4732       /* Use the compare to choose either values from the data vector or
4733          zero.  */
4734       tree vec_cond = make_ssa_name (vectype);
4735       gimple *vec_cond_stmt = gimple_build_assign (vec_cond, VEC_COND_EXPR,
4736                                                    vec_compare, new_phi_result,
4737                                                    zero_vec);
4738       gsi_insert_before (&exit_gsi, vec_cond_stmt, GSI_SAME_STMT);
4739
4740       /* Finally we need to extract the data value from the vector (VEC_COND)
4741          into a scalar (MATCHED_DATA_REDUC).  Logically we want to do a OR
4742          reduction, but because this doesn't exist, we can use a MAX reduction
4743          instead.  The data value might be signed or a float so we need to cast
4744          it first.
4745          In the case where the loop never made any matches, the data values are
4746          all identical, and so will reduce down correctly.  */
4747
4748       /* Make the matched data values unsigned.  */
4749       tree vec_cond_cast = make_ssa_name (vectype_unsigned);
4750       tree vec_cond_cast_rhs = build1 (VIEW_CONVERT_EXPR, vectype_unsigned,
4751                                        vec_cond);
4752       gimple *vec_cond_cast_stmt = gimple_build_assign (vec_cond_cast,
4753                                                         VIEW_CONVERT_EXPR,
4754                                                         vec_cond_cast_rhs);
4755       gsi_insert_before (&exit_gsi, vec_cond_cast_stmt, GSI_SAME_STMT);
4756
4757       /* Reduce down to a scalar value.  */
4758       tree data_reduc = make_ssa_name (scalar_type_unsigned);
4759       optab ot = optab_for_tree_code (REDUC_MAX_EXPR, vectype_unsigned,
4760                                       optab_default);
4761       gcc_assert (optab_handler (ot, TYPE_MODE (vectype_unsigned))
4762                   != CODE_FOR_nothing);
4763       gimple *data_reduc_stmt = gimple_build_assign (data_reduc,
4764                                                      REDUC_MAX_EXPR,
4765                                                      vec_cond_cast);
4766       gsi_insert_before (&exit_gsi, data_reduc_stmt, GSI_SAME_STMT);
4767
4768       /* Convert the reduced value back to the result type and set as the
4769          result.  */
4770       tree data_reduc_cast = build1 (VIEW_CONVERT_EXPR, scalar_type,
4771                                      data_reduc);
4772       epilog_stmt = gimple_build_assign (new_scalar_dest, data_reduc_cast);
4773       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4774       gimple_assign_set_lhs (epilog_stmt, new_temp);
4775       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4776       scalar_results.safe_push (new_temp);
4777     }
4778
4779   /* 2.3 Create the reduction code, using one of the three schemes described
4780          above. In SLP we simply need to extract all the elements from the
4781          vector (without reducing them), so we use scalar shifts.  */
4782   else if (reduc_code != ERROR_MARK && !slp_reduc)
4783     {
4784       tree tmp;
4785       tree vec_elem_type;
4786
4787       /*** Case 1:  Create:
4788            v_out2 = reduc_expr <v_out1>  */
4789
4790       if (dump_enabled_p ())
4791         dump_printf_loc (MSG_NOTE, vect_location,
4792                          "Reduce using direct vector reduction.\n");
4793
4794       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4795       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4796         {
4797           tree tmp_dest =
4798               vect_create_destination_var (scalar_dest, vec_elem_type);
4799           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4800           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4801           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4802           gimple_assign_set_lhs (epilog_stmt, new_temp);
4803           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4804
4805           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4806         }
4807       else
4808         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4809
4810       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4811       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4812       gimple_assign_set_lhs (epilog_stmt, new_temp);
4813       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4814
4815       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
4816           == INTEGER_INDUC_COND_REDUCTION)
4817         {
4818           /* Earlier we set the initial value to be zero.  Check the result
4819              and if it is zero then replace with the original initial
4820              value.  */
4821           tree zero = build_zero_cst (scalar_type);
4822           tree zcompare = build2 (EQ_EXPR, boolean_type_node, new_temp, zero);
4823
4824           tmp = make_ssa_name (new_scalar_dest);
4825           epilog_stmt = gimple_build_assign (tmp, COND_EXPR, zcompare,
4826                                              initial_def, new_temp);
4827           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4828           new_temp = tmp;
4829         }
4830
4831       scalar_results.safe_push (new_temp);
4832     }
4833   else
4834     {
4835       bool reduce_with_shift = have_whole_vector_shift (mode);
4836       int element_bitsize = tree_to_uhwi (bitsize);
4837       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4838       tree vec_temp;
4839
4840       /* Regardless of whether we have a whole vector shift, if we're
4841          emulating the operation via tree-vect-generic, we don't want
4842          to use it.  Only the first round of the reduction is likely
4843          to still be profitable via emulation.  */
4844       /* ??? It might be better to emit a reduction tree code here, so that
4845          tree-vect-generic can expand the first round via bit tricks.  */
4846       if (!VECTOR_MODE_P (mode))
4847         reduce_with_shift = false;
4848       else
4849         {
4850           optab optab = optab_for_tree_code (code, vectype, optab_default);
4851           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4852             reduce_with_shift = false;
4853         }
4854
4855       if (reduce_with_shift && !slp_reduc)
4856         {
4857           int nelements = vec_size_in_bits / element_bitsize;
4858           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4859
4860           int elt_offset;
4861
4862           tree zero_vec = build_zero_cst (vectype);
4863           /*** Case 2: Create:
4864              for (offset = nelements/2; offset >= 1; offset/=2)
4865                 {
4866                   Create:  va' = vec_shift <va, offset>
4867                   Create:  va = vop <va, va'>
4868                 }  */
4869
4870           tree rhs;
4871
4872           if (dump_enabled_p ())
4873             dump_printf_loc (MSG_NOTE, vect_location,
4874                              "Reduce using vector shifts\n");
4875
4876           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4877           new_temp = new_phi_result;
4878           for (elt_offset = nelements / 2;
4879                elt_offset >= 1;
4880                elt_offset /= 2)
4881             {
4882               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4883               tree mask = vect_gen_perm_mask_any (vectype, sel);
4884               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4885                                                  new_temp, zero_vec, mask);
4886               new_name = make_ssa_name (vec_dest, epilog_stmt);
4887               gimple_assign_set_lhs (epilog_stmt, new_name);
4888               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4889
4890               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4891                                                  new_temp);
4892               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4893               gimple_assign_set_lhs (epilog_stmt, new_temp);
4894               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4895             }
4896
4897           /* 2.4  Extract the final scalar result.  Create:
4898              s_out3 = extract_field <v_out2, bitpos>  */
4899
4900           if (dump_enabled_p ())
4901             dump_printf_loc (MSG_NOTE, vect_location,
4902                              "extract scalar result\n");
4903
4904           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4905                         bitsize, bitsize_zero_node);
4906           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4907           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4908           gimple_assign_set_lhs (epilog_stmt, new_temp);
4909           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4910           scalar_results.safe_push (new_temp);
4911         }
4912       else
4913         {
4914           /*** Case 3: Create:
4915              s = extract_field <v_out2, 0>
4916              for (offset = element_size;
4917                   offset < vector_size;
4918                   offset += element_size;)
4919                {
4920                  Create:  s' = extract_field <v_out2, offset>
4921                  Create:  s = op <s, s'>  // For non SLP cases
4922                }  */
4923
4924           if (dump_enabled_p ())
4925             dump_printf_loc (MSG_NOTE, vect_location,
4926                              "Reduce using scalar code.\n");
4927
4928           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4929           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4930             {
4931               int bit_offset;
4932               if (gimple_code (new_phi) == GIMPLE_PHI)
4933                 vec_temp = PHI_RESULT (new_phi);
4934               else
4935                 vec_temp = gimple_assign_lhs (new_phi);
4936               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4937                             bitsize_zero_node);
4938               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4939               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4940               gimple_assign_set_lhs (epilog_stmt, new_temp);
4941               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4942
4943               /* In SLP we don't need to apply reduction operation, so we just
4944                  collect s' values in SCALAR_RESULTS.  */
4945               if (slp_reduc)
4946                 scalar_results.safe_push (new_temp);
4947
4948               for (bit_offset = element_bitsize;
4949                    bit_offset < vec_size_in_bits;
4950                    bit_offset += element_bitsize)
4951                 {
4952                   tree bitpos = bitsize_int (bit_offset);
4953                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4954                                      bitsize, bitpos);
4955
4956                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4957                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4958                   gimple_assign_set_lhs (epilog_stmt, new_name);
4959                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4960
4961                   if (slp_reduc)
4962                     {
4963                       /* In SLP we don't need to apply reduction operation, so
4964                          we just collect s' values in SCALAR_RESULTS.  */
4965                       new_temp = new_name;
4966                       scalar_results.safe_push (new_name);
4967                     }
4968                   else
4969                     {
4970                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4971                                                          new_name, new_temp);
4972                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4973                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4974                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4975                     }
4976                 }
4977             }
4978
4979           /* The only case where we need to reduce scalar results in SLP, is
4980              unrolling.  If the size of SCALAR_RESULTS is greater than
4981              GROUP_SIZE, we reduce them combining elements modulo
4982              GROUP_SIZE.  */
4983           if (slp_reduc)
4984             {
4985               tree res, first_res, new_res;
4986               gimple *new_stmt;
4987
4988               /* Reduce multiple scalar results in case of SLP unrolling.  */
4989               for (j = group_size; scalar_results.iterate (j, &res);
4990                    j++)
4991                 {
4992                   first_res = scalar_results[j % group_size];
4993                   new_stmt = gimple_build_assign (new_scalar_dest, code,
4994                                                   first_res, res);
4995                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4996                   gimple_assign_set_lhs (new_stmt, new_res);
4997                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4998                   scalar_results[j % group_size] = new_res;
4999                 }
5000             }
5001           else
5002             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
5003             scalar_results.safe_push (new_temp);
5004         }
5005     }
5006
5007 vect_finalize_reduction:
5008
5009   if (double_reduc)
5010     loop = loop->inner;
5011
5012   /* 2.5 Adjust the final result by the initial value of the reduction
5013          variable. (When such adjustment is not needed, then
5014          'adjustment_def' is zero).  For example, if code is PLUS we create:
5015          new_temp = loop_exit_def + adjustment_def  */
5016
5017   if (adjustment_def)
5018     {
5019       gcc_assert (!slp_reduc);
5020       if (nested_in_vect_loop)
5021         {
5022           new_phi = new_phis[0];
5023           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
5024           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
5025           new_dest = vect_create_destination_var (scalar_dest, vectype);
5026         }
5027       else
5028         {
5029           new_temp = scalar_results[0];
5030           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
5031           expr = build2 (code, scalar_type, new_temp, adjustment_def);
5032           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
5033         }
5034
5035       epilog_stmt = gimple_build_assign (new_dest, expr);
5036       new_temp = make_ssa_name (new_dest, epilog_stmt);
5037       gimple_assign_set_lhs (epilog_stmt, new_temp);
5038       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
5039       if (nested_in_vect_loop)
5040         {
5041           set_vinfo_for_stmt (epilog_stmt,
5042                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
5043           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
5044                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
5045
5046           if (!double_reduc)
5047             scalar_results.quick_push (new_temp);
5048           else
5049             scalar_results[0] = new_temp;
5050         }
5051       else
5052         scalar_results[0] = new_temp;
5053
5054       new_phis[0] = epilog_stmt;
5055     }
5056
5057   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
5058           phis with new adjusted scalar results, i.e., replace use <s_out0>
5059           with use <s_out4>.
5060
5061      Transform:
5062         loop_exit:
5063           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5064           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5065           v_out2 = reduce <v_out1>
5066           s_out3 = extract_field <v_out2, 0>
5067           s_out4 = adjust_result <s_out3>
5068           use <s_out0>
5069           use <s_out0>
5070
5071      into:
5072
5073         loop_exit:
5074           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
5075           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
5076           v_out2 = reduce <v_out1>
5077           s_out3 = extract_field <v_out2, 0>
5078           s_out4 = adjust_result <s_out3>
5079           use <s_out4>
5080           use <s_out4> */
5081
5082
5083   /* In SLP reduction chain we reduce vector results into one vector if
5084      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
5085      the last stmt in the reduction chain, since we are looking for the loop
5086      exit phi node.  */
5087   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
5088     {
5089       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
5090       /* Handle reduction patterns.  */
5091       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
5092         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
5093
5094       scalar_dest = gimple_assign_lhs (dest_stmt);
5095       group_size = 1;
5096     }
5097
5098   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
5099      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
5100      need to match SCALAR_RESULTS with corresponding statements.  The first
5101      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
5102      the first vector stmt, etc.
5103      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
5104   if (group_size > new_phis.length ())
5105     {
5106       ratio = group_size / new_phis.length ();
5107       gcc_assert (!(group_size % new_phis.length ()));
5108     }
5109   else
5110     ratio = 1;
5111
5112   for (k = 0; k < group_size; k++)
5113     {
5114       if (k % ratio == 0)
5115         {
5116           epilog_stmt = new_phis[k / ratio];
5117           reduction_phi = reduction_phis[k / ratio];
5118           if (double_reduc)
5119             inner_phi = inner_phis[k / ratio];
5120         }
5121
5122       if (slp_reduc)
5123         {
5124           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
5125
5126           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
5127           /* SLP statements can't participate in patterns.  */
5128           gcc_assert (!orig_stmt);
5129           scalar_dest = gimple_assign_lhs (current_stmt);
5130         }
5131
5132       phis.create (3);
5133       /* Find the loop-closed-use at the loop exit of the original scalar
5134          result.  (The reduction result is expected to have two immediate uses -
5135          one at the latch block, and one at the loop exit).  */
5136       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5137         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
5138             && !is_gimple_debug (USE_STMT (use_p)))
5139           phis.safe_push (USE_STMT (use_p));
5140
5141       /* While we expect to have found an exit_phi because of loop-closed-ssa
5142          form we can end up without one if the scalar cycle is dead.  */
5143
5144       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5145         {
5146           if (outer_loop)
5147             {
5148               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5149               gphi *vect_phi;
5150
5151               /* FORNOW. Currently not supporting the case that an inner-loop
5152                  reduction is not used in the outer-loop (but only outside the
5153                  outer-loop), unless it is double reduction.  */
5154               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5155                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
5156                           || double_reduc);
5157
5158               if (double_reduc)
5159                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
5160               else
5161                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
5162               if (!double_reduc
5163                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
5164                       != vect_double_reduction_def)
5165                 continue;
5166
5167               /* Handle double reduction:
5168
5169                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
5170                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
5171                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
5172                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
5173
5174                  At that point the regular reduction (stmt2 and stmt3) is
5175                  already vectorized, as well as the exit phi node, stmt4.
5176                  Here we vectorize the phi node of double reduction, stmt1, and
5177                  update all relevant statements.  */
5178
5179               /* Go through all the uses of s2 to find double reduction phi
5180                  node, i.e., stmt1 above.  */
5181               orig_name = PHI_RESULT (exit_phi);
5182               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5183                 {
5184                   stmt_vec_info use_stmt_vinfo;
5185                   stmt_vec_info new_phi_vinfo;
5186                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
5187                   basic_block bb = gimple_bb (use_stmt);
5188                   gimple *use;
5189
5190                   /* Check that USE_STMT is really double reduction phi
5191                      node.  */
5192                   if (gimple_code (use_stmt) != GIMPLE_PHI
5193                       || gimple_phi_num_args (use_stmt) != 2
5194                       || bb->loop_father != outer_loop)
5195                     continue;
5196                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
5197                   if (!use_stmt_vinfo
5198                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
5199                           != vect_double_reduction_def)
5200                     continue;
5201
5202                   /* Create vector phi node for double reduction:
5203                      vs1 = phi <vs0, vs2>
5204                      vs1 was created previously in this function by a call to
5205                        vect_get_vec_def_for_operand and is stored in
5206                        vec_initial_def;
5207                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
5208                      vs0 is created here.  */
5209
5210                   /* Create vector phi node.  */
5211                   vect_phi = create_phi_node (vec_initial_def, bb);
5212                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
5213                                     loop_vec_info_for_loop (outer_loop));
5214                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
5215
5216                   /* Create vs0 - initial def of the double reduction phi.  */
5217                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
5218                                              loop_preheader_edge (outer_loop));
5219                   init_def = get_initial_def_for_reduction (stmt,
5220                                                           preheader_arg, NULL);
5221                   vect_phi_init = vect_init_vector (use_stmt, init_def,
5222                                                     vectype, NULL);
5223
5224                   /* Update phi node arguments with vs0 and vs2.  */
5225                   add_phi_arg (vect_phi, vect_phi_init,
5226                                loop_preheader_edge (outer_loop),
5227                                UNKNOWN_LOCATION);
5228                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
5229                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
5230                   if (dump_enabled_p ())
5231                     {
5232                       dump_printf_loc (MSG_NOTE, vect_location,
5233                                        "created double reduction phi node: ");
5234                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
5235                     }
5236
5237                   vect_phi_res = PHI_RESULT (vect_phi);
5238
5239                   /* Replace the use, i.e., set the correct vs1 in the regular
5240                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
5241                      loop is redundant.  */
5242                   use = reduction_phi;
5243                   for (j = 0; j < ncopies; j++)
5244                     {
5245                       edge pr_edge = loop_preheader_edge (loop);
5246                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
5247                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
5248                     }
5249                 }
5250             }
5251         }
5252
5253       phis.release ();
5254       if (nested_in_vect_loop)
5255         {
5256           if (double_reduc)
5257             loop = outer_loop;
5258           else
5259             continue;
5260         }
5261
5262       phis.create (3);
5263       /* Find the loop-closed-use at the loop exit of the original scalar
5264          result.  (The reduction result is expected to have two immediate uses,
5265          one at the latch block, and one at the loop exit).  For double
5266          reductions we are looking for exit phis of the outer loop.  */
5267       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
5268         {
5269           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
5270             {
5271               if (!is_gimple_debug (USE_STMT (use_p)))
5272                 phis.safe_push (USE_STMT (use_p));
5273             }
5274           else
5275             {
5276               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
5277                 {
5278                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
5279
5280                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
5281                     {
5282                       if (!flow_bb_inside_loop_p (loop,
5283                                              gimple_bb (USE_STMT (phi_use_p)))
5284                           && !is_gimple_debug (USE_STMT (phi_use_p)))
5285                         phis.safe_push (USE_STMT (phi_use_p));
5286                     }
5287                 }
5288             }
5289         }
5290
5291       FOR_EACH_VEC_ELT (phis, i, exit_phi)
5292         {
5293           /* Replace the uses:  */
5294           orig_name = PHI_RESULT (exit_phi);
5295           scalar_result = scalar_results[k];
5296           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
5297             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
5298               SET_USE (use_p, scalar_result);
5299         }
5300
5301       phis.release ();
5302     }
5303 }
5304
5305
5306 /* Function is_nonwrapping_integer_induction.
5307
5308    Check if STMT (which is part of loop LOOP) both increments and
5309    does not cause overflow.  */
5310
5311 static bool
5312 is_nonwrapping_integer_induction (gimple *stmt, struct loop *loop)
5313 {
5314   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
5315   tree base = STMT_VINFO_LOOP_PHI_EVOLUTION_BASE_UNCHANGED (stmt_vinfo);
5316   tree step = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo);
5317   tree lhs_type = TREE_TYPE (gimple_phi_result (stmt));
5318   widest_int ni, max_loop_value, lhs_max;
5319   bool overflow = false;
5320
5321   /* Make sure the loop is integer based.  */
5322   if (TREE_CODE (base) != INTEGER_CST
5323       || TREE_CODE (step) != INTEGER_CST)
5324     return false;
5325
5326   /* Check that the induction increments.  */
5327   if (tree_int_cst_sgn (step) == -1)
5328     return false;
5329
5330   /* Check that the max size of the loop will not wrap.  */
5331
5332   if (TYPE_OVERFLOW_UNDEFINED (lhs_type))
5333     return true;
5334
5335   if (! max_stmt_executions (loop, &ni))
5336     return false;
5337
5338   max_loop_value = wi::mul (wi::to_widest (step), ni, TYPE_SIGN (lhs_type),
5339                             &overflow);
5340   if (overflow)
5341     return false;
5342
5343   max_loop_value = wi::add (wi::to_widest (base), max_loop_value,
5344                             TYPE_SIGN (lhs_type), &overflow);
5345   if (overflow)
5346     return false;
5347
5348   return (wi::min_precision (max_loop_value, TYPE_SIGN (lhs_type))
5349           <= TYPE_PRECISION (lhs_type));
5350 }
5351
5352 /* Function vectorizable_reduction.
5353
5354    Check if STMT performs a reduction operation that can be vectorized.
5355    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
5356    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
5357    Return FALSE if not a vectorizable STMT, TRUE otherwise.
5358
5359    This function also handles reduction idioms (patterns) that have been
5360    recognized in advance during vect_pattern_recog.  In this case, STMT may be
5361    of this form:
5362      X = pattern_expr (arg0, arg1, ..., X)
5363    and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
5364    sequence that had been detected and replaced by the pattern-stmt (STMT).
5365
5366    This function also handles reduction of condition expressions, for example:
5367      for (int i = 0; i < N; i++)
5368        if (a[i] < value)
5369          last = a[i];
5370    This is handled by vectorising the loop and creating an additional vector
5371    containing the loop indexes for which "a[i] < value" was true.  In the
5372    function epilogue this is reduced to a single max value and then used to
5373    index into the vector of results.
5374
5375    In some cases of reduction patterns, the type of the reduction variable X is
5376    different than the type of the other arguments of STMT.
5377    In such cases, the vectype that is used when transforming STMT into a vector
5378    stmt is different than the vectype that is used to determine the
5379    vectorization factor, because it consists of a different number of elements
5380    than the actual number of elements that are being operated upon in parallel.
5381
5382    For example, consider an accumulation of shorts into an int accumulator.
5383    On some targets it's possible to vectorize this pattern operating on 8
5384    shorts at a time (hence, the vectype for purposes of determining the
5385    vectorization factor should be V8HI); on the other hand, the vectype that
5386    is used to create the vector form is actually V4SI (the type of the result).
5387
5388    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
5389    indicates what is the actual level of parallelism (V8HI in the example), so
5390    that the right vectorization factor would be derived.  This vectype
5391    corresponds to the type of arguments to the reduction stmt, and should *NOT*
5392    be used to create the vectorized stmt.  The right vectype for the vectorized
5393    stmt is obtained from the type of the result X:
5394         get_vectype_for_scalar_type (TREE_TYPE (X))
5395
5396    This means that, contrary to "regular" reductions (or "regular" stmts in
5397    general), the following equation:
5398       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
5399    does *NOT* necessarily hold for reduction patterns.  */
5400
5401 bool
5402 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
5403                         gimple **vec_stmt, slp_tree slp_node)
5404 {
5405   tree vec_dest;
5406   tree scalar_dest;
5407   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
5408   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5409   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
5410   tree vectype_in = NULL_TREE;
5411   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5412   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5413   enum tree_code code, orig_code, epilog_reduc_code;
5414   machine_mode vec_mode;
5415   int op_type;
5416   optab optab, reduc_optab;
5417   tree new_temp = NULL_TREE;
5418   gimple *def_stmt;
5419   enum vect_def_type dt;
5420   gphi *new_phi = NULL;
5421   tree scalar_type;
5422   bool is_simple_use;
5423   gimple *orig_stmt;
5424   stmt_vec_info orig_stmt_info;
5425   tree expr = NULL_TREE;
5426   int i;
5427   int ncopies;
5428   int epilog_copies;
5429   stmt_vec_info prev_stmt_info, prev_phi_info;
5430   bool single_defuse_cycle = false;
5431   tree reduc_def = NULL_TREE;
5432   gimple *new_stmt = NULL;
5433   int j;
5434   tree ops[3];
5435   bool nested_cycle = false, found_nested_cycle_def = false;
5436   gimple *reduc_def_stmt = NULL;
5437   bool double_reduc = false, dummy;
5438   basic_block def_bb;
5439   struct loop * def_stmt_loop, *outer_loop = NULL;
5440   tree def_arg;
5441   gimple *def_arg_stmt;
5442   auto_vec<tree> vec_oprnds0;
5443   auto_vec<tree> vec_oprnds1;
5444   auto_vec<tree> vect_defs;
5445   auto_vec<gimple *> phis;
5446   int vec_num;
5447   tree def0, def1, tem, op0, op1 = NULL_TREE;
5448   bool first_p = true;
5449   tree cr_index_scalar_type = NULL_TREE, cr_index_vector_type = NULL_TREE;
5450   gimple *cond_expr_induction_def_stmt = NULL;
5451
5452   /* In case of reduction chain we switch to the first stmt in the chain, but
5453      we don't update STMT_INFO, since only the last stmt is marked as reduction
5454      and has reduction properties.  */
5455   if (GROUP_FIRST_ELEMENT (stmt_info)
5456       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
5457     {
5458       stmt = GROUP_FIRST_ELEMENT (stmt_info);
5459       first_p = false;
5460     }
5461
5462   if (nested_in_vect_loop_p (loop, stmt))
5463     {
5464       outer_loop = loop;
5465       loop = loop->inner;
5466       nested_cycle = true;
5467     }
5468
5469   /* 1. Is vectorizable reduction?  */
5470   /* Not supportable if the reduction variable is used in the loop, unless
5471      it's a reduction chain.  */
5472   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
5473       && !GROUP_FIRST_ELEMENT (stmt_info))
5474     return false;
5475
5476   /* Reductions that are not used even in an enclosing outer-loop,
5477      are expected to be "live" (used out of the loop).  */
5478   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
5479       && !STMT_VINFO_LIVE_P (stmt_info))
5480     return false;
5481
5482   /* Make sure it was already recognized as a reduction computation.  */
5483   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
5484       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
5485     return false;
5486
5487   /* 2. Has this been recognized as a reduction pattern?
5488
5489      Check if STMT represents a pattern that has been recognized
5490      in earlier analysis stages.  For stmts that represent a pattern,
5491      the STMT_VINFO_RELATED_STMT field records the last stmt in
5492      the original sequence that constitutes the pattern.  */
5493
5494   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
5495   if (orig_stmt)
5496     {
5497       orig_stmt_info = vinfo_for_stmt (orig_stmt);
5498       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
5499       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
5500     }
5501
5502   /* 3. Check the operands of the operation.  The first operands are defined
5503         inside the loop body. The last operand is the reduction variable,
5504         which is defined by the loop-header-phi.  */
5505
5506   gcc_assert (is_gimple_assign (stmt));
5507
5508   /* Flatten RHS.  */
5509   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
5510     {
5511     case GIMPLE_SINGLE_RHS:
5512       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
5513       if (op_type == ternary_op)
5514         {
5515           tree rhs = gimple_assign_rhs1 (stmt);
5516           ops[0] = TREE_OPERAND (rhs, 0);
5517           ops[1] = TREE_OPERAND (rhs, 1);
5518           ops[2] = TREE_OPERAND (rhs, 2);
5519           code = TREE_CODE (rhs);
5520         }
5521       else
5522         return false;
5523       break;
5524
5525     case GIMPLE_BINARY_RHS:
5526       code = gimple_assign_rhs_code (stmt);
5527       op_type = TREE_CODE_LENGTH (code);
5528       gcc_assert (op_type == binary_op);
5529       ops[0] = gimple_assign_rhs1 (stmt);
5530       ops[1] = gimple_assign_rhs2 (stmt);
5531       break;
5532
5533     case GIMPLE_TERNARY_RHS:
5534       code = gimple_assign_rhs_code (stmt);
5535       op_type = TREE_CODE_LENGTH (code);
5536       gcc_assert (op_type == ternary_op);
5537       ops[0] = gimple_assign_rhs1 (stmt);
5538       ops[1] = gimple_assign_rhs2 (stmt);
5539       ops[2] = gimple_assign_rhs3 (stmt);
5540       break;
5541
5542     case GIMPLE_UNARY_RHS:
5543       return false;
5544
5545     default:
5546       gcc_unreachable ();
5547     }
5548   /* The default is that the reduction variable is the last in statement.  */
5549   int reduc_index = op_type - 1;
5550   if (code == MINUS_EXPR)
5551     reduc_index = 0;
5552
5553   if (code == COND_EXPR && slp_node)
5554     return false;
5555
5556   scalar_dest = gimple_assign_lhs (stmt);
5557   scalar_type = TREE_TYPE (scalar_dest);
5558   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5559       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5560     return false;
5561
5562   /* Do not try to vectorize bit-precision reductions.  */
5563   if ((TYPE_PRECISION (scalar_type)
5564        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5565     return false;
5566
5567   /* All uses but the last are expected to be defined in the loop.
5568      The last use is the reduction variable.  In case of nested cycle this
5569      assumption is not true: we use reduc_index to record the index of the
5570      reduction variable.  */
5571   for (i = 0; i < op_type; i++)
5572     {
5573       if (i == reduc_index)
5574         continue;
5575
5576       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5577       if (i == 0 && code == COND_EXPR)
5578         continue;
5579
5580       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
5581                                           &def_stmt, &dt, &tem);
5582       if (!vectype_in)
5583         vectype_in = tem;
5584       gcc_assert (is_simple_use);
5585
5586       if (dt != vect_internal_def
5587           && dt != vect_external_def
5588           && dt != vect_constant_def
5589           && dt != vect_induction_def
5590           && !(dt == vect_nested_cycle && nested_cycle))
5591         return false;
5592
5593       if (dt == vect_nested_cycle)
5594         {
5595           found_nested_cycle_def = true;
5596           reduc_def_stmt = def_stmt;
5597           reduc_index = i;
5598         }
5599
5600       if (i == 1 && code == COND_EXPR && dt == vect_induction_def)
5601         cond_expr_induction_def_stmt = def_stmt;
5602     }
5603
5604   is_simple_use = vect_is_simple_use (ops[reduc_index], loop_vinfo,
5605                                       &def_stmt, &dt, &tem);
5606   if (!vectype_in)
5607     vectype_in = tem;
5608   gcc_assert (is_simple_use);
5609   if (!found_nested_cycle_def)
5610     reduc_def_stmt = def_stmt;
5611
5612   if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5613     return false;
5614
5615   if (!(dt == vect_reduction_def
5616         || dt == vect_nested_cycle
5617         || ((dt == vect_internal_def || dt == vect_external_def
5618              || dt == vect_constant_def || dt == vect_induction_def)
5619             && nested_cycle && found_nested_cycle_def)))
5620     {
5621       /* For pattern recognized stmts, orig_stmt might be a reduction,
5622          but some helper statements for the pattern might not, or
5623          might be COND_EXPRs with reduction uses in the condition.  */
5624       gcc_assert (orig_stmt);
5625       return false;
5626     }
5627
5628   enum vect_reduction_type v_reduc_type;
5629   gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5630                                           !nested_cycle, &dummy, false,
5631                                           &v_reduc_type);
5632
5633   /* If we have a condition reduction, see if we can simplify it further.  */
5634   if (v_reduc_type == COND_REDUCTION
5635       && cond_expr_induction_def_stmt != NULL
5636       && is_nonwrapping_integer_induction (cond_expr_induction_def_stmt, loop))
5637     {
5638       if (dump_enabled_p ())
5639         dump_printf_loc (MSG_NOTE, vect_location,
5640                          "condition expression based on integer induction.\n");
5641       STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = INTEGER_INDUC_COND_REDUCTION;
5642     }
5643   else
5644    STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) = v_reduc_type;
5645
5646   if (orig_stmt)
5647     gcc_assert (tmp == orig_stmt
5648                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5649   else
5650     /* We changed STMT to be the first stmt in reduction chain, hence we
5651        check that in this case the first element in the chain is STMT.  */
5652     gcc_assert (stmt == tmp
5653                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5654
5655   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5656     return false;
5657
5658   if (slp_node)
5659     ncopies = 1;
5660   else
5661     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5662                / TYPE_VECTOR_SUBPARTS (vectype_in));
5663
5664   gcc_assert (ncopies >= 1);
5665
5666   vec_mode = TYPE_MODE (vectype_in);
5667
5668   if (code == COND_EXPR)
5669     {
5670       /* Only call during the analysis stage, otherwise we'll lose
5671          STMT_VINFO_TYPE.  */
5672       if (!vec_stmt && !vectorizable_condition (stmt, gsi, NULL,
5673                                                 ops[reduc_index], 0, NULL))
5674         {
5675           if (dump_enabled_p ())
5676             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5677                              "unsupported condition in reduction\n");
5678           return false;
5679         }
5680     }
5681   else
5682     {
5683       /* 4. Supportable by target?  */
5684
5685       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5686           || code == LROTATE_EXPR || code == RROTATE_EXPR)
5687         {
5688           /* Shifts and rotates are only supported by vectorizable_shifts,
5689              not vectorizable_reduction.  */
5690           if (dump_enabled_p ())
5691             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5692                              "unsupported shift or rotation.\n");
5693           return false;
5694         }
5695
5696       /* 4.1. check support for the operation in the loop  */
5697       optab = optab_for_tree_code (code, vectype_in, optab_default);
5698       if (!optab)
5699         {
5700           if (dump_enabled_p ())
5701             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5702                              "no optab.\n");
5703
5704           return false;
5705         }
5706
5707       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5708         {
5709           if (dump_enabled_p ())
5710             dump_printf (MSG_NOTE, "op not supported by target.\n");
5711
5712           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5713               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5714                   < vect_min_worthwhile_factor (code))
5715             return false;
5716
5717           if (dump_enabled_p ())
5718             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5719         }
5720
5721       /* Worthwhile without SIMD support?  */
5722       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5723           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5724              < vect_min_worthwhile_factor (code))
5725         {
5726           if (dump_enabled_p ())
5727             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5728                              "not worthwhile without SIMD support.\n");
5729
5730           return false;
5731         }
5732     }
5733
5734   /* 4.2. Check support for the epilog operation.
5735
5736           If STMT represents a reduction pattern, then the type of the
5737           reduction variable may be different than the type of the rest
5738           of the arguments.  For example, consider the case of accumulation
5739           of shorts into an int accumulator; The original code:
5740                         S1: int_a = (int) short_a;
5741           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5742
5743           was replaced with:
5744                         STMT: int_acc = widen_sum <short_a, int_acc>
5745
5746           This means that:
5747           1. The tree-code that is used to create the vector operation in the
5748              epilog code (that reduces the partial results) is not the
5749              tree-code of STMT, but is rather the tree-code of the original
5750              stmt from the pattern that STMT is replacing.  I.e, in the example
5751              above we want to use 'widen_sum' in the loop, but 'plus' in the
5752              epilog.
5753           2. The type (mode) we use to check available target support
5754              for the vector operation to be created in the *epilog*, is
5755              determined by the type of the reduction variable (in the example
5756              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5757              However the type (mode) we use to check available target support
5758              for the vector operation to be created *inside the loop*, is
5759              determined by the type of the other arguments to STMT (in the
5760              example we'd check this: optab_handler (widen_sum_optab,
5761              vect_short_mode)).
5762
5763           This is contrary to "regular" reductions, in which the types of all
5764           the arguments are the same as the type of the reduction variable.
5765           For "regular" reductions we can therefore use the same vector type
5766           (and also the same tree-code) when generating the epilog code and
5767           when generating the code inside the loop.  */
5768
5769   if (orig_stmt)
5770     {
5771       /* This is a reduction pattern: get the vectype from the type of the
5772          reduction variable, and get the tree-code from orig_stmt.  */
5773       gcc_assert (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5774                   == TREE_CODE_REDUCTION);
5775       orig_code = gimple_assign_rhs_code (orig_stmt);
5776       gcc_assert (vectype_out);
5777       vec_mode = TYPE_MODE (vectype_out);
5778     }
5779   else
5780     {
5781       /* Regular reduction: use the same vectype and tree-code as used for
5782          the vector code inside the loop can be used for the epilog code. */
5783       orig_code = code;
5784
5785       if (code == MINUS_EXPR)
5786         orig_code = PLUS_EXPR;
5787
5788       /* For simple condition reductions, replace with the actual expression
5789          we want to base our reduction around.  */
5790       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5791           == INTEGER_INDUC_COND_REDUCTION)
5792         orig_code = MAX_EXPR;
5793     }
5794
5795   if (nested_cycle)
5796     {
5797       def_bb = gimple_bb (reduc_def_stmt);
5798       def_stmt_loop = def_bb->loop_father;
5799       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5800                                        loop_preheader_edge (def_stmt_loop));
5801       if (TREE_CODE (def_arg) == SSA_NAME
5802           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5803           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5804           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5805           && vinfo_for_stmt (def_arg_stmt)
5806           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5807               == vect_double_reduction_def)
5808         double_reduc = true;
5809     }
5810
5811   epilog_reduc_code = ERROR_MARK;
5812
5813   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == TREE_CODE_REDUCTION
5814       || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5815                 == INTEGER_INDUC_COND_REDUCTION)
5816     {
5817       if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5818         {
5819           reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5820                                          optab_default);
5821           if (!reduc_optab)
5822             {
5823               if (dump_enabled_p ())
5824                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5825                                  "no optab for reduction.\n");
5826
5827               epilog_reduc_code = ERROR_MARK;
5828             }
5829           else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5830             {
5831               if (dump_enabled_p ())
5832                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5833                                  "reduc op not supported by target.\n");
5834
5835               epilog_reduc_code = ERROR_MARK;
5836             }
5837
5838           /* When epilog_reduc_code is ERROR_MARK then a reduction will be
5839              generated in the epilog using multiple expressions.  This does not
5840              work for condition reductions.  */
5841           if (epilog_reduc_code == ERROR_MARK
5842               && STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5843                         == INTEGER_INDUC_COND_REDUCTION)
5844             {
5845               if (dump_enabled_p ())
5846                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5847                                  "no reduc code for scalar code.\n");
5848               return false;
5849             }
5850         }
5851       else
5852         {
5853           if (!nested_cycle || double_reduc)
5854             {
5855               if (dump_enabled_p ())
5856                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5857                                  "no reduc code for scalar code.\n");
5858
5859               return false;
5860             }
5861         }
5862     }
5863   else
5864     {
5865       int scalar_precision = GET_MODE_PRECISION (TYPE_MODE (scalar_type));
5866       cr_index_scalar_type = make_unsigned_type (scalar_precision);
5867       cr_index_vector_type = build_vector_type
5868         (cr_index_scalar_type, TYPE_VECTOR_SUBPARTS (vectype_out));
5869
5870       epilog_reduc_code = REDUC_MAX_EXPR;
5871       optab = optab_for_tree_code (REDUC_MAX_EXPR, cr_index_vector_type,
5872                                    optab_default);
5873       if (optab_handler (optab, TYPE_MODE (cr_index_vector_type))
5874           == CODE_FOR_nothing)
5875         {
5876           if (dump_enabled_p ())
5877             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5878                              "reduc max op not supported by target.\n");
5879           return false;
5880         }
5881     }
5882
5883   if ((double_reduc
5884        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION
5885        || STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info)
5886                 == INTEGER_INDUC_COND_REDUCTION)
5887       && ncopies > 1)
5888     {
5889       if (dump_enabled_p ())
5890         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5891                          "multiple types in double reduction or condition "
5892                          "reduction.\n");
5893       return false;
5894     }
5895
5896   /* In case of widenning multiplication by a constant, we update the type
5897      of the constant to be the type of the other operand.  We check that the
5898      constant fits the type in the pattern recognition pass.  */
5899   if (code == DOT_PROD_EXPR
5900       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5901     {
5902       if (TREE_CODE (ops[0]) == INTEGER_CST)
5903         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5904       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5905         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5906       else
5907         {
5908           if (dump_enabled_p ())
5909             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5910                              "invalid types in dot-prod\n");
5911
5912           return false;
5913         }
5914     }
5915
5916   if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
5917     {
5918       widest_int ni;
5919
5920       if (! max_loop_iterations (loop, &ni))
5921         {
5922           if (dump_enabled_p ())
5923             dump_printf_loc (MSG_NOTE, vect_location,
5924                              "loop count not known, cannot create cond "
5925                              "reduction.\n");
5926           return false;
5927         }
5928       /* Convert backedges to iterations.  */
5929       ni += 1;
5930
5931       /* The additional index will be the same type as the condition.  Check
5932          that the loop can fit into this less one (because we'll use up the
5933          zero slot for when there are no matches).  */
5934       tree max_index = TYPE_MAX_VALUE (cr_index_scalar_type);
5935       if (wi::geu_p (ni, wi::to_widest (max_index)))
5936         {
5937           if (dump_enabled_p ())
5938             dump_printf_loc (MSG_NOTE, vect_location,
5939                              "loop size is greater than data size.\n");
5940           return false;
5941         }
5942     }
5943
5944   if (!vec_stmt) /* transformation not required.  */
5945     {
5946       if (first_p
5947           && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5948                                          reduc_index))
5949         return false;
5950       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5951       return true;
5952     }
5953
5954   /** Transform.  **/
5955
5956   if (dump_enabled_p ())
5957     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5958
5959   /* FORNOW: Multiple types are not supported for condition.  */
5960   if (code == COND_EXPR)
5961     gcc_assert (ncopies == 1);
5962
5963   /* Create the destination vector  */
5964   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5965
5966   /* In case the vectorization factor (VF) is bigger than the number
5967      of elements that we can fit in a vectype (nunits), we have to generate
5968      more than one vector stmt - i.e - we need to "unroll" the
5969      vector stmt by a factor VF/nunits.  For more details see documentation
5970      in vectorizable_operation.  */
5971
5972   /* If the reduction is used in an outer loop we need to generate
5973      VF intermediate results, like so (e.g. for ncopies=2):
5974         r0 = phi (init, r0)
5975         r1 = phi (init, r1)
5976         r0 = x0 + r0;
5977         r1 = x1 + r1;
5978     (i.e. we generate VF results in 2 registers).
5979     In this case we have a separate def-use cycle for each copy, and therefore
5980     for each copy we get the vector def for the reduction variable from the
5981     respective phi node created for this copy.
5982
5983     Otherwise (the reduction is unused in the loop nest), we can combine
5984     together intermediate results, like so (e.g. for ncopies=2):
5985         r = phi (init, r)
5986         r = x0 + r;
5987         r = x1 + r;
5988    (i.e. we generate VF/2 results in a single register).
5989    In this case for each copy we get the vector def for the reduction variable
5990    from the vectorized reduction operation generated in the previous iteration.
5991   */
5992
5993   if (STMT_VINFO_RELEVANT (stmt_info) <= vect_used_only_live)
5994     {
5995       single_defuse_cycle = true;
5996       epilog_copies = 1;
5997     }
5998   else
5999     epilog_copies = ncopies;
6000
6001   prev_stmt_info = NULL;
6002   prev_phi_info = NULL;
6003   if (slp_node)
6004     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6005   else
6006     {
6007       vec_num = 1;
6008       vec_oprnds0.create (1);
6009       if (op_type == ternary_op)
6010         vec_oprnds1.create (1);
6011     }
6012
6013   phis.create (vec_num);
6014   vect_defs.create (vec_num);
6015   if (!slp_node)
6016     vect_defs.quick_push (NULL_TREE);
6017
6018   for (j = 0; j < ncopies; j++)
6019     {
6020       if (j == 0 || !single_defuse_cycle)
6021         {
6022           for (i = 0; i < vec_num; i++)
6023             {
6024               /* Create the reduction-phi that defines the reduction
6025                  operand.  */
6026               new_phi = create_phi_node (vec_dest, loop->header);
6027               set_vinfo_for_stmt (new_phi,
6028                                   new_stmt_vec_info (new_phi, loop_vinfo));
6029                if (j == 0 || slp_node)
6030                  phis.quick_push (new_phi);
6031             }
6032         }
6033
6034       if (code == COND_EXPR)
6035         {
6036           gcc_assert (!slp_node);
6037           vectorizable_condition (stmt, gsi, vec_stmt,
6038                                   PHI_RESULT (phis[0]),
6039                                   reduc_index, NULL);
6040           /* Multiple types are not supported for condition.  */
6041           break;
6042         }
6043
6044       /* Handle uses.  */
6045       if (j == 0)
6046         {
6047           op0 = ops[!reduc_index];
6048           if (op_type == ternary_op)
6049             {
6050               if (reduc_index == 0)
6051                 op1 = ops[2];
6052               else
6053                 op1 = ops[1];
6054             }
6055
6056           if (slp_node)
6057             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
6058                                slp_node, -1);
6059           else
6060             {
6061               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
6062                                                             stmt);
6063               vec_oprnds0.quick_push (loop_vec_def0);
6064               if (op_type == ternary_op)
6065                {
6066                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
6067                  vec_oprnds1.quick_push (loop_vec_def1);
6068                }
6069             }
6070         }
6071       else
6072         {
6073           if (!slp_node)
6074             {
6075               enum vect_def_type dt;
6076               gimple *dummy_stmt;
6077
6078               vect_is_simple_use (ops[!reduc_index], loop_vinfo,
6079                                   &dummy_stmt, &dt);
6080               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
6081                                                               loop_vec_def0);
6082               vec_oprnds0[0] = loop_vec_def0;
6083               if (op_type == ternary_op)
6084                 {
6085                   vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
6086                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
6087                                                                 loop_vec_def1);
6088                   vec_oprnds1[0] = loop_vec_def1;
6089                 }
6090             }
6091
6092           if (single_defuse_cycle)
6093             reduc_def = gimple_assign_lhs (new_stmt);
6094
6095           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
6096         }
6097
6098       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
6099         {
6100           if (slp_node)
6101             reduc_def = PHI_RESULT (phis[i]);
6102           else
6103             {
6104               if (!single_defuse_cycle || j == 0)
6105                 reduc_def = PHI_RESULT (new_phi);
6106             }
6107
6108           def1 = ((op_type == ternary_op)
6109                   ? vec_oprnds1[i] : NULL);
6110           if (op_type == binary_op)
6111             {
6112               if (reduc_index == 0)
6113                 expr = build2 (code, vectype_out, reduc_def, def0);
6114               else
6115                 expr = build2 (code, vectype_out, def0, reduc_def);
6116             }
6117           else
6118             {
6119               if (reduc_index == 0)
6120                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
6121               else
6122                 {
6123                   if (reduc_index == 1)
6124                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
6125                   else
6126                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
6127                 }
6128             }
6129
6130           new_stmt = gimple_build_assign (vec_dest, expr);
6131           new_temp = make_ssa_name (vec_dest, new_stmt);
6132           gimple_assign_set_lhs (new_stmt, new_temp);
6133           vect_finish_stmt_generation (stmt, new_stmt, gsi);
6134
6135           if (slp_node)
6136             {
6137               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
6138               vect_defs.quick_push (new_temp);
6139             }
6140           else
6141             vect_defs[0] = new_temp;
6142         }
6143
6144       if (slp_node)
6145         continue;
6146
6147       if (j == 0)
6148         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
6149       else
6150         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
6151
6152       prev_stmt_info = vinfo_for_stmt (new_stmt);
6153       prev_phi_info = vinfo_for_stmt (new_phi);
6154     }
6155
6156   tree indx_before_incr, indx_after_incr, cond_name = NULL;
6157
6158   /* Finalize the reduction-phi (set its arguments) and create the
6159      epilog reduction code.  */
6160   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
6161     {
6162       new_temp = gimple_assign_lhs (*vec_stmt);
6163       vect_defs[0] = new_temp;
6164
6165       /* For cond reductions we want to create a new vector (INDEX_COND_EXPR)
6166          which is updated with the current index of the loop for every match of
6167          the original loop's cond_expr (VEC_STMT).  This results in a vector
6168          containing the last time the condition passed for that vector lane.
6169          The first match will be a 1 to allow 0 to be used for non-matching
6170          indexes.  If there are no matches at all then the vector will be all
6171          zeroes.  */
6172       if (STMT_VINFO_VEC_REDUCTION_TYPE (stmt_info) == COND_REDUCTION)
6173         {
6174           int nunits_out = TYPE_VECTOR_SUBPARTS (vectype_out);
6175           int k;
6176
6177           gcc_assert (gimple_assign_rhs_code (*vec_stmt) == VEC_COND_EXPR);
6178
6179           /* First we create a simple vector induction variable which starts
6180              with the values {1,2,3,...} (SERIES_VECT) and increments by the
6181              vector size (STEP).  */
6182
6183           /* Create a {1,2,3,...} vector.  */
6184           tree *vtemp = XALLOCAVEC (tree, nunits_out);
6185           for (k = 0; k < nunits_out; ++k)
6186             vtemp[k] = build_int_cst (cr_index_scalar_type, k + 1);
6187           tree series_vect = build_vector (cr_index_vector_type, vtemp);
6188
6189           /* Create a vector of the step value.  */
6190           tree step = build_int_cst (cr_index_scalar_type, nunits_out);
6191           tree vec_step = build_vector_from_val (cr_index_vector_type, step);
6192
6193           /* Create an induction variable.  */
6194           gimple_stmt_iterator incr_gsi;
6195           bool insert_after;
6196           standard_iv_increment_position (loop, &incr_gsi, &insert_after);
6197           create_iv (series_vect, vec_step, NULL_TREE, loop, &incr_gsi,
6198                      insert_after, &indx_before_incr, &indx_after_incr);
6199
6200           /* Next create a new phi node vector (NEW_PHI_TREE) which starts
6201              filled with zeros (VEC_ZERO).  */
6202
6203           /* Create a vector of 0s.  */
6204           tree zero = build_zero_cst (cr_index_scalar_type);
6205           tree vec_zero = build_vector_from_val (cr_index_vector_type, zero);
6206
6207           /* Create a vector phi node.  */
6208           tree new_phi_tree = make_ssa_name (cr_index_vector_type);
6209           new_phi = create_phi_node (new_phi_tree, loop->header);
6210           set_vinfo_for_stmt (new_phi,
6211                               new_stmt_vec_info (new_phi, loop_vinfo));
6212           add_phi_arg (new_phi, vec_zero, loop_preheader_edge (loop),
6213                        UNKNOWN_LOCATION);
6214
6215           /* Now take the condition from the loops original cond_expr
6216              (VEC_STMT) and produce a new cond_expr (INDEX_COND_EXPR) which for
6217              every match uses values from the induction variable
6218              (INDEX_BEFORE_INCR) otherwise uses values from the phi node
6219              (NEW_PHI_TREE).
6220              Finally, we update the phi (NEW_PHI_TREE) to take the value of
6221              the new cond_expr (INDEX_COND_EXPR).  */
6222
6223           /* Duplicate the condition from vec_stmt.  */
6224           tree ccompare = unshare_expr (gimple_assign_rhs1 (*vec_stmt));
6225
6226           /* Create a conditional, where the condition is taken from vec_stmt
6227              (CCOMPARE), then is the induction index (INDEX_BEFORE_INCR) and
6228              else is the phi (NEW_PHI_TREE).  */
6229           tree index_cond_expr = build3 (VEC_COND_EXPR, cr_index_vector_type,
6230                                          ccompare, indx_before_incr,
6231                                          new_phi_tree);
6232           cond_name = make_ssa_name (cr_index_vector_type);
6233           gimple *index_condition = gimple_build_assign (cond_name,
6234                                                          index_cond_expr);
6235           gsi_insert_before (&incr_gsi, index_condition, GSI_SAME_STMT);
6236           stmt_vec_info index_vec_info = new_stmt_vec_info (index_condition,
6237                                                             loop_vinfo);
6238           STMT_VINFO_VECTYPE (index_vec_info) = cr_index_vector_type;
6239           set_vinfo_for_stmt (index_condition, index_vec_info);
6240
6241           /* Update the phi with the vec cond.  */
6242           add_phi_arg (new_phi, cond_name, loop_latch_edge (loop),
6243                        UNKNOWN_LOCATION);
6244         }
6245     }
6246
6247   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
6248                                     epilog_reduc_code, phis, reduc_index,
6249                                     double_reduc, slp_node, cond_name);
6250
6251   return true;
6252 }
6253
6254 /* Function vect_min_worthwhile_factor.
6255
6256    For a loop where we could vectorize the operation indicated by CODE,
6257    return the minimum vectorization factor that makes it worthwhile
6258    to use generic vectors.  */
6259 int
6260 vect_min_worthwhile_factor (enum tree_code code)
6261 {
6262   switch (code)
6263     {
6264     case PLUS_EXPR:
6265     case MINUS_EXPR:
6266     case NEGATE_EXPR:
6267       return 4;
6268
6269     case BIT_AND_EXPR:
6270     case BIT_IOR_EXPR:
6271     case BIT_XOR_EXPR:
6272     case BIT_NOT_EXPR:
6273       return 2;
6274
6275     default:
6276       return INT_MAX;
6277     }
6278 }
6279
6280
6281 /* Function vectorizable_induction
6282
6283    Check if PHI performs an induction computation that can be vectorized.
6284    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
6285    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
6286    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
6287
6288 bool
6289 vectorizable_induction (gimple *phi,
6290                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6291                         gimple **vec_stmt)
6292 {
6293   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
6294   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6295   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6296   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6297   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6298   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6299   tree vec_def;
6300
6301   gcc_assert (ncopies >= 1);
6302   /* FORNOW. These restrictions should be relaxed.  */
6303   if (nested_in_vect_loop_p (loop, phi))
6304     {
6305       imm_use_iterator imm_iter;
6306       use_operand_p use_p;
6307       gimple *exit_phi;
6308       edge latch_e;
6309       tree loop_arg;
6310
6311       if (ncopies > 1)
6312         {
6313           if (dump_enabled_p ())
6314             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6315                              "multiple types in nested loop.\n");
6316           return false;
6317         }
6318
6319       exit_phi = NULL;
6320       latch_e = loop_latch_edge (loop->inner);
6321       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
6322       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
6323         {
6324           gimple *use_stmt = USE_STMT (use_p);
6325           if (is_gimple_debug (use_stmt))
6326             continue;
6327
6328           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
6329             {
6330               exit_phi = use_stmt;
6331               break;
6332             }
6333         }
6334       if (exit_phi)
6335         {
6336           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
6337           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
6338                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
6339             {
6340               if (dump_enabled_p ())
6341                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
6342                                  "inner-loop induction only used outside "
6343                                  "of the outer vectorized loop.\n");
6344               return false;
6345             }
6346         }
6347     }
6348
6349   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6350     return false;
6351
6352   /* FORNOW: SLP not supported.  */
6353   if (STMT_SLP_TYPE (stmt_info))
6354     return false;
6355
6356   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
6357
6358   if (gimple_code (phi) != GIMPLE_PHI)
6359     return false;
6360
6361   if (!vec_stmt) /* transformation not required.  */
6362     {
6363       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
6364       if (dump_enabled_p ())
6365         dump_printf_loc (MSG_NOTE, vect_location,
6366                          "=== vectorizable_induction ===\n");
6367       vect_model_induction_cost (stmt_info, ncopies);
6368       return true;
6369     }
6370
6371   /** Transform.  **/
6372
6373   if (dump_enabled_p ())
6374     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
6375
6376   vec_def = get_initial_def_for_induction (phi);
6377   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
6378   return true;
6379 }
6380
6381 /* Function vectorizable_live_operation.
6382
6383    STMT computes a value that is used outside the loop.  Check if
6384    it can be supported.  */
6385
6386 bool
6387 vectorizable_live_operation (gimple *stmt,
6388                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
6389                              slp_tree slp_node, int slp_index,
6390                              gimple **vec_stmt)
6391 {
6392   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
6393   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
6394   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6395   imm_use_iterator imm_iter;
6396   tree lhs, lhs_type, bitsize, vec_bitsize;
6397   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
6398   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
6399   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
6400   gimple *use_stmt;
6401   auto_vec<tree> vec_oprnds;
6402
6403   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
6404
6405   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
6406     return false;
6407
6408   /* FORNOW.  CHECKME.  */
6409   if (nested_in_vect_loop_p (loop, stmt))
6410     return false;
6411
6412   /* If STMT is not relevant and it is a simple assignment and its inputs are
6413      invariant then it can remain in place, unvectorized.  The original last
6414      scalar value that it computes will be used.  */
6415   if (!STMT_VINFO_RELEVANT_P (stmt_info))
6416     {
6417       gcc_assert (is_simple_and_all_uses_invariant (stmt, loop_vinfo));
6418       if (dump_enabled_p ())
6419         dump_printf_loc (MSG_NOTE, vect_location,
6420                          "statement is simple and uses invariant.  Leaving in "
6421                          "place.\n");
6422       return true;
6423     }
6424
6425   if (!vec_stmt)
6426     /* No transformation required.  */
6427     return true;
6428
6429   /* If stmt has a related stmt, then use that for getting the lhs.  */
6430   if (is_pattern_stmt_p (stmt_info))
6431     stmt = STMT_VINFO_RELATED_STMT (stmt_info);
6432
6433   lhs = (is_a <gphi *> (stmt)) ? gimple_phi_result (stmt)
6434         : gimple_get_lhs (stmt);
6435   lhs_type = TREE_TYPE (lhs);
6436
6437   /* Find all uses of STMT outside the loop - there should be at least one.  */
6438   auto_vec<gimple *, 4> worklist;
6439   FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, lhs)
6440     if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
6441         && !is_gimple_debug (use_stmt))
6442       worklist.safe_push (use_stmt);
6443   gcc_assert (worklist.length () >= 1);
6444
6445   bitsize = TYPE_SIZE (TREE_TYPE (vectype));
6446   vec_bitsize = TYPE_SIZE (vectype);
6447
6448   /* Get the vectorized lhs of STMT and the lane to use (counted in bits).  */
6449   tree vec_lhs, bitstart;
6450   if (slp_node)
6451     {
6452       gcc_assert (slp_index >= 0);
6453
6454       int num_scalar = SLP_TREE_SCALAR_STMTS (slp_node).length ();
6455       int num_vec = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
6456
6457       /* Get the last occurrence of the scalar index from the concatenation of
6458          all the slp vectors. Calculate which slp vector it is and the index
6459          within.  */
6460       int pos = (num_vec * nunits) - num_scalar + slp_index;
6461       int vec_entry = pos / nunits;
6462       int vec_index = pos % nunits;
6463
6464       /* Get the correct slp vectorized stmt.  */
6465       vec_lhs = gimple_get_lhs (SLP_TREE_VEC_STMTS (slp_node)[vec_entry]);
6466
6467       /* Get entry to use.  */
6468       bitstart = build_int_cst (unsigned_type_node, vec_index);
6469       bitstart = int_const_binop (MULT_EXPR, bitsize, bitstart);
6470     }
6471   else
6472     {
6473       enum vect_def_type dt = STMT_VINFO_DEF_TYPE (stmt_info);
6474       vec_lhs = vect_get_vec_def_for_operand_1 (stmt, dt);
6475
6476       /* For multiple copies, get the last copy.  */
6477       for (int i = 1; i < ncopies; ++i)
6478         vec_lhs = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type,
6479                                                   vec_lhs);
6480
6481       /* Get the last lane in the vector.  */
6482       bitstart = int_const_binop (MINUS_EXPR, vec_bitsize, bitsize);
6483     }
6484
6485   /* Create a new vectorized stmt for the uses of STMT and insert outside the
6486      loop.  */
6487   gimple_seq stmts = NULL;
6488   tree new_tree = build3 (BIT_FIELD_REF, TREE_TYPE (vectype), vec_lhs, bitsize,
6489                           bitstart);
6490   new_tree = force_gimple_operand (fold_convert (lhs_type, new_tree), &stmts,
6491                                    true, NULL_TREE);
6492   if (stmts)
6493     gsi_insert_seq_on_edge_immediate (single_exit (loop), stmts);
6494
6495   /* Replace all uses of the USE_STMT in the worklist with the newly inserted
6496      statement.  */
6497   while (!worklist.is_empty ())
6498     {
6499       use_stmt = worklist.pop ();
6500       replace_uses_by (gimple_phi_result (use_stmt), new_tree);
6501       update_stmt (use_stmt);
6502     }
6503
6504   return true;
6505 }
6506
6507 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
6508
6509 static void
6510 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
6511 {
6512   ssa_op_iter op_iter;
6513   imm_use_iterator imm_iter;
6514   def_operand_p def_p;
6515   gimple *ustmt;
6516
6517   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
6518     {
6519       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
6520         {
6521           basic_block bb;
6522
6523           if (!is_gimple_debug (ustmt))
6524             continue;
6525
6526           bb = gimple_bb (ustmt);
6527
6528           if (!flow_bb_inside_loop_p (loop, bb))
6529             {
6530               if (gimple_debug_bind_p (ustmt))
6531                 {
6532                   if (dump_enabled_p ())
6533                     dump_printf_loc (MSG_NOTE, vect_location,
6534                                      "killing debug use\n");
6535
6536                   gimple_debug_bind_reset_value (ustmt);
6537                   update_stmt (ustmt);
6538                 }
6539               else
6540                 gcc_unreachable ();
6541             }
6542         }
6543     }
6544 }
6545
6546
6547 /* This function builds ni_name = number of iterations.  Statements
6548    are emitted on the loop preheader edge.  */
6549
6550 static tree
6551 vect_build_loop_niters (loop_vec_info loop_vinfo)
6552 {
6553   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
6554   if (TREE_CODE (ni) == INTEGER_CST)
6555     return ni;
6556   else
6557     {
6558       tree ni_name, var;
6559       gimple_seq stmts = NULL;
6560       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6561
6562       var = create_tmp_var (TREE_TYPE (ni), "niters");
6563       ni_name = force_gimple_operand (ni, &stmts, false, var);
6564       if (stmts)
6565         gsi_insert_seq_on_edge_immediate (pe, stmts);
6566
6567       return ni_name;
6568     }
6569 }
6570
6571
6572 /* This function generates the following statements:
6573
6574    ni_name = number of iterations loop executes
6575    ratio = ni_name / vf
6576    ratio_mult_vf_name = ratio * vf
6577
6578    and places them on the loop preheader edge.  */
6579
6580 static void
6581 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
6582                                  tree ni_name,
6583                                  tree *ratio_mult_vf_name_ptr,
6584                                  tree *ratio_name_ptr)
6585 {
6586   tree ni_minus_gap_name;
6587   tree var;
6588   tree ratio_name;
6589   tree ratio_mult_vf_name;
6590   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6591   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
6592   tree log_vf;
6593
6594   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
6595
6596   /* If epilogue loop is required because of data accesses with gaps, we
6597      subtract one iteration from the total number of iterations here for
6598      correct calculation of RATIO.  */
6599   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6600     {
6601       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6602                                        ni_name,
6603                                        build_one_cst (TREE_TYPE (ni_name)));
6604       if (!is_gimple_val (ni_minus_gap_name))
6605         {
6606           var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
6607           gimple *stmts = NULL;
6608           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
6609                                                     true, var);
6610           gsi_insert_seq_on_edge_immediate (pe, stmts);
6611         }
6612     }
6613   else
6614     ni_minus_gap_name = ni_name;
6615
6616   /* Create: ratio = ni >> log2(vf) */
6617   /* ???  As we have ni == number of latch executions + 1, ni could
6618      have overflown to zero.  So avoid computing ratio based on ni
6619      but compute it using the fact that we know ratio will be at least
6620      one, thus via (ni - vf) >> log2(vf) + 1.  */
6621   ratio_name
6622     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
6623                    fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
6624                                 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
6625                                              ni_minus_gap_name,
6626                                              build_int_cst
6627                                                (TREE_TYPE (ni_name), vf)),
6628                                 log_vf),
6629                    build_int_cst (TREE_TYPE (ni_name), 1));
6630   if (!is_gimple_val (ratio_name))
6631     {
6632       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
6633       gimple *stmts = NULL;
6634       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
6635       gsi_insert_seq_on_edge_immediate (pe, stmts);
6636     }
6637   *ratio_name_ptr = ratio_name;
6638
6639   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
6640
6641   if (ratio_mult_vf_name_ptr)
6642     {
6643       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
6644                                         ratio_name, log_vf);
6645       if (!is_gimple_val (ratio_mult_vf_name))
6646         {
6647           var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
6648           gimple *stmts = NULL;
6649           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
6650                                                      true, var);
6651           gsi_insert_seq_on_edge_immediate (pe, stmts);
6652         }
6653       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
6654     }
6655
6656   return;
6657 }
6658
6659
6660 /* Function vect_transform_loop.
6661
6662    The analysis phase has determined that the loop is vectorizable.
6663    Vectorize the loop - create vectorized stmts to replace the scalar
6664    stmts in the loop, and update the loop exit condition.  */
6665
6666 void
6667 vect_transform_loop (loop_vec_info loop_vinfo)
6668 {
6669   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
6670   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
6671   int nbbs = loop->num_nodes;
6672   int i;
6673   tree ratio = NULL;
6674   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
6675   bool grouped_store;
6676   bool slp_scheduled = false;
6677   gimple *stmt, *pattern_stmt;
6678   gimple_seq pattern_def_seq = NULL;
6679   gimple_stmt_iterator pattern_def_si = gsi_none ();
6680   bool transform_pattern_stmt = false;
6681   bool check_profitability = false;
6682   int th;
6683   /* Record number of iterations before we started tampering with the profile. */
6684   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
6685
6686   if (dump_enabled_p ())
6687     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
6688
6689   /* If profile is imprecise, we have a chance to fix it up.  */
6690   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6691     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
6692
6693   /* Use the more conservative vectorization threshold.  If the number
6694      of iterations is constant assume the cost check has been performed
6695      by our caller.  If the threshold makes all loops profitable that
6696      run at least the vectorization factor number of times checking
6697      is pointless, too.  */
6698   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
6699   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
6700       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6701     {
6702       if (dump_enabled_p ())
6703         dump_printf_loc (MSG_NOTE, vect_location,
6704                          "Profitability threshold is %d loop iterations.\n",
6705                          th);
6706       check_profitability = true;
6707     }
6708
6709   /* Version the loop first, if required, so the profitability check
6710      comes first.  */
6711
6712   if (LOOP_REQUIRES_VERSIONING (loop_vinfo))
6713     {
6714       vect_loop_versioning (loop_vinfo, th, check_profitability);
6715       check_profitability = false;
6716     }
6717
6718   tree ni_name = vect_build_loop_niters (loop_vinfo);
6719   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
6720
6721   /* Peel the loop if there are data refs with unknown alignment.
6722      Only one data ref with unknown store is allowed.  */
6723
6724   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
6725     {
6726       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
6727                                      th, check_profitability);
6728       check_profitability = false;
6729       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
6730          be re-computed.  */
6731       ni_name = NULL_TREE;
6732     }
6733
6734   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
6735      compile time constant), or it is a constant that doesn't divide by the
6736      vectorization factor, then an epilog loop needs to be created.
6737      We therefore duplicate the loop: the original loop will be vectorized,
6738      and will compute the first (n/VF) iterations.  The second copy of the loop
6739      will remain scalar and will compute the remaining (n%VF) iterations.
6740      (VF is the vectorization factor).  */
6741
6742   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
6743       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
6744     {
6745       tree ratio_mult_vf;
6746       if (!ni_name)
6747         ni_name = vect_build_loop_niters (loop_vinfo);
6748       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6749                                        &ratio);
6750       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6751                                       th, check_profitability);
6752     }
6753   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6754     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6755                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6756   else
6757     {
6758       if (!ni_name)
6759         ni_name = vect_build_loop_niters (loop_vinfo);
6760       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6761     }
6762
6763   /* 1) Make sure the loop header has exactly two entries
6764      2) Make sure we have a preheader basic block.  */
6765
6766   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6767
6768   split_edge (loop_preheader_edge (loop));
6769
6770   /* FORNOW: the vectorizer supports only loops which body consist
6771      of one basic block (header + empty latch). When the vectorizer will
6772      support more involved loop forms, the order by which the BBs are
6773      traversed need to be reconsidered.  */
6774
6775   for (i = 0; i < nbbs; i++)
6776     {
6777       basic_block bb = bbs[i];
6778       stmt_vec_info stmt_info;
6779
6780       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6781            gsi_next (&si))
6782         {
6783           gphi *phi = si.phi ();
6784           if (dump_enabled_p ())
6785             {
6786               dump_printf_loc (MSG_NOTE, vect_location,
6787                                "------>vectorizing phi: ");
6788               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6789             }
6790           stmt_info = vinfo_for_stmt (phi);
6791           if (!stmt_info)
6792             continue;
6793
6794           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6795             vect_loop_kill_debug_uses (loop, phi);
6796
6797           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6798               && !STMT_VINFO_LIVE_P (stmt_info))
6799             continue;
6800
6801           if (STMT_VINFO_VECTYPE (stmt_info)
6802               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6803                   != (unsigned HOST_WIDE_INT) vectorization_factor)
6804               && dump_enabled_p ())
6805             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6806
6807           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6808             {
6809               if (dump_enabled_p ())
6810                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6811               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6812             }
6813         }
6814
6815       pattern_stmt = NULL;
6816       for (gimple_stmt_iterator si = gsi_start_bb (bb);
6817            !gsi_end_p (si) || transform_pattern_stmt;)
6818         {
6819           bool is_store;
6820
6821           if (transform_pattern_stmt)
6822             stmt = pattern_stmt;
6823           else
6824             {
6825               stmt = gsi_stmt (si);
6826               /* During vectorization remove existing clobber stmts.  */
6827               if (gimple_clobber_p (stmt))
6828                 {
6829                   unlink_stmt_vdef (stmt);
6830                   gsi_remove (&si, true);
6831                   release_defs (stmt);
6832                   continue;
6833                 }
6834             }
6835
6836           if (dump_enabled_p ())
6837             {
6838               dump_printf_loc (MSG_NOTE, vect_location,
6839                                "------>vectorizing statement: ");
6840               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6841             }
6842
6843           stmt_info = vinfo_for_stmt (stmt);
6844
6845           /* vector stmts created in the outer-loop during vectorization of
6846              stmts in an inner-loop may not have a stmt_info, and do not
6847              need to be vectorized.  */
6848           if (!stmt_info)
6849             {
6850               gsi_next (&si);
6851               continue;
6852             }
6853
6854           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6855             vect_loop_kill_debug_uses (loop, stmt);
6856
6857           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6858               && !STMT_VINFO_LIVE_P (stmt_info))
6859             {
6860               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6861                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6862                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6863                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6864                 {
6865                   stmt = pattern_stmt;
6866                   stmt_info = vinfo_for_stmt (stmt);
6867                 }
6868               else
6869                 {
6870                   gsi_next (&si);
6871                   continue;
6872                 }
6873             }
6874           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6875                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6876                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6877                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6878             transform_pattern_stmt = true;
6879
6880           /* If pattern statement has def stmts, vectorize them too.  */
6881           if (is_pattern_stmt_p (stmt_info))
6882             {
6883               if (pattern_def_seq == NULL)
6884                 {
6885                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6886                   pattern_def_si = gsi_start (pattern_def_seq);
6887                 }
6888               else if (!gsi_end_p (pattern_def_si))
6889                 gsi_next (&pattern_def_si);
6890               if (pattern_def_seq != NULL)
6891                 {
6892                   gimple *pattern_def_stmt = NULL;
6893                   stmt_vec_info pattern_def_stmt_info = NULL;
6894
6895                   while (!gsi_end_p (pattern_def_si))
6896                     {
6897                       pattern_def_stmt = gsi_stmt (pattern_def_si);
6898                       pattern_def_stmt_info
6899                         = vinfo_for_stmt (pattern_def_stmt);
6900                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6901                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6902                         break;
6903                       gsi_next (&pattern_def_si);
6904                     }
6905
6906                   if (!gsi_end_p (pattern_def_si))
6907                     {
6908                       if (dump_enabled_p ())
6909                         {
6910                           dump_printf_loc (MSG_NOTE, vect_location,
6911                                            "==> vectorizing pattern def "
6912                                            "stmt: ");
6913                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6914                                             pattern_def_stmt, 0);
6915                         }
6916
6917                       stmt = pattern_def_stmt;
6918                       stmt_info = pattern_def_stmt_info;
6919                     }
6920                   else
6921                     {
6922                       pattern_def_si = gsi_none ();
6923                       transform_pattern_stmt = false;
6924                     }
6925                 }
6926               else
6927                 transform_pattern_stmt = false;
6928             }
6929
6930           if (STMT_VINFO_VECTYPE (stmt_info))
6931             {
6932               unsigned int nunits
6933                 = (unsigned int)
6934                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6935               if (!STMT_SLP_TYPE (stmt_info)
6936                   && nunits != (unsigned int) vectorization_factor
6937                   && dump_enabled_p ())
6938                   /* For SLP VF is set according to unrolling factor, and not
6939                      to vector size, hence for SLP this print is not valid.  */
6940                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6941             }
6942
6943           /* SLP. Schedule all the SLP instances when the first SLP stmt is
6944              reached.  */
6945           if (STMT_SLP_TYPE (stmt_info))
6946             {
6947               if (!slp_scheduled)
6948                 {
6949                   slp_scheduled = true;
6950
6951                   if (dump_enabled_p ())
6952                     dump_printf_loc (MSG_NOTE, vect_location,
6953                                      "=== scheduling SLP instances ===\n");
6954
6955                   vect_schedule_slp (loop_vinfo);
6956                 }
6957
6958               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6959               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6960                 {
6961                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6962                     {
6963                       pattern_def_seq = NULL;
6964                       gsi_next (&si);
6965                     }
6966                   continue;
6967                 }
6968             }
6969
6970           /* -------- vectorize statement ------------ */
6971           if (dump_enabled_p ())
6972             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6973
6974           grouped_store = false;
6975           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6976           if (is_store)
6977             {
6978               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6979                 {
6980                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6981                      interleaving chain was completed - free all the stores in
6982                      the chain.  */
6983                   gsi_next (&si);
6984                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6985                 }
6986               else
6987                 {
6988                   /* Free the attached stmt_vec_info and remove the stmt.  */
6989                   gimple *store = gsi_stmt (si);
6990                   free_stmt_vec_info (store);
6991                   unlink_stmt_vdef (store);
6992                   gsi_remove (&si, true);
6993                   release_defs (store);
6994                 }
6995
6996               /* Stores can only appear at the end of pattern statements.  */
6997               gcc_assert (!transform_pattern_stmt);
6998               pattern_def_seq = NULL;
6999             }
7000           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
7001             {
7002               pattern_def_seq = NULL;
7003               gsi_next (&si);
7004             }
7005         }                       /* stmts in BB */
7006     }                           /* BBs in loop */
7007
7008   slpeel_make_loop_iterate_ntimes (loop, ratio);
7009
7010   /* Reduce loop iterations by the vectorization factor.  */
7011   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
7012                       expected_iterations / vectorization_factor);
7013   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
7014     {
7015       if (loop->nb_iterations_upper_bound != 0)
7016         loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
7017       if (loop->nb_iterations_likely_upper_bound != 0)
7018         loop->nb_iterations_likely_upper_bound
7019            = loop->nb_iterations_likely_upper_bound - 1;
7020     }
7021   loop->nb_iterations_upper_bound
7022     = wi::udiv_floor (loop->nb_iterations_upper_bound + 1,
7023                       vectorization_factor) - 1;
7024   loop->nb_iterations_likely_upper_bound
7025     = wi::udiv_floor (loop->nb_iterations_likely_upper_bound + 1,
7026                       vectorization_factor) - 1;
7027
7028   if (loop->any_estimate)
7029     {
7030       loop->nb_iterations_estimate
7031         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
7032        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
7033            && loop->nb_iterations_estimate != 0)
7034          loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
7035     }
7036
7037   if (dump_enabled_p ())
7038     {
7039       dump_printf_loc (MSG_NOTE, vect_location,
7040                        "LOOP VECTORIZED\n");
7041       if (loop->inner)
7042         dump_printf_loc (MSG_NOTE, vect_location,
7043                          "OUTER LOOP VECTORIZED\n");
7044       dump_printf (MSG_NOTE, "\n");
7045     }
7046
7047   /* Free SLP instances here because otherwise stmt reference counting
7048      won't work.  */
7049   slp_instance instance;
7050   FOR_EACH_VEC_ELT (LOOP_VINFO_SLP_INSTANCES (loop_vinfo), i, instance)
7051     vect_free_slp_instance (instance);
7052   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
7053   /* Clear-up safelen field since its value is invalid after vectorization
7054      since vectorized loop can have loop-carried dependencies.  */
7055   loop->safelen = 0;
7056 }
7057
7058 /* The code below is trying to perform simple optimization - revert
7059    if-conversion for masked stores, i.e. if the mask of a store is zero
7060    do not perform it and all stored value producers also if possible.
7061    For example,
7062      for (i=0; i<n; i++)
7063        if (c[i])
7064         {
7065           p1[i] += 1;
7066           p2[i] = p3[i] +2;
7067         }
7068    this transformation will produce the following semi-hammock:
7069
7070    if (!mask__ifc__42.18_165 == { 0, 0, 0, 0, 0, 0, 0, 0 })
7071      {
7072        vect__11.19_170 = MASK_LOAD (vectp_p1.20_168, 0B, mask__ifc__42.18_165);
7073        vect__12.22_172 = vect__11.19_170 + vect_cst__171;
7074        MASK_STORE (vectp_p1.23_175, 0B, mask__ifc__42.18_165, vect__12.22_172);
7075        vect__18.25_182 = MASK_LOAD (vectp_p3.26_180, 0B, mask__ifc__42.18_165);
7076        vect__19.28_184 = vect__18.25_182 + vect_cst__183;
7077        MASK_STORE (vectp_p2.29_187, 0B, mask__ifc__42.18_165, vect__19.28_184);
7078      }
7079 */
7080
7081 void
7082 optimize_mask_stores (struct loop *loop)
7083 {
7084   basic_block *bbs = get_loop_body (loop);
7085   unsigned nbbs = loop->num_nodes;
7086   unsigned i;
7087   basic_block bb;
7088   gimple_stmt_iterator gsi;
7089   gimple *stmt;
7090   auto_vec<gimple *> worklist;
7091
7092   vect_location = find_loop_location (loop);
7093   /* Pick up all masked stores in loop if any.  */
7094   for (i = 0; i < nbbs; i++)
7095     {
7096       bb = bbs[i];
7097       for (gsi = gsi_start_bb (bb); !gsi_end_p (gsi);
7098            gsi_next (&gsi))
7099         {
7100           stmt = gsi_stmt (gsi);
7101           if (is_gimple_call (stmt)
7102               && gimple_call_internal_p (stmt)
7103               && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
7104             worklist.safe_push (stmt);
7105         }
7106     }
7107
7108   free (bbs);
7109   if (worklist.is_empty ())
7110     return;
7111
7112   /* Loop has masked stores.  */
7113   while (!worklist.is_empty ())
7114     {
7115       gimple *last, *last_store;
7116       edge e, efalse;
7117       tree mask;
7118       basic_block store_bb, join_bb;
7119       gimple_stmt_iterator gsi_to;
7120       tree vdef, new_vdef;
7121       gphi *phi;
7122       tree vectype;
7123       tree zero;
7124
7125       last = worklist.pop ();
7126       mask = gimple_call_arg (last, 2);
7127       bb = gimple_bb (last);
7128       /* Create new bb.  */
7129       e = split_block (bb, last);
7130       join_bb = e->dest;
7131       store_bb = create_empty_bb (bb);
7132       add_bb_to_loop (store_bb, loop);
7133       e->flags = EDGE_TRUE_VALUE;
7134       efalse = make_edge (bb, store_bb, EDGE_FALSE_VALUE);
7135       /* Put STORE_BB to likely part.  */
7136       efalse->probability = PROB_UNLIKELY;
7137       store_bb->frequency = PROB_ALWAYS - EDGE_FREQUENCY (efalse);
7138       make_edge (store_bb, join_bb, EDGE_FALLTHRU);
7139       if (dom_info_available_p (CDI_DOMINATORS))
7140         set_immediate_dominator (CDI_DOMINATORS, store_bb, bb);
7141       if (dump_enabled_p ())
7142         dump_printf_loc (MSG_NOTE, vect_location,
7143                          "Create new block %d to sink mask stores.",
7144                          store_bb->index);
7145       /* Create vector comparison with boolean result.  */
7146       vectype = TREE_TYPE (mask);
7147       zero = build_zero_cst (vectype);
7148       stmt = gimple_build_cond (EQ_EXPR, mask, zero, NULL_TREE, NULL_TREE);
7149       gsi = gsi_last_bb (bb);
7150       gsi_insert_after (&gsi, stmt, GSI_SAME_STMT);
7151       /* Create new PHI node for vdef of the last masked store:
7152          .MEM_2 = VDEF <.MEM_1>
7153          will be converted to
7154          .MEM.3 = VDEF <.MEM_1>
7155          and new PHI node will be created in join bb
7156          .MEM_2 = PHI <.MEM_1, .MEM_3>
7157       */
7158       vdef = gimple_vdef (last);
7159       new_vdef = make_ssa_name (gimple_vop (cfun), last);
7160       gimple_set_vdef (last, new_vdef);
7161       phi = create_phi_node (vdef, join_bb);
7162       add_phi_arg (phi, new_vdef, EDGE_SUCC (store_bb, 0), UNKNOWN_LOCATION);
7163
7164       /* Put all masked stores with the same mask to STORE_BB if possible.  */
7165       while (true)
7166         {
7167           gimple_stmt_iterator gsi_from;
7168           gimple *stmt1 = NULL;
7169
7170           /* Move masked store to STORE_BB.  */
7171           last_store = last;
7172           gsi = gsi_for_stmt (last);
7173           gsi_from = gsi;
7174           /* Shift GSI to the previous stmt for further traversal.  */
7175           gsi_prev (&gsi);
7176           gsi_to = gsi_start_bb (store_bb);
7177           gsi_move_before (&gsi_from, &gsi_to);
7178           /* Setup GSI_TO to the non-empty block start.  */
7179           gsi_to = gsi_start_bb (store_bb);
7180           if (dump_enabled_p ())
7181             {
7182               dump_printf_loc (MSG_NOTE, vect_location,
7183                                "Move stmt to created bb\n");
7184               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, last, 0);
7185             }
7186           /* Move all stored value producers if possible.  */
7187           while (!gsi_end_p (gsi))
7188             {
7189               tree lhs;
7190               imm_use_iterator imm_iter;
7191               use_operand_p use_p;
7192               bool res;
7193
7194               /* Skip debug statements.  */
7195               if (is_gimple_debug (gsi_stmt (gsi)))
7196                 {
7197                   gsi_prev (&gsi);
7198                   continue;
7199                 }
7200               stmt1 = gsi_stmt (gsi);
7201               /* Do not consider statements writing to memory or having
7202                  volatile operand.  */
7203               if (gimple_vdef (stmt1)
7204                   || gimple_has_volatile_ops (stmt1))
7205                 break;
7206               gsi_from = gsi;
7207               gsi_prev (&gsi);
7208               lhs = gimple_get_lhs (stmt1);
7209               if (!lhs)
7210                 break;
7211
7212               /* LHS of vectorized stmt must be SSA_NAME.  */
7213               if (TREE_CODE (lhs) != SSA_NAME)
7214                 break;
7215
7216               if (!VECTOR_TYPE_P (TREE_TYPE (lhs)))
7217                 {
7218                   /* Remove dead scalar statement.  */
7219                   if (has_zero_uses (lhs))
7220                     {
7221                       gsi_remove (&gsi_from, true);
7222                       continue;
7223                     }
7224                 }
7225
7226               /* Check that LHS does not have uses outside of STORE_BB.  */
7227               res = true;
7228               FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
7229                 {
7230                   gimple *use_stmt;
7231                   use_stmt = USE_STMT (use_p);
7232                   if (is_gimple_debug (use_stmt))
7233                     continue;
7234                   if (gimple_bb (use_stmt) != store_bb)
7235                     {
7236                       res = false;
7237                       break;
7238                     }
7239                 }
7240               if (!res)
7241                 break;
7242
7243               if (gimple_vuse (stmt1)
7244                   && gimple_vuse (stmt1) != gimple_vuse (last_store))
7245                 break;
7246
7247               /* Can move STMT1 to STORE_BB.  */
7248               if (dump_enabled_p ())
7249                 {
7250                   dump_printf_loc (MSG_NOTE, vect_location,
7251                                    "Move stmt to created bb\n");
7252                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt1, 0);
7253                 }
7254               gsi_move_before (&gsi_from, &gsi_to);
7255               /* Shift GSI_TO for further insertion.  */
7256               gsi_prev (&gsi_to);
7257             }
7258           /* Put other masked stores with the same mask to STORE_BB.  */
7259           if (worklist.is_empty ()
7260               || gimple_call_arg (worklist.last (), 2) != mask
7261               || worklist.last () != stmt1)
7262             break;
7263           last = worklist.pop ();
7264         }
7265       add_phi_arg (phi, gimple_vuse (last_store), e, UNKNOWN_LOCATION);
7266     }
7267 }