gcc/tree-vect-loop.c
1 /* Loop Vectorization
2 Copyright (C) 2003-2014 Free Software Foundation, Inc.
3 Contributed by Dorit Naishlos <dorit@il.ibm.com> and
4 Ira Rosen <irar@il.ibm.com>
6 This file is part of GCC.
8 GCC is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 3, or (at your option) any later
11 version.
13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
18 You should have received a copy of the GNU General Public License
19 along with GCC; see the file COPYING3. If not see
20 <http://www.gnu.org/licenses/>. */
22 #include "config.h"
23 #include "system.h"
24 #include "coretypes.h"
25 #include "dumpfile.h"
26 #include "tm.h"
27 #include "tree.h"
28 #include "stor-layout.h"
29 #include "basic-block.h"
30 #include "gimple-pretty-print.h"
31 #include "tree-ssa-alias.h"
32 #include "internal-fn.h"
33 #include "gimple-expr.h"
34 #include "is-a.h"
35 #include "gimple.h"
36 #include "gimplify.h"
37 #include "gimple-iterator.h"
38 #include "gimplify-me.h"
39 #include "gimple-ssa.h"
40 #include "tree-phinodes.h"
41 #include "ssa-iterators.h"
42 #include "stringpool.h"
43 #include "tree-ssanames.h"
44 #include "tree-ssa-loop-ivopts.h"
45 #include "tree-ssa-loop-manip.h"
46 #include "tree-ssa-loop-niter.h"
47 #include "tree-pass.h"
48 #include "cfgloop.h"
49 #include "expr.h"
50 #include "recog.h"
51 #include "optabs.h"
52 #include "params.h"
53 #include "diagnostic-core.h"
54 #include "tree-chrec.h"
55 #include "tree-scalar-evolution.h"
56 #include "tree-vectorizer.h"
57 #include "target.h"
59 /* Loop Vectorization Pass.
61 This pass tries to vectorize loops.
63 For example, the vectorizer transforms the following simple loop:
65 short a[N]; short b[N]; short c[N]; int i;
67 for (i=0; i<N; i++){
68 a[i] = b[i] + c[i];
71 as if it was manually vectorized by rewriting the source code into:
73 typedef int __attribute__((mode(V8HI))) v8hi;
74 short a[N]; short b[N]; short c[N]; int i;
75 v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
76 v8hi va, vb, vc;
78 for (i=0; i<N/8; i++){
79 vb = pb[i];
80 vc = pc[i];
81 va = vb + vc;
82 pa[i] = va;
85 The main entry to this pass is vectorize_loops(), in which
86 the vectorizer applies a set of analyses on a given set of loops,
87 followed by the actual vectorization transformation for the loops that
88 had successfully passed the analysis phase.
89 Throughout this pass we make a distinction between two types of
90 data: scalars (which are represented by SSA_NAMES), and memory references
91 ("data-refs"). These two types of data require different handling both
92 during analysis and transformation. The types of data-refs that the
93 vectorizer currently supports are ARRAY_REFS whose base is an array DECL
94 (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
95 accesses are required to have a simple (consecutive) access pattern.
97 Analysis phase:
98 ===============
99 The driver for the analysis phase is vect_analyze_loop().
100 It applies a set of analyses, some of which rely on the scalar evolution
101 analyzer (scev) developed by Sebastian Pop.
103 During the analysis phase the vectorizer records some information
104 per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
105 loop, as well as general information about the loop as a whole, which is
106 recorded in a "loop_vec_info" struct attached to each loop.
108 Transformation phase:
109 =====================
110 The loop transformation phase scans all the stmts in the loop, and
111 creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
112 the loop that needs to be vectorized. It inserts the vector code sequence
113 just before the scalar stmt S, and records a pointer to the vector code
114 in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
115 attached to S). This pointer will be used for the vectorization of following
116 stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
117 otherwise, we rely on dead code elimination for removing it.
119 For example, say stmt S1 was vectorized into stmt VS1:
121 VS1: vb = px[i];
122 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
123 S2: a = b;
125 To vectorize stmt S2, the vectorizer first finds the stmt that defines
126 the operand 'b' (S1), and gets the relevant vector def 'vb' from the
127 vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)). The
128 resulting sequence would be:
130 VS1: vb = px[i];
131 S1: b = x[i]; STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
132 VS2: va = vb;
133 S2: a = b; STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
135 Operands that are not SSA_NAMEs, are data-refs that appear in
136 load/store operations (like 'x[i]' in S1), and are handled differently.
138 Target modeling:
139 =================
140 Currently the only target specific information that is used is the
141 size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
142 Targets that can support different sizes of vectors will, for now, need
143 to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD". More
144 flexibility will be added in the future.
146 Since we only vectorize operations whose vector form can be
147 expressed using existing tree codes, to verify that an operation is
148 supported, the vectorizer checks the relevant optab at the relevant
149 machine_mode (e.g., optab_handler (add_optab, V8HImode)). If
150 the value found is CODE_FOR_nothing, then there's no target support, and
151 we can't vectorize the stmt.
153 For additional information on this project see:
154 http://gcc.gnu.org/projects/tree-ssa/vectorization.html
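/* Illustrative sketch (not part of the original source): the kind of target
   support check described above, assuming the usual optab query API named in
   the comment.  For an addition on a vector of eight shorts:

     machine_mode vmode = V8HImode;
     if (optab_handler (add_optab, vmode) == CODE_FOR_nothing)
       return false;     -- no target support; the stmt cannot be vectorized

   Any statement whose vector form has no such handler is rejected.  */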
157 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
159 /* Function vect_determine_vectorization_factor
161 Determine the vectorization factor (VF). VF is the number of data elements
162 that are operated upon in parallel in a single iteration of the vectorized
163 loop. For example, when vectorizing a loop that operates on 4-byte elements,
164 on a target with a vector size (VS) of 16 bytes, the VF is set to 4, since 4
165 elements can fit in a single vector register.
167 We currently support vectorization of loops in which all types operated upon
168 are of the same size. Therefore this function currently sets VF according to
169 the size of the types operated upon, and fails if there are multiple sizes
170 in the loop.
172 VF is also the factor by which the loop iterations are strip-mined, e.g.:
173 original loop:
174 for (i=0; i<N; i++){
175 a[i] = b[i] + c[i];
178 vectorized loop:
179 for (i=0; i<N; i+=VF){
180 a[i:VF] = b[i:VF] + c[i:VF];
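/* A worked example (assuming a 16-byte vector size): if every statement in
   the loop operates on 2-byte 'short' elements, the vectype is a vector of
   eight shorts, so nunits = 8 and VF = 8; with 4-byte 'int' elements it
   would be 4.  The factor recorded below follows the smallest scalar type
   that appears, since that type yields the largest nunits.  */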
184 static bool
185 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
187 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
188 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
189 int nbbs = loop->num_nodes;
190 gimple_stmt_iterator si;
191 unsigned int vectorization_factor = 0;
192 tree scalar_type;
193 gimple phi;
194 tree vectype;
195 unsigned int nunits;
196 stmt_vec_info stmt_info;
197 int i;
198 HOST_WIDE_INT dummy;
199 gimple stmt, pattern_stmt = NULL;
200 gimple_seq pattern_def_seq = NULL;
201 gimple_stmt_iterator pattern_def_si = gsi_none ();
202 bool analyze_pattern_stmt = false;
204 if (dump_enabled_p ())
205 dump_printf_loc (MSG_NOTE, vect_location,
206 "=== vect_determine_vectorization_factor ===\n");
208 for (i = 0; i < nbbs; i++)
210 basic_block bb = bbs[i];
212 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
214 phi = gsi_stmt (si);
215 stmt_info = vinfo_for_stmt (phi);
216 if (dump_enabled_p ())
218 dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
219 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
220 dump_printf (MSG_NOTE, "\n");
223 gcc_assert (stmt_info);
225 if (STMT_VINFO_RELEVANT_P (stmt_info))
227 gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
228 scalar_type = TREE_TYPE (PHI_RESULT (phi));
230 if (dump_enabled_p ())
232 dump_printf_loc (MSG_NOTE, vect_location,
233 "get vectype for scalar type: ");
234 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
235 dump_printf (MSG_NOTE, "\n");
238 vectype = get_vectype_for_scalar_type (scalar_type);
239 if (!vectype)
241 if (dump_enabled_p ())
243 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
244 "not vectorized: unsupported "
245 "data-type ");
246 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
247 scalar_type);
248 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
250 return false;
252 STMT_VINFO_VECTYPE (stmt_info) = vectype;
254 if (dump_enabled_p ())
256 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
257 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
258 dump_printf (MSG_NOTE, "\n");
261 nunits = TYPE_VECTOR_SUBPARTS (vectype);
262 if (dump_enabled_p ())
263 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
264 nunits);
266 if (!vectorization_factor
267 || (nunits > vectorization_factor))
268 vectorization_factor = nunits;
272 for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
274 tree vf_vectype;
276 if (analyze_pattern_stmt)
277 stmt = pattern_stmt;
278 else
279 stmt = gsi_stmt (si);
281 stmt_info = vinfo_for_stmt (stmt);
283 if (dump_enabled_p ())
285 dump_printf_loc (MSG_NOTE, vect_location,
286 "==> examining statement: ");
287 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
288 dump_printf (MSG_NOTE, "\n");
291 gcc_assert (stmt_info);
293 /* Skip stmts which do not need to be vectorized. */
294 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
295 && !STMT_VINFO_LIVE_P (stmt_info))
296 || gimple_clobber_p (stmt))
298 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
299 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
300 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
301 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
303 stmt = pattern_stmt;
304 stmt_info = vinfo_for_stmt (pattern_stmt);
305 if (dump_enabled_p ())
307 dump_printf_loc (MSG_NOTE, vect_location,
308 "==> examining pattern statement: ");
309 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
310 dump_printf (MSG_NOTE, "\n");
313 else
315 if (dump_enabled_p ())
316 dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
317 gsi_next (&si);
318 continue;
321 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
322 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
323 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
324 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
325 analyze_pattern_stmt = true;
327 /* If a pattern statement has def stmts, analyze them too. */
328 if (is_pattern_stmt_p (stmt_info))
330 if (pattern_def_seq == NULL)
332 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
333 pattern_def_si = gsi_start (pattern_def_seq);
335 else if (!gsi_end_p (pattern_def_si))
336 gsi_next (&pattern_def_si);
337 if (pattern_def_seq != NULL)
339 gimple pattern_def_stmt = NULL;
340 stmt_vec_info pattern_def_stmt_info = NULL;
342 while (!gsi_end_p (pattern_def_si))
344 pattern_def_stmt = gsi_stmt (pattern_def_si);
345 pattern_def_stmt_info
346 = vinfo_for_stmt (pattern_def_stmt);
347 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
348 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
349 break;
350 gsi_next (&pattern_def_si);
353 if (!gsi_end_p (pattern_def_si))
355 if (dump_enabled_p ())
357 dump_printf_loc (MSG_NOTE, vect_location,
358 "==> examining pattern def stmt: ");
359 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
360 pattern_def_stmt, 0);
361 dump_printf (MSG_NOTE, "\n");
364 stmt = pattern_def_stmt;
365 stmt_info = pattern_def_stmt_info;
367 else
369 pattern_def_si = gsi_none ();
370 analyze_pattern_stmt = false;
373 else
374 analyze_pattern_stmt = false;
377 if (gimple_get_lhs (stmt) == NULL_TREE
378 /* MASK_STORE has no lhs, but is ok. */
379 && (!is_gimple_call (stmt)
380 || !gimple_call_internal_p (stmt)
381 || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
383 if (is_gimple_call (stmt))
385 /* Ignore calls with no lhs. These must be calls to
386 #pragma omp simd functions, and the vectorization factor
387 they really need can't be determined until
388 vectorizable_simd_clone_call. */
389 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
391 pattern_def_seq = NULL;
392 gsi_next (&si);
394 continue;
396 if (dump_enabled_p ())
398 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
399 "not vectorized: irregular stmt.");
400 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt,
402 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
404 return false;
407 if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
409 if (dump_enabled_p ())
411 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
412 "not vectorized: vector stmt in loop:");
413 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
414 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
416 return false;
419 if (STMT_VINFO_VECTYPE (stmt_info))
421 /* The only case when a vectype had been already set is for stmts
422 that contain a dataref, or for "pattern-stmts" (stmts
423 generated by the vectorizer to represent/replace a certain
424 idiom). */
425 gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
426 || is_pattern_stmt_p (stmt_info)
427 || !gsi_end_p (pattern_def_si));
428 vectype = STMT_VINFO_VECTYPE (stmt_info);
430 else
432 gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
433 if (is_gimple_call (stmt)
434 && gimple_call_internal_p (stmt)
435 && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
436 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
437 else
438 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
439 if (dump_enabled_p ())
441 dump_printf_loc (MSG_NOTE, vect_location,
442 "get vectype for scalar type: ");
443 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
444 dump_printf (MSG_NOTE, "\n");
446 vectype = get_vectype_for_scalar_type (scalar_type);
447 if (!vectype)
449 if (dump_enabled_p ())
451 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
452 "not vectorized: unsupported "
453 "data-type ");
454 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
455 scalar_type);
456 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
458 return false;
461 STMT_VINFO_VECTYPE (stmt_info) = vectype;
463 if (dump_enabled_p ())
465 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
466 dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
467 dump_printf (MSG_NOTE, "\n");
471 /* The vectorization factor is according to the smallest
472 scalar type (or the largest vector size, but we only
473 support one vector size per loop). */
474 scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
475 &dummy);
476 if (dump_enabled_p ())
478 dump_printf_loc (MSG_NOTE, vect_location,
479 "get vectype for scalar type: ");
480 dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
481 dump_printf (MSG_NOTE, "\n");
483 vf_vectype = get_vectype_for_scalar_type (scalar_type);
484 if (!vf_vectype)
486 if (dump_enabled_p ())
488 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
489 "not vectorized: unsupported data-type ");
490 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
491 scalar_type);
492 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
494 return false;
497 if ((GET_MODE_SIZE (TYPE_MODE (vectype))
498 != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
500 if (dump_enabled_p ())
502 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
503 "not vectorized: different sized vector "
504 "types in statement, ");
505 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
506 vectype);
507 dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
508 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
509 vf_vectype);
510 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
512 return false;
515 if (dump_enabled_p ())
517 dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
518 dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
519 dump_printf (MSG_NOTE, "\n");
522 nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
523 if (dump_enabled_p ())
524 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
525 if (!vectorization_factor
526 || (nunits > vectorization_factor))
527 vectorization_factor = nunits;
529 if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
531 pattern_def_seq = NULL;
532 gsi_next (&si);
537 /* TODO: Analyze cost. Decide if worth while to vectorize. */
538 if (dump_enabled_p ())
539 dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
540 vectorization_factor);
541 if (vectorization_factor <= 1)
543 if (dump_enabled_p ())
544 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
545 "not vectorized: unsupported data-type\n");
546 return false;
548 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
550 return true;
554 /* Function vect_is_simple_iv_evolution.
556 FORNOW: A simple evolution of an induction variable in the loop is
557 considered a polynomial evolution. */
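/* For illustration (hypothetical loop): scalar evolution describes a simple
   IV as a chrec of the form {init, +, step}_n.  For

     for (i = 0; i < n; i++)
       ...

   the access function of 'i' is {0, +, 1}_1 if the loop number is 1;
   evolution_part_in_loop_num then yields the step '1' and
   initial_condition_in_loop_num yields '0'.  A step that is itself a chrec
   (degree >= 2) is rejected by the check below.  */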
559 static bool
560 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
561 tree * step)
563 tree init_expr;
564 tree step_expr;
565 tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
566 basic_block bb;
568 /* When there is no evolution in this loop, the evolution function
569 is not "simple". */
570 if (evolution_part == NULL_TREE)
571 return false;
573 /* When the evolution is a polynomial of degree >= 2
574 the evolution function is not "simple". */
575 if (tree_is_chrec (evolution_part))
576 return false;
578 step_expr = evolution_part;
579 init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
581 if (dump_enabled_p ())
583 dump_printf_loc (MSG_NOTE, vect_location, "step: ");
584 dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
585 dump_printf (MSG_NOTE, ", init: ");
586 dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
587 dump_printf (MSG_NOTE, "\n");
590 *init = init_expr;
591 *step = step_expr;
593 if (TREE_CODE (step_expr) != INTEGER_CST
594 && (TREE_CODE (step_expr) != SSA_NAME
595 || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
596 && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
597 || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
598 && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
599 || !flag_associative_math)))
600 && (TREE_CODE (step_expr) != REAL_CST
601 || !flag_associative_math))
603 if (dump_enabled_p ())
604 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
605 "step unknown.\n");
606 return false;
609 return true;
612 /* Function vect_analyze_scalar_cycles_1.
614 Examine the cross iteration def-use cycles of scalar variables
615 in LOOP. LOOP_VINFO represents the loop that is now being
616 considered for vectorization (can be LOOP, or an outer-loop
617 enclosing LOOP). */
619 static void
620 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
622 basic_block bb = loop->header;
623 tree init, step;
624 auto_vec<gimple, 64> worklist;
625 gimple_phi_iterator gsi;
626 bool double_reduc;
628 if (dump_enabled_p ())
629 dump_printf_loc (MSG_NOTE, vect_location,
630 "=== vect_analyze_scalar_cycles ===\n");
632 /* First - identify all inductions. Reduction detection assumes that all the
633 inductions have been identified, therefore, this order must not be
634 changed. */
635 for (gsi = gsi_start_phis (bb); !gsi_end_p (gsi); gsi_next (&gsi))
637 gimple_phi phi = gsi.phi ();
638 tree access_fn = NULL;
639 tree def = PHI_RESULT (phi);
640 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
642 if (dump_enabled_p ())
644 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
645 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
646 dump_printf (MSG_NOTE, "\n");
649 /* Skip virtual phi's. The data dependences that are associated with
650 virtual defs/uses (i.e., memory accesses) are analyzed elsewhere. */
651 if (virtual_operand_p (def))
652 continue;
654 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
656 /* Analyze the evolution function. */
657 access_fn = analyze_scalar_evolution (loop, def);
658 if (access_fn)
660 STRIP_NOPS (access_fn);
661 if (dump_enabled_p ())
663 dump_printf_loc (MSG_NOTE, vect_location,
664 "Access function of PHI: ");
665 dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
666 dump_printf (MSG_NOTE, "\n");
668 STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
669 = evolution_part_in_loop_num (access_fn, loop->num);
672 if (!access_fn
673 || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
674 || (LOOP_VINFO_LOOP (loop_vinfo) != loop
675 && TREE_CODE (step) != INTEGER_CST))
677 worklist.safe_push (phi);
678 continue;
681 gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
683 if (dump_enabled_p ())
684 dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
685 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
689 /* Second - identify all reductions and nested cycles. */
690 while (worklist.length () > 0)
692 gimple phi = worklist.pop ();
693 tree def = PHI_RESULT (phi);
694 stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
695 gimple reduc_stmt;
696 bool nested_cycle;
698 if (dump_enabled_p ())
700 dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
701 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
702 dump_printf (MSG_NOTE, "\n");
705 gcc_assert (!virtual_operand_p (def)
706 && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
708 nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
709 reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
710 &double_reduc);
711 if (reduc_stmt)
713 if (double_reduc)
715 if (dump_enabled_p ())
716 dump_printf_loc (MSG_NOTE, vect_location,
717 "Detected double reduction.\n");
719 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
720 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
721 vect_double_reduction_def;
723 else
725 if (nested_cycle)
727 if (dump_enabled_p ())
728 dump_printf_loc (MSG_NOTE, vect_location,
729 "Detected vectorizable nested cycle.\n");
731 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
732 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
733 vect_nested_cycle;
735 else
737 if (dump_enabled_p ())
738 dump_printf_loc (MSG_NOTE, vect_location,
739 "Detected reduction.\n");
741 STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
742 STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
743 vect_reduction_def;
744 /* Store the reduction cycles for possible vectorization in
745 loop-aware SLP. */
746 LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
750 else
751 if (dump_enabled_p ())
752 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
753 "Unknown def-use cycle pattern.\n");
758 /* Function vect_analyze_scalar_cycles.
760 Examine the cross iteration def-use cycles of scalar variables, by
761 analyzing the loop-header PHIs of scalar variables. Classify each
762 cycle as one of the following: invariant, induction, reduction, unknown.
763 We do that for the loop represented by LOOP_VINFO, and also for its
764 inner-loop, if it exists.
765 Examples for scalar cycles:
767 Example1: reduction:
769 loop1:
770 for (i=0; i<N; i++)
771 sum += a[i];
773 Example2: induction:
775 loop2:
776 for (i=0; i<N; i++)
777 a[i] = i; */
779 static void
780 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
782 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
784 vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
786 /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
787 Reductions in such inner-loop therefore have different properties than
788 the reductions in the nest that gets vectorized:
789 1. When vectorized, they are executed in the same order as in the original
790 scalar loop, so we can't change the order of computation when
791 vectorizing them.
792 2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
793 current checks are too strict. */
795 if (loop->inner)
796 vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
800 /* Function vect_get_loop_niters.
802 Determine how many iterations the loop executes and place it
803 in NUMBER_OF_ITERATIONS. Place the number of latch iterations
804 in NUMBER_OF_ITERATIONSM1.
806 Return the loop exit condition. */
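/* For example, for 'for (i = 0; i < n; i++)' with n > 0 the latch executes
   n - 1 times, so NUMBER_OF_ITERATIONSM1 is n - 1 and NUMBER_OF_ITERATIONS
   (the number of header executions) is n.  As the ??? note below says, the
   "+ 1" wraps to zero when the latch executes UINT_MAX times.  */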
809 static gimple_cond
810 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
811 tree *number_of_iterationsm1)
813 tree niters;
815 if (dump_enabled_p ())
816 dump_printf_loc (MSG_NOTE, vect_location,
817 "=== get_loop_niters ===\n");
819 niters = number_of_latch_executions (loop);
820 *number_of_iterationsm1 = niters;
822 /* We want the number of loop header executions which is the number
823 of latch executions plus one.
824 ??? For UINT_MAX latch executions this number overflows to zero
825 for loops like do { n++; } while (n != 0); */
826 if (niters && !chrec_contains_undetermined (niters))
827 niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
828 build_int_cst (TREE_TYPE (niters), 1));
829 *number_of_iterations = niters;
831 return get_loop_exit_condition (loop);
835 /* Function bb_in_loop_p
837 Used as predicate for dfs order traversal of the loop bbs. */
839 static bool
840 bb_in_loop_p (const_basic_block bb, const void *data)
842 const struct loop *const loop = (const struct loop *)data;
843 if (flow_bb_inside_loop_p (loop, bb))
844 return true;
845 return false;
849 /* Function new_loop_vec_info.
851 Create and initialize a new loop_vec_info struct for LOOP, as well as
852 stmt_vec_info structs for all the stmts in LOOP. */
854 static loop_vec_info
855 new_loop_vec_info (struct loop *loop)
857 loop_vec_info res;
858 basic_block *bbs;
859 gimple_stmt_iterator si;
860 unsigned int i, nbbs;
862 res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
863 LOOP_VINFO_LOOP (res) = loop;
865 bbs = get_loop_body (loop);
867 /* Create/Update stmt_info for all stmts in the loop. */
868 for (i = 0; i < loop->num_nodes; i++)
870 basic_block bb = bbs[i];
872 /* BBs in a nested inner-loop will have been already processed (because
873 we will have called vect_analyze_loop_form for any nested inner-loop).
874 Therefore, for stmts in an inner-loop we just want to update the
875 STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
876 loop_info of the outer-loop we are currently considering to vectorize
877 (instead of the loop_info of the inner-loop).
878 For stmts in other BBs we need to create a stmt_info from scratch. */
879 if (bb->loop_father != loop)
881 /* Inner-loop bb. */
882 gcc_assert (loop->inner && bb->loop_father == loop->inner);
883 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
885 gimple phi = gsi_stmt (si);
886 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
887 loop_vec_info inner_loop_vinfo =
888 STMT_VINFO_LOOP_VINFO (stmt_info);
889 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
890 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
892 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
894 gimple stmt = gsi_stmt (si);
895 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
896 loop_vec_info inner_loop_vinfo =
897 STMT_VINFO_LOOP_VINFO (stmt_info);
898 gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
899 STMT_VINFO_LOOP_VINFO (stmt_info) = res;
902 else
904 /* bb in current nest. */
905 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
907 gimple phi = gsi_stmt (si);
908 gimple_set_uid (phi, 0);
909 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
912 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
914 gimple stmt = gsi_stmt (si);
915 gimple_set_uid (stmt, 0);
916 set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
921 /* CHECKME: We want to visit all BBs before their successors (except for
922 latch blocks, for which this assertion wouldn't hold). In the simple
923 case of the loop forms we allow, a dfs order of the BBs would be the same
924 as reversed postorder traversal, so we are safe. */
926 free (bbs);
927 bbs = XCNEWVEC (basic_block, loop->num_nodes);
928 nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
929 bbs, loop->num_nodes, loop);
930 gcc_assert (nbbs == loop->num_nodes);
932 LOOP_VINFO_BBS (res) = bbs;
933 LOOP_VINFO_NITERSM1 (res) = NULL;
934 LOOP_VINFO_NITERS (res) = NULL;
935 LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
936 LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
937 LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
938 LOOP_VINFO_VECTORIZABLE_P (res) = 0;
939 LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
940 LOOP_VINFO_VECT_FACTOR (res) = 0;
941 LOOP_VINFO_LOOP_NEST (res).create (3);
942 LOOP_VINFO_DATAREFS (res).create (10);
943 LOOP_VINFO_DDRS (res).create (10 * 10);
944 LOOP_VINFO_UNALIGNED_DR (res) = NULL;
945 LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
946 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
947 LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
948 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
949 LOOP_VINFO_GROUPED_STORES (res).create (10);
950 LOOP_VINFO_REDUCTIONS (res).create (10);
951 LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
952 LOOP_VINFO_SLP_INSTANCES (res).create (10);
953 LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
954 LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
955 LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
956 LOOP_VINFO_PEELING_FOR_NITER (res) = false;
957 LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
959 return res;
963 /* Function destroy_loop_vec_info.
965 Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
966 stmts in the loop. */
968 void
969 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
971 struct loop *loop;
972 basic_block *bbs;
973 int nbbs;
974 gimple_stmt_iterator si;
975 int j;
976 vec<slp_instance> slp_instances;
977 slp_instance instance;
978 bool swapped;
980 if (!loop_vinfo)
981 return;
983 loop = LOOP_VINFO_LOOP (loop_vinfo);
985 bbs = LOOP_VINFO_BBS (loop_vinfo);
986 nbbs = clean_stmts ? loop->num_nodes : 0;
987 swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
989 for (j = 0; j < nbbs; j++)
991 basic_block bb = bbs[j];
992 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
993 free_stmt_vec_info (gsi_stmt (si));
995 for (si = gsi_start_bb (bb); !gsi_end_p (si); )
997 gimple stmt = gsi_stmt (si);
999 /* We may have broken canonical form by moving a constant
1000 into RHS1 of a commutative op. Fix such occurrences. */
1001 if (swapped && is_gimple_assign (stmt))
1003 enum tree_code code = gimple_assign_rhs_code (stmt);
1005 if ((code == PLUS_EXPR
1006 || code == POINTER_PLUS_EXPR
1007 || code == MULT_EXPR)
1008 && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1009 swap_ssa_operands (stmt,
1010 gimple_assign_rhs1_ptr (stmt),
1011 gimple_assign_rhs2_ptr (stmt));
1014 /* Free stmt_vec_info. */
1015 free_stmt_vec_info (stmt);
1016 gsi_next (&si);
1020 free (LOOP_VINFO_BBS (loop_vinfo));
1021 vect_destroy_datarefs (loop_vinfo, NULL);
1022 free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1023 LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1024 LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1025 LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1026 slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1027 FOR_EACH_VEC_ELT (slp_instances, j, instance)
1028 vect_free_slp_instance (instance);
1030 LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1031 LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1032 LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1033 LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1035 delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1036 LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1038 destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1040 free (loop_vinfo);
1041 loop->aux = NULL;
1045 /* Function vect_analyze_loop_1.
1047 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1048 for it. The different analyses will record information in the
1049 loop_vec_info struct. This is a subset of the analyses applied in
1050 vect_analyze_loop, to be applied on an inner-loop nested in the loop
1051 that is now considered for (outer-loop) vectorization. */
1053 static loop_vec_info
1054 vect_analyze_loop_1 (struct loop *loop)
1056 loop_vec_info loop_vinfo;
1058 if (dump_enabled_p ())
1059 dump_printf_loc (MSG_NOTE, vect_location,
1060 "===== analyze_loop_nest_1 =====\n");
1062 /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.). */
1064 loop_vinfo = vect_analyze_loop_form (loop);
1065 if (!loop_vinfo)
1067 if (dump_enabled_p ())
1068 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1069 "bad inner-loop form.\n");
1070 return NULL;
1073 return loop_vinfo;
1077 /* Function vect_analyze_loop_form.
1079 Verify that certain CFG restrictions hold, including:
1080 - the loop has a pre-header
1081 - the loop has a single entry and exit
1082 - the loop exit condition is simple enough, and the number of iterations
1083 can be analyzed (a countable loop). */
1085 loop_vec_info
1086 vect_analyze_loop_form (struct loop *loop)
1088 loop_vec_info loop_vinfo;
1089 gimple_cond loop_cond;
1090 tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1091 loop_vec_info inner_loop_vinfo = NULL;
1093 if (dump_enabled_p ())
1094 dump_printf_loc (MSG_NOTE, vect_location,
1095 "=== vect_analyze_loop_form ===\n");
1097 /* Different restrictions apply when we are considering an inner-most loop,
1098 vs. an outer (nested) loop.
1099 (FORNOW. May want to relax some of these restrictions in the future). */
1101 if (!loop->inner)
1103 /* Inner-most loop. We currently require that the number of BBs is
1104 exactly 2 (the header and latch). Vectorizable inner-most loops
1105 look like this:
1107 (pre-header)
1109 header <--------+
1110 | | |
1111 | +--> latch --+
1113 (exit-bb) */
1115 if (loop->num_nodes != 2)
1117 if (dump_enabled_p ())
1118 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1119 "not vectorized: control flow in loop.\n");
1120 return NULL;
1123 if (empty_block_p (loop->header))
1125 if (dump_enabled_p ())
1126 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1127 "not vectorized: empty loop.\n");
1128 return NULL;
1131 else
1133 struct loop *innerloop = loop->inner;
1134 edge entryedge;
1136 /* Nested loop. We currently require that the loop is doubly-nested,
1137 contains a single inner loop, and the number of BBs is exactly 5.
1138 Vectorizable outer-loops look like this:
1140 (pre-header)
1142 header <---+
1144 inner-loop |
1146 tail ------+
1148 (exit-bb)
1150 The inner-loop has the properties expected of inner-most loops
1151 as described above. */
1153 if ((loop->inner)->inner || (loop->inner)->next)
1155 if (dump_enabled_p ())
1156 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1157 "not vectorized: multiple nested loops.\n");
1158 return NULL;
1161 /* Analyze the inner-loop. */
1162 inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1163 if (!inner_loop_vinfo)
1165 if (dump_enabled_p ())
1166 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1167 "not vectorized: Bad inner loop.\n");
1168 return NULL;
1171 if (!expr_invariant_in_loop_p (loop,
1172 LOOP_VINFO_NITERS (inner_loop_vinfo)))
1174 if (dump_enabled_p ())
1175 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1176 "not vectorized: inner-loop count not"
1177 " invariant.\n");
1178 destroy_loop_vec_info (inner_loop_vinfo, true);
1179 return NULL;
1182 if (loop->num_nodes != 5)
1184 if (dump_enabled_p ())
1185 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186 "not vectorized: control flow in loop.\n");
1187 destroy_loop_vec_info (inner_loop_vinfo, true);
1188 return NULL;
1191 gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1192 entryedge = EDGE_PRED (innerloop->header, 0);
1193 if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1194 entryedge = EDGE_PRED (innerloop->header, 1);
1196 if (entryedge->src != loop->header
1197 || !single_exit (innerloop)
1198 || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1200 if (dump_enabled_p ())
1201 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1202 "not vectorized: unsupported outerloop form.\n");
1203 destroy_loop_vec_info (inner_loop_vinfo, true);
1204 return NULL;
1207 if (dump_enabled_p ())
1208 dump_printf_loc (MSG_NOTE, vect_location,
1209 "Considering outer-loop vectorization.\n");
1212 if (!single_exit (loop)
1213 || EDGE_COUNT (loop->header->preds) != 2)
1215 if (dump_enabled_p ())
1217 if (!single_exit (loop))
1218 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1219 "not vectorized: multiple exits.\n");
1220 else if (EDGE_COUNT (loop->header->preds) != 2)
1221 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1222 "not vectorized: too many incoming edges.\n");
1224 if (inner_loop_vinfo)
1225 destroy_loop_vec_info (inner_loop_vinfo, true);
1226 return NULL;
1229 /* We assume that the loop exit condition is at the end of the loop, i.e.,
1230 that the loop is represented as a do-while (with a proper if-guard
1231 before the loop if needed), where the loop header contains all the
1232 executable statements, and the latch is empty. */
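/* A sketch of the expected shape (illustrative only):

     if (n > 0)                -- guard placed before the loop if needed
       do
         {
           ...body...;         -- all executable stmts sit in the header
           i++;
         }
       while (i < n);          -- exit test at the end; the latch stays empty

   Loops not in this form are rejected just below.  */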
1233 if (!empty_block_p (loop->latch)
1234 || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1236 if (dump_enabled_p ())
1237 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1238 "not vectorized: latch block not empty.\n");
1239 if (inner_loop_vinfo)
1240 destroy_loop_vec_info (inner_loop_vinfo, true);
1241 return NULL;
1244 /* Make sure there exists a single-predecessor exit bb: */
1245 if (!single_pred_p (single_exit (loop)->dest))
1247 edge e = single_exit (loop);
1248 if (!(e->flags & EDGE_ABNORMAL))
1250 split_loop_exit_edge (e);
1251 if (dump_enabled_p ())
1252 dump_printf (MSG_NOTE, "split exit edge.\n");
1254 else
1256 if (dump_enabled_p ())
1257 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1258 "not vectorized: abnormal loop exit edge.\n");
1259 if (inner_loop_vinfo)
1260 destroy_loop_vec_info (inner_loop_vinfo, true);
1261 return NULL;
1265 loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1266 &number_of_iterationsm1);
1267 if (!loop_cond)
1269 if (dump_enabled_p ())
1270 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271 "not vectorized: complicated exit condition.\n");
1272 if (inner_loop_vinfo)
1273 destroy_loop_vec_info (inner_loop_vinfo, true);
1274 return NULL;
1277 if (!number_of_iterations
1278 || chrec_contains_undetermined (number_of_iterations))
1280 if (dump_enabled_p ())
1281 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1282 "not vectorized: number of iterations cannot be "
1283 "computed.\n");
1284 if (inner_loop_vinfo)
1285 destroy_loop_vec_info (inner_loop_vinfo, true);
1286 return NULL;
1289 if (integer_zerop (number_of_iterations))
1291 if (dump_enabled_p ())
1292 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1293 "not vectorized: number of iterations = 0.\n");
1294 if (inner_loop_vinfo)
1295 destroy_loop_vec_info (inner_loop_vinfo, true);
1296 return NULL;
1299 loop_vinfo = new_loop_vec_info (loop);
1300 LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1301 LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1302 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1304 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1306 if (dump_enabled_p ())
1308 dump_printf_loc (MSG_NOTE, vect_location,
1309 "Symbolic number of iterations is ");
1310 dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1311 dump_printf (MSG_NOTE, "\n");
1315 STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1317 /* CHECKME: May want to keep it around in the future. */
1318 if (inner_loop_vinfo)
1319 destroy_loop_vec_info (inner_loop_vinfo, false);
1321 gcc_assert (!loop->aux);
1322 loop->aux = loop_vinfo;
1323 return loop_vinfo;
1327 /* Function vect_analyze_loop_operations.
1329 Scan the loop stmts and make sure they are all vectorizable. */
1331 static bool
1332 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1334 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1335 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1336 int nbbs = loop->num_nodes;
1337 gimple_stmt_iterator si;
1338 unsigned int vectorization_factor = 0;
1339 int i;
1340 gimple phi;
1341 stmt_vec_info stmt_info;
1342 bool need_to_vectorize = false;
1343 int min_profitable_iters;
1344 int min_scalar_loop_bound;
1345 unsigned int th;
1346 bool only_slp_in_loop = true, ok;
1347 HOST_WIDE_INT max_niter;
1348 HOST_WIDE_INT estimated_niter;
1349 int min_profitable_estimate;
1351 if (dump_enabled_p ())
1352 dump_printf_loc (MSG_NOTE, vect_location,
1353 "=== vect_analyze_loop_operations ===\n");
1355 gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1356 vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1357 if (slp)
1359 /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1360 vectorization factor of the loop is the unrolling factor required by
1361 the SLP instances. If that unrolling factor is 1, we say that we
1362 perform pure SLP on the loop; cross-iteration parallelism is not
1363 exploited.
1364 for (i = 0; i < nbbs; i++)
1366 basic_block bb = bbs[i];
1367 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1369 gimple stmt = gsi_stmt (si);
1370 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1371 gcc_assert (stmt_info);
1372 if ((STMT_VINFO_RELEVANT_P (stmt_info)
1373 || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1374 && !PURE_SLP_STMT (stmt_info))
1375 /* STMT needs both SLP and loop-based vectorization. */
1376 only_slp_in_loop = false;
1380 if (only_slp_in_loop)
1381 vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1382 else
1383 vectorization_factor = least_common_multiple (vectorization_factor,
1384 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1386 LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1387 if (dump_enabled_p ())
1388 dump_printf_loc (MSG_NOTE, vect_location,
1389 "Updating vectorization factor to %d\n",
1390 vectorization_factor);
1393 for (i = 0; i < nbbs; i++)
1395 basic_block bb = bbs[i];
1397 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1399 phi = gsi_stmt (si);
1400 ok = true;
1402 stmt_info = vinfo_for_stmt (phi);
1403 if (dump_enabled_p ())
1405 dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1406 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1407 dump_printf (MSG_NOTE, "\n");
1410 /* Inner-loop loop-closed exit phi in outer-loop vectorization
1411 (i.e., a phi in the tail of the outer-loop). */
1412 if (! is_loop_header_bb_p (bb))
1414 /* FORNOW: we currently don't support the case that these phis
1415 are not used in the outer loop (unless it is double reduction,
1416 i.e., this phi is vect_reduction_def), because this case
1417 requires us to actually do something here. */
1418 if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1419 || STMT_VINFO_LIVE_P (stmt_info))
1420 && STMT_VINFO_DEF_TYPE (stmt_info)
1421 != vect_double_reduction_def)
1423 if (dump_enabled_p ())
1424 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1425 "Unsupported loop-closed phi in "
1426 "outer-loop.\n");
1427 return false;
1430 /* If PHI is used in the outer loop, we check that its operand
1431 is defined in the inner loop. */
1432 if (STMT_VINFO_RELEVANT_P (stmt_info))
1434 tree phi_op;
1435 gimple op_def_stmt;
1437 if (gimple_phi_num_args (phi) != 1)
1438 return false;
1440 phi_op = PHI_ARG_DEF (phi, 0);
1441 if (TREE_CODE (phi_op) != SSA_NAME)
1442 return false;
1444 op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1445 if (gimple_nop_p (op_def_stmt)
1446 || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1447 || !vinfo_for_stmt (op_def_stmt))
1448 return false;
1450 if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1451 != vect_used_in_outer
1452 && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1453 != vect_used_in_outer_by_reduction)
1454 return false;
1457 continue;
1460 gcc_assert (stmt_info);
1462 if (STMT_VINFO_LIVE_P (stmt_info))
1464 /* FORNOW: not yet supported. */
1465 if (dump_enabled_p ())
1466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1467 "not vectorized: value used after loop.\n");
1468 return false;
1471 if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1472 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1474 /* A scalar-dependence cycle that we don't support. */
1475 if (dump_enabled_p ())
1476 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1477 "not vectorized: scalar dependence cycle.\n");
1478 return false;
1481 if (STMT_VINFO_RELEVANT_P (stmt_info))
1483 need_to_vectorize = true;
1484 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1485 ok = vectorizable_induction (phi, NULL, NULL);
1488 if (!ok)
1490 if (dump_enabled_p ())
1492 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1493 "not vectorized: relevant phi not "
1494 "supported: ");
1495 dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1496 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1498 return false;
1502 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1504 gimple stmt = gsi_stmt (si);
1505 if (!gimple_clobber_p (stmt)
1506 && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1507 return false;
1509 } /* bbs */
1511 /* All operations in the loop are either irrelevant (deal with loop
1512 control, or dead), or only used outside the loop and can be moved
1513 out of the loop (e.g. invariants, inductions). The loop can be
1514 optimized away by scalar optimizations. We're better off not
1515 touching this loop. */
1516 if (!need_to_vectorize)
1518 if (dump_enabled_p ())
1519 dump_printf_loc (MSG_NOTE, vect_location,
1520 "All the computation can be taken out of the loop.\n");
1521 if (dump_enabled_p ())
1522 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1523 "not vectorized: redundant loop. no profit to "
1524 "vectorize.\n");
1525 return false;
1528 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1529 dump_printf_loc (MSG_NOTE, vect_location,
1530 "vectorization_factor = %d, niters = "
1531 HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1532 LOOP_VINFO_INT_NITERS (loop_vinfo));
1534 if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1535 && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1536 || ((max_niter = max_stmt_executions_int (loop)) != -1
1537 && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1539 if (dump_enabled_p ())
1540 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1541 "not vectorized: iteration count too small.\n");
1542 if (dump_enabled_p ())
1543 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1544 "not vectorized: iteration count smaller than "
1545 "vectorization factor.\n");
1546 return false;
1549 /* Analyze cost. Decide if worth while to vectorize. */
1551 /* Once VF is set, SLP costs should be updated since the number of created
1552 vector stmts depends on VF. */
1553 vect_update_slp_costs_according_to_vf (loop_vinfo);
1555 vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1556 &min_profitable_estimate);
1557 LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1559 if (min_profitable_iters < 0)
1561 if (dump_enabled_p ())
1562 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1563 "not vectorized: vectorization not profitable.\n");
1564 if (dump_enabled_p ())
1565 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1566 "not vectorized: vector version will never be "
1567 "profitable.\n");
1568 return false;
1571 min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1572 * vectorization_factor) - 1);
1575 /* Use the cost model only if it is more conservative than the user-specified
1576 threshold. */
1578 th = (unsigned) min_scalar_loop_bound;
1579 if (min_profitable_iters
1580 && (!min_scalar_loop_bound
1581 || min_profitable_iters > min_scalar_loop_bound))
1582 th = (unsigned) min_profitable_iters;
1584 LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
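/* Worked example with hypothetical numbers: with --param min-vect-loop-bound=2
   and VF = 4, min_scalar_loop_bound is 2 * 4 - 1 = 7; if the cost model
   returned min_profitable_iters = 10, the more conservative value wins, th
   becomes 10, and a known iteration count of 10 or fewer is rejected below.  */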
1586 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1587 && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1589 if (dump_enabled_p ())
1590 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1591 "not vectorized: vectorization not profitable.\n");
1592 if (dump_enabled_p ())
1593 dump_printf_loc (MSG_NOTE, vect_location,
1594 "not vectorized: iteration count smaller than user "
1595 "specified loop bound parameter or minimum profitable "
1596 "iterations (whichever is more conservative).\n");
1597 return false;
1600 if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1601 && ((unsigned HOST_WIDE_INT) estimated_niter
1602 <= MAX (th, (unsigned)min_profitable_estimate)))
1604 if (dump_enabled_p ())
1605 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1606 "not vectorized: estimated iteration count too "
1607 "small.\n");
1608 if (dump_enabled_p ())
1609 dump_printf_loc (MSG_NOTE, vect_location,
1610 "not vectorized: estimated iteration count smaller "
1611 "than specified loop bound parameter or minimum "
1612 "profitable iterations (whichever is more "
1613 "conservative).\n");
1614 return false;
1617 return true;
1621 /* Function vect_analyze_loop_2.
1623 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1624 for it. The different analyses will record information in the
1625 loop_vec_info struct. */
1626 static bool
1627 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1629 bool ok, slp = false;
1630 int max_vf = MAX_VECTORIZATION_FACTOR;
1631 int min_vf = 2;
1632 unsigned int th;
1633 unsigned int n_stmts = 0;
1635 /* Find all data references in the loop (which correspond to vdefs/vuses)
1636 and analyze their evolution in the loop. Also adjust the minimal
1637 vectorization factor according to the loads and stores.
1639 FORNOW: Handle only simple array references whose
1640 alignment can be forced, and aligned pointer references. */
1642 ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1643 if (!ok)
1645 if (dump_enabled_p ())
1646 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1647 "bad data references.\n");
1648 return false;
1651 /* Analyze the access patterns of the data-refs in the loop (consecutive,
1652 complex, etc.). FORNOW: Only handle consecutive access pattern. */
1654 ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1655 if (!ok)
1657 if (dump_enabled_p ())
1658 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1659 "bad data access.\n");
1660 return false;
1663 /* Classify all cross-iteration scalar data-flow cycles.
1664 Cross-iteration cycles caused by virtual phis are analyzed separately. */
1666 vect_analyze_scalar_cycles (loop_vinfo);
1668 vect_pattern_recog (loop_vinfo, NULL);
1670 /* Data-flow analysis to detect stmts that do not need to be vectorized. */
1672 ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1673 if (!ok)
1675 if (dump_enabled_p ())
1676 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1677 "unexpected pattern.\n");
1678 return false;
1681 /* Analyze data dependences between the data-refs in the loop
1682 and adjust the maximum vectorization factor according to
1683 the dependences.
1684 FORNOW: fail at the first data dependence that we encounter. */
1686 ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1687 if (!ok
1688 || max_vf < min_vf)
1690 if (dump_enabled_p ())
1691 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1692 "bad data dependence.\n");
1693 return false;
1696 ok = vect_determine_vectorization_factor (loop_vinfo);
1697 if (!ok)
1699 if (dump_enabled_p ())
1700 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1701 "can't determine vectorization factor.\n");
1702 return false;
1704 if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1706 if (dump_enabled_p ())
1707 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1708 "bad data dependence.\n");
1709 return false;
1712 /* Analyze the alignment of the data-refs in the loop.
1713 Fail if a data reference is found that cannot be vectorized. */
1715 ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1716 if (!ok)
1718 if (dump_enabled_p ())
1719 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1720 "bad data alignment.\n");
1721 return false;
1724 /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1725 It is important to call pruning after vect_analyze_data_ref_accesses,
1726 since we use grouping information gathered by interleaving analysis. */
1727 ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1728 if (!ok)
1730 if (dump_enabled_p ())
1731 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1732 "number of versioning for alias "
1733 "run-time tests exceeds %d "
1734 "(--param vect-max-version-for-alias-checks)\n",
1735 PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1736 return false;
1739 /* This pass will decide on using loop versioning and/or loop peeling in
1740 order to enhance the alignment of data references in the loop. */
1742 ok = vect_enhance_data_refs_alignment (loop_vinfo);
1743 if (!ok)
1745 if (dump_enabled_p ())
1746 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1747 "bad data alignment.\n");
1748 return false;
1751 /* Check the SLP opportunities in the loop, analyze and build SLP trees. */
1752 ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1753 if (ok)
1755 /* Decide which possible SLP instances to SLP. */
1756 slp = vect_make_slp_decision (loop_vinfo);
1758 /* Find stmts that need to be both vectorized and SLPed. */
1759 vect_detect_hybrid_slp (loop_vinfo);
1761 else
1762 return false;
1764 /* Scan all the operations in the loop and make sure they are
1765 vectorizable. */
1767 ok = vect_analyze_loop_operations (loop_vinfo, slp);
1768 if (!ok)
1770 if (dump_enabled_p ())
1771 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1772 "bad operation or unsupported loop bound.\n");
1773 return false;
1776 /* Decide whether we need to create an epilogue loop to handle
1777 remaining scalar iterations. */
1778 th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1779 / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1780 * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1782 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1783 && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1785 if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1786 - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1787 < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1788 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1790 else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1791 || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1792 < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1793 /* In case of versioning, check if the maximum number of
1794 iterations is greater than th. If they are identical,
1795 the epilogue is unnecessary. */
1796 && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1797 && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1798 || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1799 (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1800 LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
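/* E.g. (illustrative numbers): with VF = 4, no peeling for alignment and a
   known iteration count of 10, ctz (10) = 1 is less than log2 (4) = 2, so
   the count is not a multiple of VF and an epilogue loop must handle the
   remaining 10 % 4 = 2 scalar iterations.  */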
1802 /* If an epilogue loop is required make sure we can create one. */
1803 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1804 || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1806 if (dump_enabled_p ())
1807 dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1808 if (!vect_can_advance_ivs_p (loop_vinfo)
1809 || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1810 single_exit (LOOP_VINFO_LOOP
1811 (loop_vinfo))))
1813 if (dump_enabled_p ())
1814 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1815 "not vectorized: can't create required "
1816 "epilog loop\n");
1817 return false;
1821 return true;
1824 /* Function vect_analyze_loop.
1826 Apply a set of analyses on LOOP, and create a loop_vec_info struct
1827 for it. The different analyses will record information in the
1828 loop_vec_info struct. */
1829 loop_vec_info
1830 vect_analyze_loop (struct loop *loop)
1832 loop_vec_info loop_vinfo;
1833 unsigned int vector_sizes;
1835 /* Autodetect first vector size we try. */
1836 current_vector_size = 0;
1837 vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1839 if (dump_enabled_p ())
1840 dump_printf_loc (MSG_NOTE, vect_location,
1841 "===== analyze_loop_nest =====\n");
1843 if (loop_outer (loop)
1844 && loop_vec_info_for_loop (loop_outer (loop))
1845 && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1847 if (dump_enabled_p ())
1848 dump_printf_loc (MSG_NOTE, vect_location,
1849 "outer-loop already vectorized.\n");
1850 return NULL;
1853 while (1)
1855 /* Check the CFG characteristics of the loop (nesting, entry/exit). */
1856 loop_vinfo = vect_analyze_loop_form (loop);
1857 if (!loop_vinfo)
1859 if (dump_enabled_p ())
1860 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1861 "bad loop form.\n");
1862 return NULL;
1865 if (vect_analyze_loop_2 (loop_vinfo))
1867 LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1869 return loop_vinfo;
1872 destroy_loop_vec_info (loop_vinfo, true);
1874 vector_sizes &= ~current_vector_size;
1875 if (vector_sizes == 0
1876 || current_vector_size == 0)
1877 return NULL;
1879 /* Try the next biggest vector size. */
1880 current_vector_size = 1 << floor_log2 (vector_sizes);
1881 if (dump_enabled_p ())
1882 dump_printf_loc (MSG_NOTE, vect_location,
1883 "***** Re-trying analysis with "
1884 "vector size %d\n", current_vector_size);
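      /* Worked example of the retry above (the sizes are hypothetical, not a
	 statement about any particular target): if
	 autovectorize_vector_sizes () returned 16 | 8 == 24 and analysis with
	 the autodetected 16-byte vectors failed, vector_sizes &= ~16 leaves 8
	 and 1 << floor_log2 (8) == 8 selects 8-byte vectors for the next
	 attempt; once vector_sizes reaches 0 we give up and return NULL.  */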
1889 /* Function reduction_code_for_scalar_code
1891 Input:
1892    CODE - tree_code of a reduction operation.
1894 Output:
1895 REDUC_CODE - the corresponding tree-code to be used to reduce the
1896 vector of partial results into a single scalar result (which
1897 will also reside in a vector) or ERROR_MARK if the operation is
1898         a supported reduction operation, but does not have such a tree-code.
1900 Return FALSE if CODE currently cannot be vectorized as reduction. */
1902 static bool
1903 reduction_code_for_scalar_code (enum tree_code code,
1904 enum tree_code *reduc_code)
1906 switch (code)
1908 case MAX_EXPR:
1909 *reduc_code = REDUC_MAX_EXPR;
1910 return true;
1912 case MIN_EXPR:
1913 *reduc_code = REDUC_MIN_EXPR;
1914 return true;
1916 case PLUS_EXPR:
1917 *reduc_code = REDUC_PLUS_EXPR;
1918 return true;
1920 case MULT_EXPR:
1921 case MINUS_EXPR:
1922 case BIT_IOR_EXPR:
1923 case BIT_XOR_EXPR:
1924 case BIT_AND_EXPR:
1925 *reduc_code = ERROR_MARK;
1926 return true;
1928 default:
1929 return false;
1934 /* Error reporting helper for vect_is_simple_reduction below. GIMPLE statement
1935 STMT is printed with a message MSG. */
1937 static void
1938 report_vect_op (int msg_type, gimple stmt, const char *msg)
1940 dump_printf_loc (msg_type, vect_location, "%s", msg);
1941 dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1942 dump_printf (msg_type, "\n");
1946 /* Detect SLP reduction of the form:
1948 #a1 = phi <a5, a0>
1949 a2 = operation (a1)
1950 a3 = operation (a2)
1951 a4 = operation (a3)
1952 a5 = operation (a4)
1954 #a = phi <a5>
1956 PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1957 FIRST_STMT is the first reduction stmt in the chain
1958 (a2 = operation (a1)).
1960 Return TRUE if a reduction chain was detected. */
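/* One source form that typically gives rise to the shape above (an
   illustrative assumption, not an example taken from this file):

     int sum = 0;
     for (i = 0; i < n; i++)
       {
         sum += a[4*i];
         sum += a[4*i + 1];
         sum += a[4*i + 2];
         sum += a[4*i + 3];
       }

   Each add in the body becomes one link a2..a5 of the chain feeding the
   reduction phi.  */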
1962 static bool
1963 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1965 struct loop *loop = (gimple_bb (phi))->loop_father;
1966 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1967 enum tree_code code;
1968 gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1969 stmt_vec_info use_stmt_info, current_stmt_info;
1970 tree lhs;
1971 imm_use_iterator imm_iter;
1972 use_operand_p use_p;
1973 int nloop_uses, size = 0, n_out_of_loop_uses;
1974 bool found = false;
1976 if (loop != vect_loop)
1977 return false;
1979 lhs = PHI_RESULT (phi);
1980 code = gimple_assign_rhs_code (first_stmt);
1981 while (1)
1983 nloop_uses = 0;
1984 n_out_of_loop_uses = 0;
1985 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1987 gimple use_stmt = USE_STMT (use_p);
1988 if (is_gimple_debug (use_stmt))
1989 continue;
1991 /* Check if we got back to the reduction phi. */
1992 if (use_stmt == phi)
1994 loop_use_stmt = use_stmt;
1995 found = true;
1996 break;
1999 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2001 if (vinfo_for_stmt (use_stmt)
2002 && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
2004 loop_use_stmt = use_stmt;
2005 nloop_uses++;
2008 else
2009 n_out_of_loop_uses++;
2011 	  /* There can be either a single use in the loop or two uses in
2012 phi nodes. */
2013 if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2014 return false;
2017 if (found)
2018 break;
2020 /* We reached a statement with no loop uses. */
2021 if (nloop_uses == 0)
2022 return false;
2024 /* This is a loop exit phi, and we haven't reached the reduction phi. */
2025 if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2026 return false;
2028 if (!is_gimple_assign (loop_use_stmt)
2029 || code != gimple_assign_rhs_code (loop_use_stmt)
2030 || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2031 return false;
2033 /* Insert USE_STMT into reduction chain. */
2034 use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2035 if (current_stmt)
2037 current_stmt_info = vinfo_for_stmt (current_stmt);
2038 GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2039 GROUP_FIRST_ELEMENT (use_stmt_info)
2040 = GROUP_FIRST_ELEMENT (current_stmt_info);
2042 else
2043 GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2045 lhs = gimple_assign_lhs (loop_use_stmt);
2046 current_stmt = loop_use_stmt;
2047 size++;
2050 if (!found || loop_use_stmt != phi || size < 2)
2051 return false;
2053 /* Swap the operands, if needed, to make the reduction operand be the second
2054 operand. */
2055 lhs = PHI_RESULT (phi);
2056 next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2057 while (next_stmt)
2059 if (gimple_assign_rhs2 (next_stmt) == lhs)
2061 tree op = gimple_assign_rhs1 (next_stmt);
2062 gimple def_stmt = NULL;
2064 if (TREE_CODE (op) == SSA_NAME)
2065 def_stmt = SSA_NAME_DEF_STMT (op);
2067 /* Check that the other def is either defined in the loop
2068 ("vect_internal_def"), or it's an induction (defined by a
2069 loop-header phi-node). */
2070 if (def_stmt
2071 && gimple_bb (def_stmt)
2072 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2073 && (is_gimple_assign (def_stmt)
2074 || is_gimple_call (def_stmt)
2075 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2076 == vect_induction_def
2077 || (gimple_code (def_stmt) == GIMPLE_PHI
2078 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2079 == vect_internal_def
2080 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2082 lhs = gimple_assign_lhs (next_stmt);
2083 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2084 continue;
2087 return false;
2089 else
2091 tree op = gimple_assign_rhs2 (next_stmt);
2092 gimple def_stmt = NULL;
2094 if (TREE_CODE (op) == SSA_NAME)
2095 def_stmt = SSA_NAME_DEF_STMT (op);
2097 /* Check that the other def is either defined in the loop
2098 ("vect_internal_def"), or it's an induction (defined by a
2099 loop-header phi-node). */
2100 if (def_stmt
2101 && gimple_bb (def_stmt)
2102 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2103 && (is_gimple_assign (def_stmt)
2104 || is_gimple_call (def_stmt)
2105 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2106 == vect_induction_def
2107 || (gimple_code (def_stmt) == GIMPLE_PHI
2108 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2109 == vect_internal_def
2110 && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2112 if (dump_enabled_p ())
2114 dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2115 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2116 dump_printf (MSG_NOTE, "\n");
2119 swap_ssa_operands (next_stmt,
2120 gimple_assign_rhs1_ptr (next_stmt),
2121 gimple_assign_rhs2_ptr (next_stmt));
2122 update_stmt (next_stmt);
2124 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2125 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2127 else
2128 return false;
2131 lhs = gimple_assign_lhs (next_stmt);
2132 next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2135 /* Save the chain for further analysis in SLP detection. */
2136 first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2137 LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2138 GROUP_SIZE (vinfo_for_stmt (first)) = size;
2140 return true;
2144 /* Function vect_is_simple_reduction_1
2146 (1) Detect a cross-iteration def-use cycle that represents a simple
2147 reduction computation. We look for the following pattern:
2149 loop_header:
2150 a1 = phi < a0, a2 >
2151 a3 = ...
2152 a2 = operation (a3, a1)
2156 a3 = ...
2157 loop_header:
2158 a1 = phi < a0, a2 >
2159 a2 = operation (a3, a1)
2161 such that:
2162 1. operation is commutative and associative and it is safe to
2163 change the order of the computation (if CHECK_REDUCTION is true)
2164 2. no uses for a2 in the loop (a2 is used out of the loop)
2165 3. no uses of a1 in the loop besides the reduction operation
2166 4. no uses of a1 outside the loop.
2168 Conditions 1,4 are tested here.
2169 Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2171 (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2172 nested cycles, if CHECK_REDUCTION is false.
2174 (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2175 reductions:
2177 a1 = phi < a0, a2 >
2178 inner loop (def of a3)
2179 a2 = phi < a3 >
2181 If MODIFY is true it tries also to rework the code in-place to enable
2182 detection of more reduction patterns. For the time being we rewrite
2183    "res -= RHS" into "res += -RHS" when it seems worthwhile.
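/* For instance (illustrative only), the simple sum

     s = 0.0;
     for (i = 0; i < n; i++)
       s += a[i];

   matches pattern (1): a1 is the loop-header phi for s, the operation is
   PLUS_EXPR with a3 holding the loaded a[i], and a2 = a3 + a1 has no other
   use inside the loop.  */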
2186 static gimple
2187 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2188 bool check_reduction, bool *double_reduc,
2189 bool modify)
2191 struct loop *loop = (gimple_bb (phi))->loop_father;
2192 struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2193 edge latch_e = loop_latch_edge (loop);
2194 tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2195 gimple def_stmt, def1 = NULL, def2 = NULL;
2196 enum tree_code orig_code, code;
2197 tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2198 tree type;
2199 int nloop_uses;
2200 tree name;
2201 imm_use_iterator imm_iter;
2202 use_operand_p use_p;
2203 bool phi_def;
2205 *double_reduc = false;
2207 /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2208 otherwise, we assume outer loop vectorization. */
2209 gcc_assert ((check_reduction && loop == vect_loop)
2210 || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2212 name = PHI_RESULT (phi);
2213 /* ??? If there are no uses of the PHI result the inner loop reduction
2214 won't be detected as possibly double-reduction by vectorizable_reduction
2215 because that tries to walk the PHI arg from the preheader edge which
2216 can be constant. See PR60382. */
2217 if (has_zero_uses (name))
2218 return NULL;
2219 nloop_uses = 0;
2220 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2222 gimple use_stmt = USE_STMT (use_p);
2223 if (is_gimple_debug (use_stmt))
2224 continue;
2226 if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2228 if (dump_enabled_p ())
2229 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2230 "intermediate value used outside loop.\n");
2232 return NULL;
2235 if (vinfo_for_stmt (use_stmt)
2236 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2237 nloop_uses++;
2238 if (nloop_uses > 1)
2240 if (dump_enabled_p ())
2241 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2242 "reduction used in loop.\n");
2243 return NULL;
2247 if (TREE_CODE (loop_arg) != SSA_NAME)
2249 if (dump_enabled_p ())
2251 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2252 "reduction: not ssa_name: ");
2253 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2254 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2256 return NULL;
2259 def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2260 if (!def_stmt)
2262 if (dump_enabled_p ())
2263 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2264 "reduction: no def_stmt.\n");
2265 return NULL;
2268 if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2270 if (dump_enabled_p ())
2272 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2273 dump_printf (MSG_NOTE, "\n");
2275 return NULL;
2278 if (is_gimple_assign (def_stmt))
2280 name = gimple_assign_lhs (def_stmt);
2281 phi_def = false;
2283 else
2285 name = PHI_RESULT (def_stmt);
2286 phi_def = true;
2289 nloop_uses = 0;
2290 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2292 gimple use_stmt = USE_STMT (use_p);
2293 if (is_gimple_debug (use_stmt))
2294 continue;
2295 if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2296 && vinfo_for_stmt (use_stmt)
2297 && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2298 nloop_uses++;
2299 if (nloop_uses > 1)
2301 if (dump_enabled_p ())
2302 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2303 "reduction used in loop.\n");
2304 return NULL;
2308 /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2309 defined in the inner loop. */
2310 if (phi_def)
2312 op1 = PHI_ARG_DEF (def_stmt, 0);
2314 if (gimple_phi_num_args (def_stmt) != 1
2315 || TREE_CODE (op1) != SSA_NAME)
2317 if (dump_enabled_p ())
2318 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2319 "unsupported phi node definition.\n");
2321 return NULL;
2324 def1 = SSA_NAME_DEF_STMT (op1);
2325 if (gimple_bb (def1)
2326 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2327 && loop->inner
2328 && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2329 && is_gimple_assign (def1))
2331 if (dump_enabled_p ())
2332 report_vect_op (MSG_NOTE, def_stmt,
2333 "detected double reduction: ");
2335 *double_reduc = true;
2336 return def_stmt;
2339 return NULL;
2342 code = orig_code = gimple_assign_rhs_code (def_stmt);
2344 /* We can handle "res -= x[i]", which is non-associative by
2345 simply rewriting this into "res += -x[i]". Avoid changing
2346 gimple instruction for the first simple tests and only do this
2347 if we're allowed to change code at all. */
2348 if (code == MINUS_EXPR
2349 && modify
2350 && (op1 = gimple_assign_rhs1 (def_stmt))
2351 && TREE_CODE (op1) == SSA_NAME
2352 && SSA_NAME_DEF_STMT (op1) == phi)
2353 code = PLUS_EXPR;
2355 if (check_reduction
2356 && (!commutative_tree_code (code) || !associative_tree_code (code)))
2358 if (dump_enabled_p ())
2359 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2360 "reduction: not commutative/associative: ");
2361 return NULL;
2364 if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2366 if (code != COND_EXPR)
2368 if (dump_enabled_p ())
2369 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2370 "reduction: not binary operation: ");
2372 return NULL;
2375 op3 = gimple_assign_rhs1 (def_stmt);
2376 if (COMPARISON_CLASS_P (op3))
2378 op4 = TREE_OPERAND (op3, 1);
2379 op3 = TREE_OPERAND (op3, 0);
2382 op1 = gimple_assign_rhs2 (def_stmt);
2383 op2 = gimple_assign_rhs3 (def_stmt);
2385 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2387 if (dump_enabled_p ())
2388 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2389 "reduction: uses not ssa_names: ");
2391 return NULL;
2394 else
2396 op1 = gimple_assign_rhs1 (def_stmt);
2397 op2 = gimple_assign_rhs2 (def_stmt);
2399 if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2401 if (dump_enabled_p ())
2402 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2403 "reduction: uses not ssa_names: ");
2405 return NULL;
2409 type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2410 if ((TREE_CODE (op1) == SSA_NAME
2411 && !types_compatible_p (type,TREE_TYPE (op1)))
2412 || (TREE_CODE (op2) == SSA_NAME
2413 && !types_compatible_p (type, TREE_TYPE (op2)))
2414 || (op3 && TREE_CODE (op3) == SSA_NAME
2415 && !types_compatible_p (type, TREE_TYPE (op3)))
2416 || (op4 && TREE_CODE (op4) == SSA_NAME
2417 && !types_compatible_p (type, TREE_TYPE (op4))))
2419 if (dump_enabled_p ())
2421 dump_printf_loc (MSG_NOTE, vect_location,
2422 "reduction: multiple types: operation type: ");
2423 dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2424 dump_printf (MSG_NOTE, ", operands types: ");
2425 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2426 TREE_TYPE (op1));
2427 dump_printf (MSG_NOTE, ",");
2428 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2429 TREE_TYPE (op2));
2430 if (op3)
2432 dump_printf (MSG_NOTE, ",");
2433 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2434 TREE_TYPE (op3));
2437 if (op4)
2439 dump_printf (MSG_NOTE, ",");
2440 dump_generic_expr (MSG_NOTE, TDF_SLIM,
2441 TREE_TYPE (op4));
2443 dump_printf (MSG_NOTE, "\n");
2446 return NULL;
2449 /* Check that it's ok to change the order of the computation.
2450 Generally, when vectorizing a reduction we change the order of the
2451 computation. This may change the behavior of the program in some
2452 cases, so we need to check that this is ok. One exception is when
2453 vectorizing an outer-loop: the inner-loop is executed sequentially,
2454 and therefore vectorizing reductions in the inner-loop during
2455 outer-loop vectorization is safe. */
2457 /* CHECKME: check for !flag_finite_math_only too? */
2458 if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2459 && check_reduction)
2461 /* Changing the order of operations changes the semantics. */
2462 if (dump_enabled_p ())
2463 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2464 "reduction: unsafe fp math optimization: ");
2465 return NULL;
2467 else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2468 && check_reduction)
2470 /* Changing the order of operations changes the semantics. */
2471 if (dump_enabled_p ())
2472 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2473 "reduction: unsafe int math optimization: ");
2474 return NULL;
2476 else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2478 /* Changing the order of operations changes the semantics. */
2479 if (dump_enabled_p ())
2480 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2481 "reduction: unsafe fixed-point math optimization: ");
2482 return NULL;
2485 /* If we detected "res -= x[i]" earlier, rewrite it into
2486 "res += -x[i]" now. If this turns out to be useless reassoc
2487 will clean it up again. */
2488 if (orig_code == MINUS_EXPR)
2490 tree rhs = gimple_assign_rhs2 (def_stmt);
2491 tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2492 gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2493 rhs, NULL);
2494 gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2495 set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2496 loop_info, NULL));
2497 gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2498 gimple_assign_set_rhs2 (def_stmt, negrhs);
2499 gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2500 update_stmt (def_stmt);
2503 /* Reduction is safe. We're dealing with one of the following:
2504 1) integer arithmetic and no trapv
2505 2) floating point arithmetic, and special flags permit this optimization
2506 3) nested cycle (i.e., outer loop vectorization). */
2507 if (TREE_CODE (op1) == SSA_NAME)
2508 def1 = SSA_NAME_DEF_STMT (op1);
2510 if (TREE_CODE (op2) == SSA_NAME)
2511 def2 = SSA_NAME_DEF_STMT (op2);
2513 if (code != COND_EXPR
2514 && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2516 if (dump_enabled_p ())
2517 report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2518 return NULL;
2521 /* Check that one def is the reduction def, defined by PHI,
2522 the other def is either defined in the loop ("vect_internal_def"),
2523 or it's an induction (defined by a loop-header phi-node). */
2525 if (def2 && def2 == phi
2526 && (code == COND_EXPR
2527 || !def1 || gimple_nop_p (def1)
2528 || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2529 || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2530 && (is_gimple_assign (def1)
2531 || is_gimple_call (def1)
2532 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2533 == vect_induction_def
2534 || (gimple_code (def1) == GIMPLE_PHI
2535 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2536 == vect_internal_def
2537 && !is_loop_header_bb_p (gimple_bb (def1)))))))
2539 if (dump_enabled_p ())
2540 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2541 return def_stmt;
2544 if (def1 && def1 == phi
2545 && (code == COND_EXPR
2546 || !def2 || gimple_nop_p (def2)
2547 || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2548 || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2549 && (is_gimple_assign (def2)
2550 || is_gimple_call (def2)
2551 || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2552 == vect_induction_def
2553 || (gimple_code (def2) == GIMPLE_PHI
2554 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2555 == vect_internal_def
2556 && !is_loop_header_bb_p (gimple_bb (def2)))))))
2558 if (check_reduction)
2560 /* Swap operands (just for simplicity - so that the rest of the code
2561 can assume that the reduction variable is always the last (second)
2562 argument). */
2563 if (dump_enabled_p ())
2564 report_vect_op (MSG_NOTE, def_stmt,
2565 "detected reduction: need to swap operands: ");
2567 swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2568 gimple_assign_rhs2_ptr (def_stmt));
2570 if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2571 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2573 else
2575 if (dump_enabled_p ())
2576 report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2579 return def_stmt;
2582 /* Try to find SLP reduction chain. */
2583 if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2585 if (dump_enabled_p ())
2586 report_vect_op (MSG_NOTE, def_stmt,
2587 "reduction: detected reduction chain: ");
2589 return def_stmt;
2592 if (dump_enabled_p ())
2593 report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2594 "reduction: unknown pattern: ");
2596 return NULL;
2599 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2600 in-place. Arguments as there. */
2602 static gimple
2603 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2604 bool check_reduction, bool *double_reduc)
2606 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2607 double_reduc, false);
2610 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2611 in-place if it enables detection of more reductions. Arguments
2612 as there. */
2614 gimple
2615 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2616 bool check_reduction, bool *double_reduc)
2618 return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2619 double_reduc, true);
2622 /* Calculate the cost of one scalar iteration of the loop. */
2624 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2626 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2627 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2628 int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2629 int innerloop_iters, i, stmt_cost;
2631 /* Count statements in scalar loop. Using this as scalar cost for a single
2632 iteration for now.
2634 TODO: Add outer loop support.
2636 TODO: Consider assigning different costs to different scalar
2637 statements. */
2639 /* FORNOW. */
2640 innerloop_iters = 1;
2641 if (loop->inner)
2642 innerloop_iters = 50; /* FIXME */
2644 for (i = 0; i < nbbs; i++)
2646 gimple_stmt_iterator si;
2647 basic_block bb = bbs[i];
2649 if (bb->loop_father == loop->inner)
2650 factor = innerloop_iters;
2651 else
2652 factor = 1;
2654 for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2656 gimple stmt = gsi_stmt (si);
2657 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2659 if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2660 continue;
2662 /* Skip stmts that are not vectorized inside the loop. */
2663 if (stmt_info
2664 && !STMT_VINFO_RELEVANT_P (stmt_info)
2665 && (!STMT_VINFO_LIVE_P (stmt_info)
2666 || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2667 && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2668 continue;
2670 if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2672 if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2673 stmt_cost = vect_get_stmt_cost (scalar_load);
2674 else
2675 stmt_cost = vect_get_stmt_cost (scalar_store);
2677 else
2678 stmt_cost = vect_get_stmt_cost (scalar_stmt);
2680 scalar_single_iter_cost += stmt_cost * factor;
2683 return scalar_single_iter_cost;
2686 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times. */
2688 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2689 int *peel_iters_epilogue,
2690 int scalar_single_iter_cost,
2691 stmt_vector_for_cost *prologue_cost_vec,
2692 stmt_vector_for_cost *epilogue_cost_vec)
2694 int retval = 0;
2695 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2697 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2699 *peel_iters_epilogue = vf/2;
2700 if (dump_enabled_p ())
2701 dump_printf_loc (MSG_NOTE, vect_location,
2702 "cost model: epilogue peel iters set to vf/2 "
2703 			 "cost model: epilogue peel iters set to vf/2 "
2703 			 "because loop iterations are unknown.\n");
2705 /* If peeled iterations are known but number of scalar loop
2706 iterations are unknown, count a taken branch per peeled loop. */
2707 retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2708 NULL, 0, vect_prologue);
2710 else
2712 int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2713 peel_iters_prologue = niters < peel_iters_prologue ?
2714 niters : peel_iters_prologue;
2715 *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2716 /* If we need to peel for gaps, but no peeling is required, we have to
2717 peel VF iterations. */
2718 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2719 *peel_iters_epilogue = vf;
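      /* Worked example (the numbers are illustrative): with niters == 100,
	 peel_iters_prologue == 3 and vf == 8, the epilogue gets
	 (100 - 3) % 8 == 1 iteration; had that remainder been 0 while peeling
	 for gaps was required, it would be bumped to vf == 8.  */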
2722 if (peel_iters_prologue)
2723 retval += record_stmt_cost (prologue_cost_vec,
2724 peel_iters_prologue * scalar_single_iter_cost,
2725 scalar_stmt, NULL, 0, vect_prologue);
2726 if (*peel_iters_epilogue)
2727 retval += record_stmt_cost (epilogue_cost_vec,
2728 *peel_iters_epilogue * scalar_single_iter_cost,
2729 scalar_stmt, NULL, 0, vect_epilogue);
2730 return retval;
2733 /* Function vect_estimate_min_profitable_iters
2735 Return the number of iterations required for the vector version of the
2736 loop to be profitable relative to the cost of the scalar version of the
2737 loop. */
2739 static void
2740 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2741 int *ret_min_profitable_niters,
2742 int *ret_min_profitable_estimate)
2744 int min_profitable_iters;
2745 int min_profitable_estimate;
2746 int peel_iters_prologue;
2747 int peel_iters_epilogue;
2748 unsigned vec_inside_cost = 0;
2749 int vec_outside_cost = 0;
2750 unsigned vec_prologue_cost = 0;
2751 unsigned vec_epilogue_cost = 0;
2752 int scalar_single_iter_cost = 0;
2753 int scalar_outside_cost = 0;
2754 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2755 int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2756 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2758 /* Cost model disabled. */
2759 if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2761 dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2762 *ret_min_profitable_niters = 0;
2763 *ret_min_profitable_estimate = 0;
2764 return;
2767 /* Requires loop versioning tests to handle misalignment. */
2768 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2770 /* FIXME: Make cost depend on complexity of individual check. */
2771 unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2772 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2773 vect_prologue);
2774 dump_printf (MSG_NOTE,
2775 "cost model: Adding cost of checks for loop "
2776 "versioning to treat misalignment.\n");
2779 /* Requires loop versioning with alias checks. */
2780 if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2782 /* FIXME: Make cost depend on complexity of individual check. */
2783 unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2784 (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2785 vect_prologue);
2786 dump_printf (MSG_NOTE,
2787 "cost model: Adding cost of checks for loop "
2788 "versioning aliasing.\n");
2791 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2792 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2793 (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2794 vect_prologue);
2796 /* Count statements in scalar loop. Using this as scalar cost for a single
2797 iteration for now.
2799 TODO: Add outer loop support.
2801 TODO: Consider assigning different costs to different scalar
2802 statements. */
2804 scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2806 /* Add additional cost for the peeled instructions in prologue and epilogue
2807 loop.
2809 FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2810 at compile-time - we assume it's vf/2 (the worst would be vf-1).
2812 TODO: Build an expression that represents peel_iters for prologue and
2813 epilogue to be used in a run-time test. */
2815 if (npeel < 0)
2817 peel_iters_prologue = vf/2;
2818 dump_printf (MSG_NOTE, "cost model: "
2819 "prologue peel iters set to vf/2.\n");
2821 /* If peeling for alignment is unknown, loop bound of main loop becomes
2822 unknown. */
2823 peel_iters_epilogue = vf/2;
2824 dump_printf (MSG_NOTE, "cost model: "
2825 "epilogue peel iters set to vf/2 because "
2826 "peeling for alignment is unknown.\n");
2828 /* If peeled iterations are unknown, count a taken branch and a not taken
2829 branch per peeled loop. Even if scalar loop iterations are known,
2830 vector iterations are not known since peeled prologue iterations are
2831 not known. Hence guards remain the same. */
2832 (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2833 NULL, 0, vect_prologue);
2834 (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2835 NULL, 0, vect_prologue);
2836 /* FORNOW: Don't attempt to pass individual scalar instructions to
2837 the model; just assume linear cost for scalar iterations. */
2838 (void) add_stmt_cost (target_cost_data,
2839 peel_iters_prologue * scalar_single_iter_cost,
2840 scalar_stmt, NULL, 0, vect_prologue);
2841 (void) add_stmt_cost (target_cost_data,
2842 peel_iters_epilogue * scalar_single_iter_cost,
2843 scalar_stmt, NULL, 0, vect_epilogue);
2845 else
2847 stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2848 stmt_info_for_cost *si;
2849 int j;
2850 void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2852 prologue_cost_vec.create (2);
2853 epilogue_cost_vec.create (2);
2854 peel_iters_prologue = npeel;
2856 (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2857 &peel_iters_epilogue,
2858 scalar_single_iter_cost,
2859 &prologue_cost_vec,
2860 &epilogue_cost_vec);
2862 FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2864 struct _stmt_vec_info *stmt_info
2865 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2866 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2867 si->misalign, vect_prologue);
2870 FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2872 struct _stmt_vec_info *stmt_info
2873 = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2874 (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2875 si->misalign, vect_epilogue);
2878 prologue_cost_vec.release ();
2879 epilogue_cost_vec.release ();
2882 /* FORNOW: The scalar outside cost is incremented in one of the
2883 following ways:
2885 1. The vectorizer checks for alignment and aliasing and generates
2886 a condition that allows dynamic vectorization. A cost model
2887 check is ANDED with the versioning condition. Hence scalar code
2888 path now has the added cost of the versioning check.
2890 if (cost > th & versioning_check)
2891 jmp to vector code
2893      Hence the run-time scalar cost is incremented by a not-taken branch cost.
2895 2. The vectorizer then checks if a prologue is required. If the
2896 cost model check was not done before during versioning, it has to
2897 be done before the prologue check.
2899 if (cost <= th)
2900 prologue = scalar_iters
2901 if (prologue == 0)
2902 jmp to vector code
2903 else
2904 execute prologue
2905 if (prologue == num_iters)
2906 go to exit
2908 Hence the run-time scalar cost is incremented by a taken branch,
2909 plus a not-taken branch, plus a taken branch cost.
2911 3. The vectorizer then checks if an epilogue is required. If the
2912 cost model check was not done before during prologue check, it
2913 has to be done with the epilogue check.
2915 if (prologue == 0)
2916 jmp to vector code
2917 else
2918 execute prologue
2919 if (prologue == num_iters)
2920 go to exit
2921 vector code:
2922 if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2923 jmp to epilogue
2925 Hence the run-time scalar cost should be incremented by 2 taken
2926 branches.
2928 TODO: The back end may reorder the BBS's differently and reverse
2929 conditions/branch directions. Change the estimates below to
2930 something more reasonable. */
2932 /* If the number of iterations is known and we do not do versioning, we can
2933 decide whether to vectorize at compile time. Hence the scalar version
2934      does not carry cost model guard costs.  */
2935 if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2936 || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2937 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2939 /* Cost model check occurs at versioning. */
2940 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2941 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2942 scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2943 else
2945 /* Cost model check occurs at prologue generation. */
2946 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2947 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2948 + vect_get_stmt_cost (cond_branch_not_taken);
2949 /* Cost model check occurs at epilogue generation. */
2950 else
2951 scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2955 /* Complete the target-specific cost calculations. */
2956 finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2957 &vec_inside_cost, &vec_epilogue_cost);
2959 vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2961 /* Calculate number of iterations required to make the vector version
2962 profitable, relative to the loop bodies only. The following condition
2963 must hold true:
2964 SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2965 where
2966 SIC = scalar iteration cost, VIC = vector iteration cost,
2967 VOC = vector outside cost, VF = vectorization factor,
2968 PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2969 SOC = scalar outside cost for run time cost model check. */
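  /* A purely illustrative instance of the formula (the costs are made up):
     with SIC = 4, VIC = 6, VF = 4, VOC = 20, SOC = 6 and no peeling, the
     quotient below is ((20 - 6) * 4) / (4 * 4 - 6) = 56 / 10 = 5; since
     4 * 4 * 5 = 80 <= 6 * 5 + (20 - 6) * 4 = 86 it is then bumped to 6,
     before the later clamp to VF and the final decrement.  */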
2971 if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2973 if (vec_outside_cost <= 0)
2974 min_profitable_iters = 1;
2975 else
2977 min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2978 - vec_inside_cost * peel_iters_prologue
2979 - vec_inside_cost * peel_iters_epilogue)
2980 / ((scalar_single_iter_cost * vf)
2981 - vec_inside_cost);
2983 if ((scalar_single_iter_cost * vf * min_profitable_iters)
2984 <= (((int) vec_inside_cost * min_profitable_iters)
2985 + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2986 min_profitable_iters++;
2989 /* vector version will never be profitable. */
2990 else
2992 if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
2993 warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
2994 "did not happen for a simd loop");
2996 if (dump_enabled_p ())
2997 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2998 "cost model: the vector iteration cost = %d "
2999 "divided by the scalar iteration cost = %d "
3000 			 "is greater than or equal to the vectorization factor = %d"
3001 ".\n",
3002 vec_inside_cost, scalar_single_iter_cost, vf);
3003 *ret_min_profitable_niters = -1;
3004 *ret_min_profitable_estimate = -1;
3005 return;
3008 if (dump_enabled_p ())
3010 dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3011 dump_printf (MSG_NOTE, " Vector inside of loop cost: %d\n",
3012 vec_inside_cost);
3013 dump_printf (MSG_NOTE, " Vector prologue cost: %d\n",
3014 vec_prologue_cost);
3015 dump_printf (MSG_NOTE, " Vector epilogue cost: %d\n",
3016 vec_epilogue_cost);
3017 dump_printf (MSG_NOTE, " Scalar iteration cost: %d\n",
3018 scalar_single_iter_cost);
3019 dump_printf (MSG_NOTE, " Scalar outside cost: %d\n",
3020 scalar_outside_cost);
3021 dump_printf (MSG_NOTE, " Vector outside cost: %d\n",
3022 vec_outside_cost);
3023 dump_printf (MSG_NOTE, " prologue iterations: %d\n",
3024 peel_iters_prologue);
3025 dump_printf (MSG_NOTE, " epilogue iterations: %d\n",
3026 peel_iters_epilogue);
3027 dump_printf (MSG_NOTE,
3028 " Calculated minimum iters for profitability: %d\n",
3029 min_profitable_iters);
3030 dump_printf (MSG_NOTE, "\n");
3033 min_profitable_iters =
3034 min_profitable_iters < vf ? vf : min_profitable_iters;
3036 /* Because the condition we create is:
3037 if (niters <= min_profitable_iters)
3038 then skip the vectorized loop. */
3039 min_profitable_iters--;
3041 if (dump_enabled_p ())
3042 dump_printf_loc (MSG_NOTE, vect_location,
3043 " Runtime profitability threshold = %d\n",
3044 min_profitable_iters);
3046 *ret_min_profitable_niters = min_profitable_iters;
3048 /* Calculate number of iterations required to make the vector version
3049 profitable, relative to the loop bodies only.
3051 Non-vectorized variant is SIC * niters and it must win over vector
3052 variant on the expected loop trip count. The following condition must hold true:
3053 SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC */
3055 if (vec_outside_cost <= 0)
3056 min_profitable_estimate = 1;
3057 else
3059 min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3060 - vec_inside_cost * peel_iters_prologue
3061 - vec_inside_cost * peel_iters_epilogue)
3062 / ((scalar_single_iter_cost * vf)
3063 - vec_inside_cost);
3065   min_profitable_estimate--;
3066 min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3067 if (dump_enabled_p ())
3068 dump_printf_loc (MSG_NOTE, vect_location,
3069 " Static estimate profitability threshold = %d\n",
3070 		     min_profitable_estimate);
3072 *ret_min_profitable_estimate = min_profitable_estimate;
3076 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3077 functions. Design better to avoid maintenance issues. */
3079 /* Function vect_model_reduction_cost.
3081 Models cost for a reduction operation, including the vector ops
3082 generated within the strip-mine loop, the initial definition before
3083 the loop, and the epilogue code that must be generated. */
3085 static bool
3086 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3087 int ncopies)
3089 int prologue_cost = 0, epilogue_cost = 0;
3090 enum tree_code code;
3091 optab optab;
3092 tree vectype;
3093 gimple stmt, orig_stmt;
3094 tree reduction_op;
3095 enum machine_mode mode;
3096 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3097 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3098 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3100 /* Cost of reduction op inside loop. */
3101 unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3102 stmt_info, 0, vect_body);
3103 stmt = STMT_VINFO_STMT (stmt_info);
3105 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3107 case GIMPLE_SINGLE_RHS:
3108 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3109 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3110 break;
3111 case GIMPLE_UNARY_RHS:
3112 reduction_op = gimple_assign_rhs1 (stmt);
3113 break;
3114 case GIMPLE_BINARY_RHS:
3115 reduction_op = gimple_assign_rhs2 (stmt);
3116 break;
3117 case GIMPLE_TERNARY_RHS:
3118 reduction_op = gimple_assign_rhs3 (stmt);
3119 break;
3120 default:
3121 gcc_unreachable ();
3124 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3125 if (!vectype)
3127 if (dump_enabled_p ())
3129 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3130 "unsupported data-type ");
3131 dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3132 TREE_TYPE (reduction_op));
3133 dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3135 return false;
3138 mode = TYPE_MODE (vectype);
3139 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3141 if (!orig_stmt)
3142 orig_stmt = STMT_VINFO_STMT (stmt_info);
3144 code = gimple_assign_rhs_code (orig_stmt);
3146 /* Add in cost for initial definition. */
3147 prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3148 stmt_info, 0, vect_prologue);
3150 /* Determine cost of epilogue code.
3152 We have a reduction operator that will reduce the vector in one statement.
3153 Also requires scalar extract. */
3155 if (!nested_in_vect_loop_p (loop, orig_stmt))
3157 if (reduc_code != ERROR_MARK)
3159 epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3160 stmt_info, 0, vect_epilogue);
3161 epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3162 stmt_info, 0, vect_epilogue);
3164 else
3166 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3167 tree bitsize =
3168 TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3169 int element_bitsize = tree_to_uhwi (bitsize);
3170 int nelements = vec_size_in_bits / element_bitsize;
3172 optab = optab_for_tree_code (code, vectype, optab_default);
3174 /* We have a whole vector shift available. */
3175 if (VECTOR_MODE_P (mode)
3176 && optab_handler (optab, mode) != CODE_FOR_nothing
3177 && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3179 /* Final reduction via vector shifts and the reduction operator.
3180 Also requires scalar extract. */
3181 epilogue_cost += add_stmt_cost (target_cost_data,
3182 exact_log2 (nelements) * 2,
3183 vector_stmt, stmt_info, 0,
3184 vect_epilogue);
3185 epilogue_cost += add_stmt_cost (target_cost_data, 1,
3186 vec_to_scalar, stmt_info, 0,
3187 vect_epilogue);
3189 else
3190 /* Use extracts and reduction op for final reduction. For N
3191 elements, we have N extracts and N-1 reduction ops. */
3192 epilogue_cost += add_stmt_cost (target_cost_data,
3193 nelements + nelements - 1,
3194 vector_stmt, stmt_info, 0,
3195 vect_epilogue);
3199 if (dump_enabled_p ())
3200 dump_printf (MSG_NOTE,
3201 "vect_model_reduction_cost: inside_cost = %d, "
3202 		 "prologue_cost = %d, epilogue_cost = %d.\n", inside_cost,
3203 prologue_cost, epilogue_cost);
3205 return true;
3209 /* Function vect_model_induction_cost.
3211 Models cost for induction operations. */
3213 static void
3214 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3216 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3217 void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3218 unsigned inside_cost, prologue_cost;
3220 /* loop cost for vec_loop. */
3221 inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3222 stmt_info, 0, vect_body);
3224 /* prologue cost for vec_init and vec_step. */
3225 prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3226 stmt_info, 0, vect_prologue);
3228 if (dump_enabled_p ())
3229 dump_printf_loc (MSG_NOTE, vect_location,
3230 "vect_model_induction_cost: inside_cost = %d, "
3231 		   "prologue_cost = %d.\n", inside_cost, prologue_cost);
3235 /* Function get_initial_def_for_induction
3237 Input:
3238 STMT - a stmt that performs an induction operation in the loop.
3239 IV_PHI - the initial value of the induction variable
3241 Output:
3242 Return a vector variable, initialized with the first VF values of
3243 the induction variable. E.g., for an iv with IV_PHI='X' and
3244 evolution S, for a vector of 4 units, we want to return:
3245 [X, X + S, X + 2*S, X + 3*S]. */
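/* A concrete instance (illustrative): with X = 0, S = 1, a vectorization
   factor of 4 and 4 units per vector, the returned vector is [0, 1, 2, 3]
   and the loop step vector built below is
   [VF*S, VF*S, VF*S, VF*S] = [4, 4, 4, 4].  */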
3247 static tree
3248 get_initial_def_for_induction (gimple iv_phi)
3250 stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3251 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3252 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3253 tree vectype;
3254 int nunits;
3255 edge pe = loop_preheader_edge (loop);
3256 struct loop *iv_loop;
3257 basic_block new_bb;
3258 tree new_vec, vec_init, vec_step, t;
3259 tree new_var;
3260 tree new_name;
3261 gimple init_stmt, new_stmt;
3262 gimple_phi induction_phi;
3263 tree induc_def, vec_def, vec_dest;
3264 tree init_expr, step_expr;
3265 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3266 int i;
3267 int ncopies;
3268 tree expr;
3269 stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3270 bool nested_in_vect_loop = false;
3271 gimple_seq stmts = NULL;
3272 imm_use_iterator imm_iter;
3273 use_operand_p use_p;
3274 gimple exit_phi;
3275 edge latch_e;
3276 tree loop_arg;
3277 gimple_stmt_iterator si;
3278 basic_block bb = gimple_bb (iv_phi);
3279 tree stepvectype;
3280 tree resvectype;
3282 /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop? */
3283 if (nested_in_vect_loop_p (loop, iv_phi))
3285 nested_in_vect_loop = true;
3286 iv_loop = loop->inner;
3288 else
3289 iv_loop = loop;
3290 gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3292 latch_e = loop_latch_edge (iv_loop);
3293 loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3295 step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3296 gcc_assert (step_expr != NULL_TREE);
3298 pe = loop_preheader_edge (iv_loop);
3299 init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3300 loop_preheader_edge (iv_loop));
3302 vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3303 resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3304 gcc_assert (vectype);
3305 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3306 ncopies = vf / nunits;
3308 gcc_assert (phi_info);
3309 gcc_assert (ncopies >= 1);
3311 /* Convert the step to the desired type. */
3312 step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3313 step_expr),
3314 &stmts, true, NULL_TREE);
3315 if (stmts)
3317 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3318 gcc_assert (!new_bb);
3321 /* Find the first insertion point in the BB. */
3322 si = gsi_after_labels (bb);
3324 /* Create the vector that holds the initial_value of the induction. */
3325 if (nested_in_vect_loop)
3327 /* iv_loop is nested in the loop to be vectorized. init_expr had already
3328 been created during vectorization of previous stmts. We obtain it
3329 from the STMT_VINFO_VEC_STMT of the defining stmt. */
3330 vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3331 /* If the initial value is not of proper type, convert it. */
3332 if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3334 new_stmt = gimple_build_assign_with_ops
3335 (VIEW_CONVERT_EXPR,
3336 vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3337 build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3338 vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3339 gimple_assign_set_lhs (new_stmt, vec_init);
3340 new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3341 new_stmt);
3342 gcc_assert (!new_bb);
3343 set_vinfo_for_stmt (new_stmt,
3344 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3347 else
3349 vec<constructor_elt, va_gc> *v;
3351 /* iv_loop is the loop to be vectorized. Create:
3352 vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr) */
3353 new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3354 vect_scalar_var, "var_");
3355 new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3356 init_expr),
3357 &stmts, false, new_var);
3358 if (stmts)
3360 new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3361 gcc_assert (!new_bb);
3364 vec_alloc (v, nunits);
3365 bool constant_p = is_gimple_min_invariant (new_name);
3366 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3367 for (i = 1; i < nunits; i++)
3369 /* Create: new_name_i = new_name + step_expr */
3370 new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3371 new_name, step_expr);
3372 if (!is_gimple_min_invariant (new_name))
3374 init_stmt = gimple_build_assign (new_var, new_name);
3375 new_name = make_ssa_name (new_var, init_stmt);
3376 gimple_assign_set_lhs (init_stmt, new_name);
3377 new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3378 gcc_assert (!new_bb);
3379 if (dump_enabled_p ())
3381 dump_printf_loc (MSG_NOTE, vect_location,
3382 "created new init_stmt: ");
3383 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3384 dump_printf (MSG_NOTE, "\n");
3386 constant_p = false;
3388 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3390 /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1] */
3391 if (constant_p)
3392 new_vec = build_vector_from_ctor (vectype, v);
3393 else
3394 new_vec = build_constructor (vectype, v);
3395 vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3399 /* Create the vector that holds the step of the induction. */
3400 if (nested_in_vect_loop)
3401 /* iv_loop is nested in the loop to be vectorized. Generate:
3402 vec_step = [S, S, S, S] */
3403 new_name = step_expr;
3404 else
3406 /* iv_loop is the loop to be vectorized. Generate:
3407 vec_step = [VF*S, VF*S, VF*S, VF*S] */
3408 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3410 expr = build_int_cst (integer_type_node, vf);
3411 expr = fold_convert (TREE_TYPE (step_expr), expr);
3413 else
3414 expr = build_int_cst (TREE_TYPE (step_expr), vf);
3415 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3416 expr, step_expr);
3417 if (TREE_CODE (step_expr) == SSA_NAME)
3418 new_name = vect_init_vector (iv_phi, new_name,
3419 TREE_TYPE (step_expr), NULL);
3422 t = unshare_expr (new_name);
3423 gcc_assert (CONSTANT_CLASS_P (new_name)
3424 || TREE_CODE (new_name) == SSA_NAME);
3425 stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3426 gcc_assert (stepvectype);
3427 new_vec = build_vector_from_val (stepvectype, t);
3428 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3431 /* Create the following def-use cycle:
3432 loop prolog:
3433 vec_init = ...
3434 vec_step = ...
3435 loop:
3436 vec_iv = PHI <vec_init, vec_loop>
3438 STMT
3440 vec_loop = vec_iv + vec_step; */
3442 /* Create the induction-phi that defines the induction-operand. */
3443 vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3444 induction_phi = create_phi_node (vec_dest, iv_loop->header);
3445 set_vinfo_for_stmt (induction_phi,
3446 new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3447 induc_def = PHI_RESULT (induction_phi);
3449 /* Create the iv update inside the loop */
3450 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3451 induc_def, vec_step);
3452 vec_def = make_ssa_name (vec_dest, new_stmt);
3453 gimple_assign_set_lhs (new_stmt, vec_def);
3454 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3455 set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3456 NULL));
3458 /* Set the arguments of the phi node: */
3459 add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3460 add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3461 UNKNOWN_LOCATION);
3464 /* In case that vectorization factor (VF) is bigger than the number
3465 of elements that we can fit in a vectype (nunits), we have to generate
3466 more than one vector stmt - i.e - we need to "unroll" the
3467 vector stmt by a factor VF/nunits. For more details see documentation
3468 in vectorizable_operation. */
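   /* Illustrative case (our numbers): with VF = 8 and nunits = 4 we need
      ncopies = 8 / 4 = 2 vector stmts, and the extra copy advances by a step
      vector of [nunits*S, ...] = [4*S, 4*S, 4*S, 4*S], which is what the
      code below builds.  */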
3470 if (ncopies > 1)
3472 stmt_vec_info prev_stmt_vinfo;
3473 /* FORNOW. This restriction should be relaxed. */
3474 gcc_assert (!nested_in_vect_loop);
3476 /* Create the vector that holds the step of the induction. */
3477 if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3479 expr = build_int_cst (integer_type_node, nunits);
3480 expr = fold_convert (TREE_TYPE (step_expr), expr);
3482 else
3483 expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3484 new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3485 expr, step_expr);
3486 if (TREE_CODE (step_expr) == SSA_NAME)
3487 new_name = vect_init_vector (iv_phi, new_name,
3488 TREE_TYPE (step_expr), NULL);
3489 t = unshare_expr (new_name);
3490 gcc_assert (CONSTANT_CLASS_P (new_name)
3491 || TREE_CODE (new_name) == SSA_NAME);
3492 new_vec = build_vector_from_val (stepvectype, t);
3493 vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3495 vec_def = induc_def;
3496 prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3497 for (i = 1; i < ncopies; i++)
3499 /* vec_i = vec_prev + vec_step */
3500 new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3501 vec_def, vec_step);
3502 vec_def = make_ssa_name (vec_dest, new_stmt);
3503 gimple_assign_set_lhs (new_stmt, vec_def);
3505 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3506 if (!useless_type_conversion_p (resvectype, vectype))
3508 new_stmt = gimple_build_assign_with_ops
3509 (VIEW_CONVERT_EXPR,
3510 vect_get_new_vect_var (resvectype, vect_simple_var,
3511 "vec_iv_"),
3512 build1 (VIEW_CONVERT_EXPR, resvectype,
3513 gimple_assign_lhs (new_stmt)), NULL_TREE);
3514 gimple_assign_set_lhs (new_stmt,
3515 make_ssa_name
3516 (gimple_assign_lhs (new_stmt), new_stmt));
3517 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3519 set_vinfo_for_stmt (new_stmt,
3520 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3521 STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3522 prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3526 if (nested_in_vect_loop)
3528 /* Find the loop-closed exit-phi of the induction, and record
3529 the final vector of induction results: */
3530 exit_phi = NULL;
3531 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3533 gimple use_stmt = USE_STMT (use_p);
3534 if (is_gimple_debug (use_stmt))
3535 continue;
3537 if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3539 exit_phi = use_stmt;
3540 break;
3543 if (exit_phi)
3545 stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3546 /* FORNOW. Currently not supporting the case that an inner-loop induction
3547 is not used in the outer-loop (i.e. only outside the outer-loop). */
3548 gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3549 && !STMT_VINFO_LIVE_P (stmt_vinfo));
3551 STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3552 if (dump_enabled_p ())
3554 dump_printf_loc (MSG_NOTE, vect_location,
3555 "vector of inductions after inner-loop:");
3556 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3557 dump_printf (MSG_NOTE, "\n");
3563 if (dump_enabled_p ())
3565 dump_printf_loc (MSG_NOTE, vect_location,
3566 "transform induction: created def-use cycle: ");
3567 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3568 dump_printf (MSG_NOTE, "\n");
3569 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3570 SSA_NAME_DEF_STMT (vec_def), 0);
3571 dump_printf (MSG_NOTE, "\n");
3574 STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3575 if (!useless_type_conversion_p (resvectype, vectype))
3577 new_stmt = gimple_build_assign_with_ops
3578 (VIEW_CONVERT_EXPR,
3579 vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3580 build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3581 induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3582 gimple_assign_set_lhs (new_stmt, induc_def);
3583 si = gsi_after_labels (bb);
3584 gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3585 set_vinfo_for_stmt (new_stmt,
3586 new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3587 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3588 = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3591 return induc_def;
3595 /* Function get_initial_def_for_reduction
3597 Input:
3598 STMT - a stmt that performs a reduction operation in the loop.
3599 INIT_VAL - the initial value of the reduction variable
3601 Output:
3602 ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3603 of the reduction (used for adjusting the epilog - see below).
3604 Return a vector variable, initialized according to the operation that STMT
3605 performs. This vector will be used as the initial value of the
3606 vector of partial results.
3608 Option1 (adjust in epilog): Initialize the vector as follows:
3609 add/bit or/xor: [0,0,...,0,0]
3610 mult/bit and: [1,1,...,1,1]
3611 min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3612 and when necessary (e.g. add/mult case) let the caller know
3613 that it needs to adjust the result by init_val.
3615 Option2: Initialize the vector as follows:
3616 add/bit or/xor: [init_val,0,0,...,0]
3617 mult/bit and: [init_val,1,1,...,1]
3618 min/max/cond_expr: [init_val,init_val,...,init_val]
3619 and no adjustments are needed.
3621 For example, for the following code:
3623 s = init_val;
3624 for (i=0;i<n;i++)
3625 s = s + a[i];
3627 STMT is 's = s + a[i]', and the reduction variable is 's'.
3628 For a vector of 4 units, we want to return either [0,0,0,init_val],
3629 or [0,0,0,0] and let the caller know that it needs to adjust
3630 the result at the end by 'init_val'.
3632    FORNOW, we use Option1 (the 'adjust in epilog' scheme) when ADJUSTMENT_DEF
3633    is not NULL, because its initialization vector is simpler (the same element
3634    in all entries), and Option2 otherwise.
3636 A cost model should help decide between these two schemes. */
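/* Concrete instance (illustrative): for the sum above with init_val = 5 and
   a vector of 4 units, Option1 returns [0,0,0,0] and reports an adjustment
   of 5 for the epilogue, while Option2 returns a vector with init_val = 5
   in one lane and the neutral value 0 in the others, needing no
   adjustment.  */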
3638 tree
3639 get_initial_def_for_reduction (gimple stmt, tree init_val,
3640 tree *adjustment_def)
3642 stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3643 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3644 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3645 tree scalar_type = TREE_TYPE (init_val);
3646 tree vectype = get_vectype_for_scalar_type (scalar_type);
3647 int nunits;
3648 enum tree_code code = gimple_assign_rhs_code (stmt);
3649 tree def_for_init;
3650 tree init_def;
3651 tree *elts;
3652 int i;
3653 bool nested_in_vect_loop = false;
3654 tree init_value;
3655 REAL_VALUE_TYPE real_init_val = dconst0;
3656 int int_init_val = 0;
3657 gimple def_stmt = NULL;
3659 gcc_assert (vectype);
3660 nunits = TYPE_VECTOR_SUBPARTS (vectype);
3662 gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3663 || SCALAR_FLOAT_TYPE_P (scalar_type));
3665 if (nested_in_vect_loop_p (loop, stmt))
3666 nested_in_vect_loop = true;
3667 else
3668 gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3670 /* In case of double reduction we only create a vector variable to be put
3671 in the reduction phi node. The actual statement creation is done in
3672 vect_create_epilog_for_reduction. */
3673 if (adjustment_def && nested_in_vect_loop
3674 && TREE_CODE (init_val) == SSA_NAME
3675 && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3676 && gimple_code (def_stmt) == GIMPLE_PHI
3677 && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3678 && vinfo_for_stmt (def_stmt)
3679 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3680 == vect_double_reduction_def)
3682 *adjustment_def = NULL;
3683 return vect_create_destination_var (init_val, vectype);
3686 if (TREE_CONSTANT (init_val))
3688 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3689 init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3690 else
3691 init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3693 else
3694 init_value = init_val;
3696 switch (code)
3698 case WIDEN_SUM_EXPR:
3699 case DOT_PROD_EXPR:
3700 case SAD_EXPR:
3701 case PLUS_EXPR:
3702 case MINUS_EXPR:
3703 case BIT_IOR_EXPR:
3704 case BIT_XOR_EXPR:
3705 case MULT_EXPR:
3706 case BIT_AND_EXPR:
3707 /* ADJUSTMENT_DEF is NULL when called from
3708 vect_create_epilog_for_reduction to vectorize double reduction. */
3709 if (adjustment_def)
3711 if (nested_in_vect_loop)
3712 *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3713 NULL);
3714 else
3715 *adjustment_def = init_val;
3718 if (code == MULT_EXPR)
3720 real_init_val = dconst1;
3721 int_init_val = 1;
3724 if (code == BIT_AND_EXPR)
3725 int_init_val = -1;
3727 if (SCALAR_FLOAT_TYPE_P (scalar_type))
3728 def_for_init = build_real (scalar_type, real_init_val);
3729 else
3730 def_for_init = build_int_cst (scalar_type, int_init_val);
3732 /* Create a vector of '0' or '1' for all elements except the first. */
3733 elts = XALLOCAVEC (tree, nunits);
3734 for (i = nunits - 2; i >= 0; --i)
3735 elts[i + 1] = def_for_init;
3737 /* Option1: the first element is '0' or '1' as well. */
3738 if (adjustment_def)
3740 elts[0] = def_for_init;
3741 init_def = build_vector (vectype, elts);
3742 break;
3745 /* Option2: the first element is INIT_VAL. */
3746 elts[0] = init_val;
3747 if (TREE_CONSTANT (init_val))
3748 init_def = build_vector (vectype, elts);
3749 else
3751 vec<constructor_elt, va_gc> *v;
3752 vec_alloc (v, nunits);
3753 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3754 for (i = 1; i < nunits; ++i)
3755 CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3756 init_def = build_constructor (vectype, v);
3759 break;
3761 case MIN_EXPR:
3762 case MAX_EXPR:
3763 case COND_EXPR:
3764 if (adjustment_def)
3766 *adjustment_def = NULL_TREE;
3767 init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3768 break;
3771 init_def = build_vector_from_val (vectype, init_value);
3772 break;
3774 default:
3775 gcc_unreachable ();
3778 return init_def;
3782 /* Function vect_create_epilog_for_reduction
3784 Create code at the loop-epilog to finalize the result of a reduction
3785 computation.
3787 VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3788 reduction statements.
3789 STMT is the scalar reduction stmt that is being vectorized.
3790 NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3791 number of elements that we can fit in a vectype (nunits). In this case
3792 we have to generate more than one vector stmt - i.e., we need to "unroll"
3793 the vector stmt by a factor VF/nunits. For more details see documentation
3794 in vectorizable_operation.
3795 REDUC_CODE is the tree-code for the epilog reduction.
3796 REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3797 computation.
3798 REDUC_INDEX is the index of the operand in the right hand side of the
3799 statement that is defined by REDUCTION_PHI.
3800 DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3801 SLP_NODE is an SLP node containing a group of reduction statements. The
3802 first one in this group is STMT.
3804 This function:
3805 1. Creates the reduction def-use cycles: sets the arguments for
3806 REDUCTION_PHIS:
3807 The loop-entry argument is the vectorized initial-value of the reduction.
3808 The loop-latch argument is taken from VECT_DEFS - the vector of partial
3809 sums.
3810 2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3811 by applying the operation specified by REDUC_CODE if available, or by
3812 other means (whole-vector shifts or a scalar loop).
3813 The function also creates a new phi node at the loop exit to preserve
3814 loop-closed form, as illustrated below.
3816 The flow at the entry to this function:
3818 loop:
3819 vec_def = phi <null, null> # REDUCTION_PHI
3820 VECT_DEF = vector_stmt # vectorized form of STMT
3821 s_loop = scalar_stmt # (scalar) STMT
3822 loop_exit:
3823 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3824 use <s_out0>
3825 use <s_out0>
3827 The above is transformed by this function into:
3829 loop:
3830 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3831 VECT_DEF = vector_stmt # vectorized form of STMT
3832 s_loop = scalar_stmt # (scalar) STMT
3833 loop_exit:
3834 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
3835 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
3836 v_out2 = reduce <v_out1>
3837 s_out3 = extract_field <v_out2, 0>
3838 s_out4 = adjust_result <s_out3>
3839 use <s_out4>
3840 use <s_out4>
3843 static void
3844 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3845 int ncopies, enum tree_code reduc_code,
3846 vec<gimple> reduction_phis,
3847 int reduc_index, bool double_reduc,
3848 slp_tree slp_node)
3850 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3851 stmt_vec_info prev_phi_info;
3852 tree vectype;
3853 enum machine_mode mode;
3854 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3855 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3856 basic_block exit_bb;
3857 tree scalar_dest;
3858 tree scalar_type;
3859 gimple new_phi = NULL, phi;
3860 gimple_stmt_iterator exit_gsi;
3861 tree vec_dest;
3862 tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3863 gimple epilog_stmt = NULL;
3864 enum tree_code code = gimple_assign_rhs_code (stmt);
3865 gimple exit_phi;
3866 tree bitsize, bitpos;
3867 tree adjustment_def = NULL;
3868 tree vec_initial_def = NULL;
3869 tree reduction_op, expr, def;
3870 tree orig_name, scalar_result;
3871 imm_use_iterator imm_iter, phi_imm_iter;
3872 use_operand_p use_p, phi_use_p;
3873 bool extract_scalar_result = false;
3874 gimple use_stmt, orig_stmt, reduction_phi = NULL;
3875 bool nested_in_vect_loop = false;
3876 auto_vec<gimple> new_phis;
3877 auto_vec<gimple> inner_phis;
3878 enum vect_def_type dt = vect_unknown_def_type;
3879 int j, i;
3880 auto_vec<tree> scalar_results;
3881 unsigned int group_size = 1, k, ratio;
3882 auto_vec<tree> vec_initial_defs;
3883 auto_vec<gimple> phis;
3884 bool slp_reduc = false;
3885 tree new_phi_result;
3886 gimple inner_phi = NULL;
3888 if (slp_node)
3889 group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3891 if (nested_in_vect_loop_p (loop, stmt))
3893 outer_loop = loop;
3894 loop = loop->inner;
3895 nested_in_vect_loop = true;
3896 gcc_assert (!slp_node);
3899 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3901 case GIMPLE_SINGLE_RHS:
3902 gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3903 == ternary_op);
3904 reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3905 break;
3906 case GIMPLE_UNARY_RHS:
3907 reduction_op = gimple_assign_rhs1 (stmt);
3908 break;
3909 case GIMPLE_BINARY_RHS:
3910 reduction_op = reduc_index ?
3911 gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3912 break;
3913 case GIMPLE_TERNARY_RHS:
3914 reduction_op = gimple_op (stmt, reduc_index + 1);
3915 break;
3916 default:
3917 gcc_unreachable ();
3920 vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3921 gcc_assert (vectype);
3922 mode = TYPE_MODE (vectype);
3924 /* 1. Create the reduction def-use cycle:
3925 Set the arguments of REDUCTION_PHIS, i.e., transform
3927 loop:
3928 vec_def = phi <null, null> # REDUCTION_PHI
3929 VECT_DEF = vector_stmt # vectorized form of STMT
3932 into:
3934 loop:
3935 vec_def = phi <vec_init, VECT_DEF> # REDUCTION_PHI
3936 VECT_DEF = vector_stmt # vectorized form of STMT
3939 (in case of SLP, do it for all the phis). */
3941 /* Get the loop-entry arguments. */
3942 if (slp_node)
3943 vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3944 NULL, slp_node, reduc_index);
3945 else
3947 vec_initial_defs.create (1);
3948 /* For the case of reduction, vect_get_vec_def_for_operand returns
3949 the scalar def before the loop, that defines the initial value
3950 of the reduction variable. */
3951 vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3952 &adjustment_def);
3953 vec_initial_defs.quick_push (vec_initial_def);
3956 /* Set phi nodes arguments. */
3957 FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3959 tree vec_init_def, def;
3960 gimple_seq stmts;
3961 vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
3962 true, NULL_TREE);
3963 gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
3964 def = vect_defs[i];
3965 for (j = 0; j < ncopies; j++)
3967 /* Set the loop-entry arg of the reduction-phi. */
3968 add_phi_arg (as_a <gimple_phi> (phi), vec_init_def,
3969 loop_preheader_edge (loop), UNKNOWN_LOCATION);
3971 /* Set the loop-latch arg for the reduction-phi. */
3972 if (j > 0)
3973 def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3975 add_phi_arg (as_a <gimple_phi> (phi), def, loop_latch_edge (loop),
3976 UNKNOWN_LOCATION);
3978 if (dump_enabled_p ())
3980 dump_printf_loc (MSG_NOTE, vect_location,
3981 "transform reduction: created def-use cycle: ");
3982 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3983 dump_printf (MSG_NOTE, "\n");
3984 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3985 dump_printf (MSG_NOTE, "\n");
3988 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3992 /* 2. Create epilog code.
3993 The reduction epilog code operates across the elements of the vector
3994 of partial results computed by the vectorized loop.
3995 The reduction epilog code consists of:
3997 step 1: compute the scalar result in a vector (v_out2)
3998 step 2: extract the scalar result (s_out3) from the vector (v_out2)
3999 step 3: adjust the scalar result (s_out3) if needed.
4001 Step 1 can be accomplished using one of the following three schemes:
4002 (scheme 1) using reduc_code, if available.
4003 (scheme 2) using whole-vector shifts, if available.
4004 (scheme 3) using a scalar loop. In this case steps 1+2 above are
4005 combined.
4007 The overall epilog code looks like this:
4009 s_out0 = phi <s_loop> # original EXIT_PHI
4010 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4011 v_out2 = reduce <v_out1> # step 1
4012 s_out3 = extract_field <v_out2, 0> # step 2
4013 s_out4 = adjust_result <s_out3> # step 3
4015 (step 3 is optional, and steps 1 and 2 may be combined).
4016 Lastly, the uses of s_out0 are replaced by s_out4. */
4019 /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4020 v_out1 = phi <VECT_DEF>
4021 Store them in NEW_PHIS. */
4023 exit_bb = single_exit (loop)->dest;
4024 prev_phi_info = NULL;
4025 new_phis.create (vect_defs.length ());
4026 FOR_EACH_VEC_ELT (vect_defs, i, def)
4028 for (j = 0; j < ncopies; j++)
4030 tree new_def = copy_ssa_name (def, NULL);
4031 phi = create_phi_node (new_def, exit_bb);
4032 set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4033 if (j == 0)
4034 new_phis.quick_push (phi);
4035 else
4037 def = vect_get_vec_def_for_stmt_copy (dt, def);
4038 STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4041 SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4042 prev_phi_info = vinfo_for_stmt (phi);
4046 /* The epilogue is created for the outer-loop, i.e., for the loop being
4047 vectorized. Create exit phis for the outer loop. */
4048 if (double_reduc)
4050 loop = outer_loop;
4051 exit_bb = single_exit (loop)->dest;
4052 inner_phis.create (vect_defs.length ());
4053 FOR_EACH_VEC_ELT (new_phis, i, phi)
4055 tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4056 gimple_phi outer_phi = create_phi_node (new_result, exit_bb);
4057 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4058 PHI_RESULT (phi));
4059 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4060 loop_vinfo, NULL));
4061 inner_phis.quick_push (phi);
4062 new_phis[i] = outer_phi;
4063 prev_phi_info = vinfo_for_stmt (outer_phi);
4064 while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4066 phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4067 new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4068 outer_phi = create_phi_node (new_result, exit_bb);
4069 SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4070 PHI_RESULT (phi));
4071 set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4072 loop_vinfo, NULL));
4073 STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4074 prev_phi_info = vinfo_for_stmt (outer_phi);
4079 exit_gsi = gsi_after_labels (exit_bb);
4081 /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4082 (i.e. when reduc_code is not available) and in the final adjustment
4083 code (if needed). Also get the original scalar reduction variable as
4084 defined in the loop. In case STMT is a "pattern-stmt" (i.e., it
4085 represents a reduction pattern), the tree-code and scalar-def are
4086 taken from the original stmt that the pattern-stmt (STMT) replaces.
4087 Otherwise (it is a regular reduction) - the tree-code and scalar-def
4088 are taken from STMT. */
4090 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4091 if (!orig_stmt)
4093 /* Regular reduction */
4094 orig_stmt = stmt;
4096 else
4098 /* Reduction pattern */
4099 stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4100 gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4101 gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4104 code = gimple_assign_rhs_code (orig_stmt);
4105 /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4106 partial results are added and not subtracted. */
4107 if (code == MINUS_EXPR)
4108 code = PLUS_EXPR;
4110 scalar_dest = gimple_assign_lhs (orig_stmt);
4111 scalar_type = TREE_TYPE (scalar_dest);
4112 scalar_results.create (group_size);
4113 new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4114 bitsize = TYPE_SIZE (scalar_type);
4116 /* In case this is a reduction in an inner-loop while vectorizing an outer
4117 loop - we don't need to extract a single scalar result at the end of the
4118 inner-loop (unless it is double reduction, i.e., the use of reduction is
4119 outside the outer-loop). The final vector of partial results will be used
4120 in the vectorized outer-loop, or reduced to a scalar result at the end of
4121 the outer-loop. */
4122 if (nested_in_vect_loop && !double_reduc)
4123 goto vect_finalize_reduction;
4125 /* SLP reduction without reduction chain, e.g.,
4126 # a1 = phi <a2, a0>
4127 # b1 = phi <b2, b0>
4128 a2 = operation (a1)
4129 b2 = operation (b1) */
4130 slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4132 /* In case of reduction chain, e.g.,
4133 # a1 = phi <a3, a0>
4134 a2 = operation (a1)
4135 a3 = operation (a2),
4137 we may end up with more than one vector result. Here we reduce them to
4138 one vector. */
4139 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4141 tree first_vect = PHI_RESULT (new_phis[0]);
4142 tree tmp;
4143 gimple_assign new_vec_stmt = NULL;
4145 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4146 for (k = 1; k < new_phis.length (); k++)
4148 gimple next_phi = new_phis[k];
4149 tree second_vect = PHI_RESULT (next_phi);
4151 tmp = build2 (code, vectype, first_vect, second_vect);
4152 new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4153 first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4154 gimple_assign_set_lhs (new_vec_stmt, first_vect);
4155 gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4158 new_phi_result = first_vect;
4159 if (new_vec_stmt)
4161 new_phis.truncate (0);
4162 new_phis.safe_push (new_vec_stmt);
4165 else
4166 new_phi_result = PHI_RESULT (new_phis[0]);
4168 /* 2.3 Create the reduction code, using one of the three schemes described
4169 above. In SLP we simply need to extract all the elements from the
4170 vector (without reducing them), so we use scalar shifts. */
4171 if (reduc_code != ERROR_MARK && !slp_reduc)
4173 tree tmp;
4175 /*** Case 1: Create:
4176 v_out2 = reduc_expr <v_out1> */
4178 if (dump_enabled_p ())
4179 dump_printf_loc (MSG_NOTE, vect_location,
4180 "Reduce using direct vector reduction.\n");
4182 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4183 tmp = build1 (reduc_code, vectype, new_phi_result);
4184 epilog_stmt = gimple_build_assign (vec_dest, tmp);
4185 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4186 gimple_assign_set_lhs (epilog_stmt, new_temp);
4187 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4189 extract_scalar_result = true;
4191 else
4193 enum tree_code shift_code = ERROR_MARK;
4194 bool have_whole_vector_shift = true;
4195 int bit_offset;
4196 int element_bitsize = tree_to_uhwi (bitsize);
4197 int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4198 tree vec_temp;
4200 if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4201 shift_code = VEC_RSHIFT_EXPR;
4202 else
4203 have_whole_vector_shift = false;
4205 /* Regardless of whether we have a whole vector shift, if we're
4206 emulating the operation via tree-vect-generic, we don't want
4207 to use it. Only the first round of the reduction is likely
4208 to still be profitable via emulation. */
4209 /* ??? It might be better to emit a reduction tree code here, so that
4210 tree-vect-generic can expand the first round via bit tricks. */
4211 if (!VECTOR_MODE_P (mode))
4212 have_whole_vector_shift = false;
4213 else
4215 optab optab = optab_for_tree_code (code, vectype, optab_default);
4216 if (optab_handler (optab, mode) == CODE_FOR_nothing)
4217 have_whole_vector_shift = false;
4220 if (have_whole_vector_shift && !slp_reduc)
4222 /*** Case 2: Create:
4223 for (offset = VS/2; offset >= element_size; offset/=2)
4225 Create: va' = vec_shift <va, offset>
4226 Create: va = vop <va, va'>
4227 } */
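/* Illustration (hypothetical V4SI input for a sum reduction; element order
   depends on the target's extraction position): starting from {a0,a1,a2,a3},
   the first iteration shifts by two elements and adds, producing pairwise
   partial sums such as {a0+a2, a1+a3, ...}; the next iteration shifts by one
   element and adds, so one element then holds a0+a1+a2+a3, which step 2
   extracts.  */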
4229 if (dump_enabled_p ())
4230 dump_printf_loc (MSG_NOTE, vect_location,
4231 "Reduce using vector shifts\n");
4233 vec_dest = vect_create_destination_var (scalar_dest, vectype);
4234 new_temp = new_phi_result;
4235 for (bit_offset = vec_size_in_bits/2;
4236 bit_offset >= element_bitsize;
4237 bit_offset /= 2)
4239 tree bitpos = size_int (bit_offset);
4241 epilog_stmt = gimple_build_assign_with_ops (shift_code,
4242 vec_dest, new_temp, bitpos);
4243 new_name = make_ssa_name (vec_dest, epilog_stmt);
4244 gimple_assign_set_lhs (epilog_stmt, new_name);
4245 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4247 epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4248 new_name, new_temp);
4249 new_temp = make_ssa_name (vec_dest, epilog_stmt);
4250 gimple_assign_set_lhs (epilog_stmt, new_temp);
4251 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4254 extract_scalar_result = true;
4256 else
4258 tree rhs;
4260 /*** Case 3: Create:
4261 s = extract_field <v_out2, 0>
4262 for (offset = element_size;
4263 offset < vector_size;
4264 offset += element_size;)
4266 Create: s' = extract_field <v_out2, offset>
4267 Create: s = op <s, s'> // For non SLP cases
4268 } */
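/* Illustration (hypothetical V4SI vector {a0,a1,a2,a3}): in the non-SLP case
   the code below emits s = a0; s = s op a1; s = s op a2; s = s op a3, while
   in the SLP case each extracted element is simply pushed onto
   SCALAR_RESULTS without being combined.  */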
4270 if (dump_enabled_p ())
4271 dump_printf_loc (MSG_NOTE, vect_location,
4272 "Reduce using scalar code.\n");
4274 vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4275 FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4277 if (gimple_code (new_phi) == GIMPLE_PHI)
4278 vec_temp = PHI_RESULT (new_phi);
4279 else
4280 vec_temp = gimple_assign_lhs (new_phi);
4281 rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4282 bitsize_zero_node);
4283 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4284 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4285 gimple_assign_set_lhs (epilog_stmt, new_temp);
4286 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4288 /* In SLP we don't need to apply reduction operation, so we just
4289 collect s' values in SCALAR_RESULTS. */
4290 if (slp_reduc)
4291 scalar_results.safe_push (new_temp);
4293 for (bit_offset = element_bitsize;
4294 bit_offset < vec_size_in_bits;
4295 bit_offset += element_bitsize)
4297 tree bitpos = bitsize_int (bit_offset);
4298 tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4299 bitsize, bitpos);
4301 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4302 new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4303 gimple_assign_set_lhs (epilog_stmt, new_name);
4304 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4306 if (slp_reduc)
4308 /* In SLP we don't need to apply reduction operation, so
4309 we just collect s' values in SCALAR_RESULTS. */
4310 new_temp = new_name;
4311 scalar_results.safe_push (new_name);
4313 else
4315 epilog_stmt = gimple_build_assign_with_ops (code,
4316 new_scalar_dest, new_name, new_temp);
4317 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4318 gimple_assign_set_lhs (epilog_stmt, new_temp);
4319 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4324 /* The only case where we need to reduce scalar results in SLP is
4325 unrolling. If the size of SCALAR_RESULTS is greater than
4326 GROUP_SIZE, we reduce them by combining elements modulo
4327 GROUP_SIZE. */
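/* Illustration (hypothetical GROUP_SIZE == 2 with four scalar results
   r0,r1,r2,r3 produced by unrolling): the loop below combines r0 with r2
   and r1 with r3, leaving one result per group element.  */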
4328 if (slp_reduc)
4330 tree res, first_res, new_res;
4331 gimple new_stmt;
4333 /* Reduce multiple scalar results in case of SLP unrolling. */
4334 for (j = group_size; scalar_results.iterate (j, &res);
4335 j++)
4337 first_res = scalar_results[j % group_size];
4338 new_stmt = gimple_build_assign_with_ops (code,
4339 new_scalar_dest, first_res, res);
4340 new_res = make_ssa_name (new_scalar_dest, new_stmt);
4341 gimple_assign_set_lhs (new_stmt, new_res);
4342 gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4343 scalar_results[j % group_size] = new_res;
4346 else
4347 /* Not SLP - we have one scalar to keep in SCALAR_RESULTS. */
4348 scalar_results.safe_push (new_temp);
4350 extract_scalar_result = false;
4354 /* 2.4 Extract the final scalar result. Create:
4355 s_out3 = extract_field <v_out2, bitpos> */
4357 if (extract_scalar_result)
4359 tree rhs;
4361 if (dump_enabled_p ())
4362 dump_printf_loc (MSG_NOTE, vect_location,
4363 "extract scalar result\n");
4365 if (BYTES_BIG_ENDIAN)
4366 bitpos = size_binop (MULT_EXPR,
4367 bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4368 TYPE_SIZE (scalar_type));
4369 else
4370 bitpos = bitsize_zero_node;
4372 rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4373 epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4374 new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4375 gimple_assign_set_lhs (epilog_stmt, new_temp);
4376 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4377 scalar_results.safe_push (new_temp);
4380 vect_finalize_reduction:
4382 if (double_reduc)
4383 loop = loop->inner;
4385 /* 2.5 Adjust the final result by the initial value of the reduction
4386 variable. (When such adjustment is not needed, then
4387 'adjustment_def' is zero). For example, if code is PLUS we create:
4388 new_temp = loop_exit_def + adjustment_def */
4390 if (adjustment_def)
4392 gcc_assert (!slp_reduc);
4393 if (nested_in_vect_loop)
4395 new_phi = new_phis[0];
4396 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4397 expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4398 new_dest = vect_create_destination_var (scalar_dest, vectype);
4400 else
4402 new_temp = scalar_results[0];
4403 gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4404 expr = build2 (code, scalar_type, new_temp, adjustment_def);
4405 new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4408 epilog_stmt = gimple_build_assign (new_dest, expr);
4409 new_temp = make_ssa_name (new_dest, epilog_stmt);
4410 gimple_assign_set_lhs (epilog_stmt, new_temp);
4411 gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4412 if (nested_in_vect_loop)
4414 set_vinfo_for_stmt (epilog_stmt,
4415 new_stmt_vec_info (epilog_stmt, loop_vinfo,
4416 NULL));
4417 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4418 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4420 if (!double_reduc)
4421 scalar_results.quick_push (new_temp);
4422 else
4423 scalar_results[0] = new_temp;
4425 else
4426 scalar_results[0] = new_temp;
4428 new_phis[0] = epilog_stmt;
4431 /* 2.6 Handle the loop-exit phis. Replace the uses of scalar loop-exit
4432 phis with new adjusted scalar results, i.e., replace use <s_out0>
4433 with use <s_out4>.
4435 Transform:
4436 loop_exit:
4437 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4438 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4439 v_out2 = reduce <v_out1>
4440 s_out3 = extract_field <v_out2, 0>
4441 s_out4 = adjust_result <s_out3>
4442 use <s_out0>
4443 use <s_out0>
4445 into:
4447 loop_exit:
4448 s_out0 = phi <s_loop> # (scalar) EXIT_PHI
4449 v_out1 = phi <VECT_DEF> # NEW_EXIT_PHI
4450 v_out2 = reduce <v_out1>
4451 s_out3 = extract_field <v_out2, 0>
4452 s_out4 = adjust_result <s_out3>
4453 use <s_out4>
4454 use <s_out4> */
4457 /* In SLP reduction chain we reduce vector results into one vector if
4458 necessary, hence we set here GROUP_SIZE to 1. SCALAR_DEST is the LHS of
4459 the last stmt in the reduction chain, since we are looking for the loop
4460 exit phi node. */
4461 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4463 scalar_dest = gimple_assign_lhs (
4464 SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4465 group_size = 1;
4468 /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4469 case GROUP_SIZE is greater than the vectorization factor). Therefore, we
4470 need to match SCALAR_RESULTS with corresponding statements. The first
4471 (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4472 the first vector stmt, etc.
4473 (RATIO is equal to (GROUP_SIZE / number of new vector stmts)). */
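/* Illustration (hypothetical sizes): with GROUP_SIZE == 4 and two entries in
   NEW_PHIS, RATIO is 2, so scalar results 0-1 are matched with the first
   vector stmt/reduction phi and scalar results 2-3 with the second.  */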
4474 if (group_size > new_phis.length ())
4476 ratio = group_size / new_phis.length ();
4477 gcc_assert (!(group_size % new_phis.length ()));
4479 else
4480 ratio = 1;
4482 for (k = 0; k < group_size; k++)
4484 if (k % ratio == 0)
4486 epilog_stmt = new_phis[k / ratio];
4487 reduction_phi = reduction_phis[k / ratio];
4488 if (double_reduc)
4489 inner_phi = inner_phis[k / ratio];
4492 if (slp_reduc)
4494 gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4496 orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4497 /* SLP statements can't participate in patterns. */
4498 gcc_assert (!orig_stmt);
4499 scalar_dest = gimple_assign_lhs (current_stmt);
4502 phis.create (3);
4503 /* Find the loop-closed-use at the loop exit of the original scalar
4504 result. (The reduction result is expected to have two immediate uses -
4505 one at the latch block, and one at the loop exit). */
4506 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4507 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4508 && !is_gimple_debug (USE_STMT (use_p)))
4509 phis.safe_push (USE_STMT (use_p));
4511 /* While we expect to have found an exit_phi because of loop-closed-ssa
4512 form we can end up without one if the scalar cycle is dead. */
4514 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4516 if (outer_loop)
4518 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4519 gimple_phi vect_phi;
4521 /* FORNOW. Currently not supporting the case that an inner-loop
4522 reduction is not used in the outer-loop (but only outside the
4523 outer-loop), unless it is double reduction. */
4524 gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4525 && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4526 || double_reduc);
4528 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4529 if (!double_reduc
4530 || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4531 != vect_double_reduction_def)
4532 continue;
4534 /* Handle double reduction:
4536 stmt1: s1 = phi <s0, s2> - double reduction phi (outer loop)
4537 stmt2: s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4538 stmt3: s4 = use (s3) - (regular) reduc stmt (inner loop)
4539 stmt4: s2 = phi <s4> - double reduction stmt (outer loop)
4541 At that point the regular reduction (stmt2 and stmt3) is
4542 already vectorized, as well as the exit phi node, stmt4.
4543 Here we vectorize the phi node of double reduction, stmt1, and
4544 update all relevant statements. */
4546 /* Go through all the uses of s2 to find double reduction phi
4547 node, i.e., stmt1 above. */
4548 orig_name = PHI_RESULT (exit_phi);
4549 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4551 stmt_vec_info use_stmt_vinfo;
4552 stmt_vec_info new_phi_vinfo;
4553 tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4554 basic_block bb = gimple_bb (use_stmt);
4555 gimple use;
4557 /* Check that USE_STMT is really double reduction phi
4558 node. */
4559 if (gimple_code (use_stmt) != GIMPLE_PHI
4560 || gimple_phi_num_args (use_stmt) != 2
4561 || bb->loop_father != outer_loop)
4562 continue;
4563 use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4564 if (!use_stmt_vinfo
4565 || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4566 != vect_double_reduction_def)
4567 continue;
4569 /* Create vector phi node for double reduction:
4570 vs1 = phi <vs0, vs2>
4571 vs1 was created previously in this function by a call to
4572 vect_get_vec_def_for_operand and is stored in
4573 vec_initial_def;
4574 vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4575 vs0 is created here. */
4577 /* Create vector phi node. */
4578 vect_phi = create_phi_node (vec_initial_def, bb);
4579 new_phi_vinfo = new_stmt_vec_info (vect_phi,
4580 loop_vec_info_for_loop (outer_loop), NULL);
4581 set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4583 /* Create vs0 - initial def of the double reduction phi. */
4584 preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4585 loop_preheader_edge (outer_loop));
4586 init_def = get_initial_def_for_reduction (stmt,
4587 preheader_arg, NULL);
4588 vect_phi_init = vect_init_vector (use_stmt, init_def,
4589 vectype, NULL);
4591 /* Update phi node arguments with vs0 and vs2. */
4592 add_phi_arg (vect_phi, vect_phi_init,
4593 loop_preheader_edge (outer_loop),
4594 UNKNOWN_LOCATION);
4595 add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4596 loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4597 if (dump_enabled_p ())
4599 dump_printf_loc (MSG_NOTE, vect_location,
4600 "created double reduction phi node: ");
4601 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4602 dump_printf (MSG_NOTE, "\n");
4605 vect_phi_res = PHI_RESULT (vect_phi);
4607 /* Replace the use, i.e., set the correct vs1 in the regular
4608 reduction phi node. FORNOW, NCOPIES is always 1, so the
4609 loop is redundant. */
4610 use = reduction_phi;
4611 for (j = 0; j < ncopies; j++)
4613 edge pr_edge = loop_preheader_edge (loop);
4614 SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4615 use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4621 phis.release ();
4622 if (nested_in_vect_loop)
4624 if (double_reduc)
4625 loop = outer_loop;
4626 else
4627 continue;
4630 phis.create (3);
4631 /* Find the loop-closed-use at the loop exit of the original scalar
4632 result. (The reduction result is expected to have two immediate uses,
4633 one at the latch block, and one at the loop exit). For double
4634 reductions we are looking for exit phis of the outer loop. */
4635 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4637 if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4639 if (!is_gimple_debug (USE_STMT (use_p)))
4640 phis.safe_push (USE_STMT (use_p));
4642 else
4644 if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4646 tree phi_res = PHI_RESULT (USE_STMT (use_p));
4648 FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4650 if (!flow_bb_inside_loop_p (loop,
4651 gimple_bb (USE_STMT (phi_use_p)))
4652 && !is_gimple_debug (USE_STMT (phi_use_p)))
4653 phis.safe_push (USE_STMT (phi_use_p));
4659 FOR_EACH_VEC_ELT (phis, i, exit_phi)
4661 /* Replace the uses: */
4662 orig_name = PHI_RESULT (exit_phi);
4663 scalar_result = scalar_results[k];
4664 FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4665 FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4666 SET_USE (use_p, scalar_result);
4669 phis.release ();
4674 /* Function vectorizable_reduction.
4676 Check if STMT performs a reduction operation that can be vectorized.
4677 If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4678 stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4679 Return FALSE if not a vectorizable STMT, TRUE otherwise.
4681 This function also handles reduction idioms (patterns) that have been
4682 recognized in advance during vect_pattern_recog. In this case, STMT may be
4683 of this form:
4684 X = pattern_expr (arg0, arg1, ..., X)
4685 and its STMT_VINFO_RELATED_STMT points to the last stmt in the original
4686 sequence that had been detected and replaced by the pattern-stmt (STMT).
4688 In some cases of reduction patterns, the type of the reduction variable X is
4689 different than the type of the other arguments of STMT.
4690 In such cases, the vectype that is used when transforming STMT into a vector
4691 stmt is different than the vectype that is used to determine the
4692 vectorization factor, because it consists of a different number of elements
4693 than the actual number of elements that are being operated upon in parallel.
4695 For example, consider an accumulation of shorts into an int accumulator.
4696 On some targets it's possible to vectorize this pattern operating on 8
4697 shorts at a time (hence, the vectype for purposes of determining the
4698 vectorization factor should be V8HI); on the other hand, the vectype that
4699 is used to create the vector form is actually V4SI (the type of the result).
4701 Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4702 indicates what is the actual level of parallelism (V8HI in the example), so
4703 that the right vectorization factor would be derived. This vectype
4704 corresponds to the type of arguments to the reduction stmt, and should *NOT*
4705 be used to create the vectorized stmt. The right vectype for the vectorized
4706 stmt is obtained from the type of the result X:
4707 get_vectype_for_scalar_type (TREE_TYPE (X))
4709 This means that, contrary to "regular" reductions (or "regular" stmts in
4710 general), the following equation:
4711 STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4712 does *NOT* necessarily hold for reduction patterns. */
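/* A sketch of such a pattern (target-dependent; shown only for
   illustration):

     short b[N]; int sum = 0;
     for (i = 0; i < N; i++)
       sum += b[i];

   may be recognized as sum_1 = WIDEN_SUM_EXPR <b[i], sum_0>, where the
   vectype recorded for computing the VF is V8HI (the type of b[i]) while
   the vectorized statement itself is created with V4SI (the type of sum).  */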
4714 bool
4715 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4716 gimple *vec_stmt, slp_tree slp_node)
4718 tree vec_dest;
4719 tree scalar_dest;
4720 tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4721 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4722 tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4723 tree vectype_in = NULL_TREE;
4724 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4725 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4726 enum tree_code code, orig_code, epilog_reduc_code;
4727 enum machine_mode vec_mode;
4728 int op_type;
4729 optab optab, reduc_optab;
4730 tree new_temp = NULL_TREE;
4731 tree def;
4732 gimple def_stmt;
4733 enum vect_def_type dt;
4734 gimple_phi new_phi = NULL;
4735 tree scalar_type;
4736 bool is_simple_use;
4737 gimple orig_stmt;
4738 stmt_vec_info orig_stmt_info;
4739 tree expr = NULL_TREE;
4740 int i;
4741 int ncopies;
4742 int epilog_copies;
4743 stmt_vec_info prev_stmt_info, prev_phi_info;
4744 bool single_defuse_cycle = false;
4745 tree reduc_def = NULL_TREE;
4746 gimple new_stmt = NULL;
4747 int j;
4748 tree ops[3];
4749 bool nested_cycle = false, found_nested_cycle_def = false;
4750 gimple reduc_def_stmt = NULL;
4751 /* The default is that the reduction variable is the last in statement. */
4752 int reduc_index = 2;
4753 bool double_reduc = false, dummy;
4754 basic_block def_bb;
4755 struct loop * def_stmt_loop, *outer_loop = NULL;
4756 tree def_arg;
4757 gimple def_arg_stmt;
4758 auto_vec<tree> vec_oprnds0;
4759 auto_vec<tree> vec_oprnds1;
4760 auto_vec<tree> vect_defs;
4761 auto_vec<gimple> phis;
4762 int vec_num;
4763 tree def0, def1, tem, op0, op1 = NULL_TREE;
4765 /* In case of reduction chain we switch to the first stmt in the chain, but
4766 we don't update STMT_INFO, since only the last stmt is marked as reduction
4767 and has reduction properties. */
4768 if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4769 stmt = GROUP_FIRST_ELEMENT (stmt_info);
4771 if (nested_in_vect_loop_p (loop, stmt))
4773 outer_loop = loop;
4774 loop = loop->inner;
4775 nested_cycle = true;
4778 /* 1. Is vectorizable reduction? */
4779 /* Not supportable if the reduction variable is used in the loop, unless
4780 it's a reduction chain. */
4781 if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4782 && !GROUP_FIRST_ELEMENT (stmt_info))
4783 return false;
4785 /* Reductions that are not used even in an enclosing outer-loop,
4786 are expected to be "live" (used out of the loop). */
4787 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4788 && !STMT_VINFO_LIVE_P (stmt_info))
4789 return false;
4791 /* Make sure it was already recognized as a reduction computation. */
4792 if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4793 && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4794 return false;
4796 /* 2. Has this been recognized as a reduction pattern?
4798 Check if STMT represents a pattern that has been recognized
4799 in earlier analysis stages. For stmts that represent a pattern,
4800 the STMT_VINFO_RELATED_STMT field records the last stmt in
4801 the original sequence that constitutes the pattern. */
4803 orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4804 if (orig_stmt)
4806 orig_stmt_info = vinfo_for_stmt (orig_stmt);
4807 gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4808 gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4811 /* 3. Check the operands of the operation. The first operands are defined
4812 inside the loop body. The last operand is the reduction variable,
4813 which is defined by the loop-header-phi. */
4815 gcc_assert (is_gimple_assign (stmt));
4817 /* Flatten RHS. */
4818 switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4820 case GIMPLE_SINGLE_RHS:
4821 op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4822 if (op_type == ternary_op)
4824 tree rhs = gimple_assign_rhs1 (stmt);
4825 ops[0] = TREE_OPERAND (rhs, 0);
4826 ops[1] = TREE_OPERAND (rhs, 1);
4827 ops[2] = TREE_OPERAND (rhs, 2);
4828 code = TREE_CODE (rhs);
4830 else
4831 return false;
4832 break;
4834 case GIMPLE_BINARY_RHS:
4835 code = gimple_assign_rhs_code (stmt);
4836 op_type = TREE_CODE_LENGTH (code);
4837 gcc_assert (op_type == binary_op);
4838 ops[0] = gimple_assign_rhs1 (stmt);
4839 ops[1] = gimple_assign_rhs2 (stmt);
4840 break;
4842 case GIMPLE_TERNARY_RHS:
4843 code = gimple_assign_rhs_code (stmt);
4844 op_type = TREE_CODE_LENGTH (code);
4845 gcc_assert (op_type == ternary_op);
4846 ops[0] = gimple_assign_rhs1 (stmt);
4847 ops[1] = gimple_assign_rhs2 (stmt);
4848 ops[2] = gimple_assign_rhs3 (stmt);
4849 break;
4851 case GIMPLE_UNARY_RHS:
4852 return false;
4854 default:
4855 gcc_unreachable ();
4858 if (code == COND_EXPR && slp_node)
4859 return false;
4861 scalar_dest = gimple_assign_lhs (stmt);
4862 scalar_type = TREE_TYPE (scalar_dest);
4863 if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4864 && !SCALAR_FLOAT_TYPE_P (scalar_type))
4865 return false;
4867 /* Do not try to vectorize bit-precision reductions. */
4868 if ((TYPE_PRECISION (scalar_type)
4869 != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4870 return false;
4872 /* All uses but the last are expected to be defined in the loop.
4873 The last use is the reduction variable. In case of nested cycle this
4874 assumption is not true: we use reduc_index to record the index of the
4875 reduction variable. */
4876 for (i = 0; i < op_type - 1; i++)
4878 /* The condition of COND_EXPR is checked in vectorizable_condition(). */
4879 if (i == 0 && code == COND_EXPR)
4880 continue;
4882 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4883 &def_stmt, &def, &dt, &tem);
4884 if (!vectype_in)
4885 vectype_in = tem;
4886 gcc_assert (is_simple_use);
4888 if (dt != vect_internal_def
4889 && dt != vect_external_def
4890 && dt != vect_constant_def
4891 && dt != vect_induction_def
4892 && !(dt == vect_nested_cycle && nested_cycle))
4893 return false;
4895 if (dt == vect_nested_cycle)
4897 found_nested_cycle_def = true;
4898 reduc_def_stmt = def_stmt;
4899 reduc_index = i;
4903 is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4904 &def_stmt, &def, &dt, &tem);
4905 if (!vectype_in)
4906 vectype_in = tem;
4907 gcc_assert (is_simple_use);
4908 if (!(dt == vect_reduction_def
4909 || dt == vect_nested_cycle
4910 || ((dt == vect_internal_def || dt == vect_external_def
4911 || dt == vect_constant_def || dt == vect_induction_def)
4912 && nested_cycle && found_nested_cycle_def)))
4914 /* For pattern recognized stmts, orig_stmt might be a reduction,
4915 but some helper statements for the pattern might not, or
4916 might be COND_EXPRs with reduction uses in the condition. */
4917 gcc_assert (orig_stmt);
4918 return false;
4920 if (!found_nested_cycle_def)
4921 reduc_def_stmt = def_stmt;
4923 gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4924 if (orig_stmt)
4925 gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4926 reduc_def_stmt,
4927 !nested_cycle,
4928 &dummy));
4929 else
4931 gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4932 !nested_cycle, &dummy);
4933 /* We changed STMT to be the first stmt in reduction chain, hence we
4934 check that in this case the first element in the chain is STMT. */
4935 gcc_assert (stmt == tmp
4936 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4939 if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4940 return false;
4942 if (slp_node || PURE_SLP_STMT (stmt_info))
4943 ncopies = 1;
4944 else
4945 ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4946 / TYPE_VECTOR_SUBPARTS (vectype_in));
4948 gcc_assert (ncopies >= 1);
4950 vec_mode = TYPE_MODE (vectype_in);
4952 if (code == COND_EXPR)
4954 if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4956 if (dump_enabled_p ())
4957 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4958 "unsupported condition in reduction\n");
4960 return false;
4963 else
4965 /* 4. Supportable by target? */
4967 if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4968 || code == LROTATE_EXPR || code == RROTATE_EXPR)
4970 /* Shifts and rotates are only supported by vectorizable_shifts,
4971 not vectorizable_reduction. */
4972 if (dump_enabled_p ())
4973 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4974 "unsupported shift or rotation.\n");
4975 return false;
4978 /* 4.1. check support for the operation in the loop */
4979 optab = optab_for_tree_code (code, vectype_in, optab_default);
4980 if (!optab)
4982 if (dump_enabled_p ())
4983 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4984 "no optab.\n");
4986 return false;
4989 if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
4991 if (dump_enabled_p ())
4992 dump_printf (MSG_NOTE, "op not supported by target.\n");
4994 if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
4995 || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4996 < vect_min_worthwhile_factor (code))
4997 return false;
4999 if (dump_enabled_p ())
5000 dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5003 /* Worthwhile without SIMD support? */
5004 if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5005 && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5006 < vect_min_worthwhile_factor (code))
5008 if (dump_enabled_p ())
5009 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5010 "not worthwhile without SIMD support.\n");
5012 return false;
5016 /* 4.2. Check support for the epilog operation.
5018 If STMT represents a reduction pattern, then the type of the
5019 reduction variable may be different than the type of the rest
5020 of the arguments. For example, consider the case of accumulation
5021 of shorts into an int accumulator; The original code:
5022 S1: int_a = (int) short_a;
5023 orig_stmt-> S2: int_acc = plus <int_a ,int_acc>;
5025 was replaced with:
5026 STMT: int_acc = widen_sum <short_a, int_acc>
5028 This means that:
5029 1. The tree-code that is used to create the vector operation in the
5030 epilog code (that reduces the partial results) is not the
5031 tree-code of STMT, but is rather the tree-code of the original
5032 stmt from the pattern that STMT is replacing. I.e, in the example
5033 above we want to use 'widen_sum' in the loop, but 'plus' in the
5034 epilog.
5035 2. The type (mode) we use to check available target support
5036 for the vector operation to be created in the *epilog*, is
5037 determined by the type of the reduction variable (in the example
5038 above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5039 However the type (mode) we use to check available target support
5040 for the vector operation to be created *inside the loop*, is
5041 determined by the type of the other arguments to STMT (in the
5042 example we'd check this: optab_handler (widen_sum_optab,
5043 vect_short_mode)).
5045 This is contrary to "regular" reductions, in which the types of all
5046 the arguments are the same as the type of the reduction variable.
5047 For "regular" reductions we can therefore use the same vector type
5048 (and also the same tree-code) when generating the epilog code and
5049 when generating the code inside the loop. */
5051 if (orig_stmt)
5053 /* This is a reduction pattern: get the vectype from the type of the
5054 reduction variable, and get the tree-code from orig_stmt. */
5055 orig_code = gimple_assign_rhs_code (orig_stmt);
5056 gcc_assert (vectype_out);
5057 vec_mode = TYPE_MODE (vectype_out);
5059 else
5061 /* Regular reduction: use the same vectype and tree-code as used for
5062 the vector code inside the loop can be used for the epilog code. */
5063 orig_code = code;
5066 if (nested_cycle)
5068 def_bb = gimple_bb (reduc_def_stmt);
5069 def_stmt_loop = def_bb->loop_father;
5070 def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5071 loop_preheader_edge (def_stmt_loop));
5072 if (TREE_CODE (def_arg) == SSA_NAME
5073 && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5074 && gimple_code (def_arg_stmt) == GIMPLE_PHI
5075 && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5076 && vinfo_for_stmt (def_arg_stmt)
5077 && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5078 == vect_double_reduction_def)
5079 double_reduc = true;
5082 epilog_reduc_code = ERROR_MARK;
5083 if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5085 reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5086 optab_default);
5087 if (!reduc_optab)
5089 if (dump_enabled_p ())
5090 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5091 "no optab for reduction.\n");
5093 epilog_reduc_code = ERROR_MARK;
5096 if (reduc_optab
5097 && optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5099 if (dump_enabled_p ())
5100 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5101 "reduc op not supported by target.\n");
5103 epilog_reduc_code = ERROR_MARK;
5106 else
5108 if (!nested_cycle || double_reduc)
5110 if (dump_enabled_p ())
5111 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5112 "no reduc code for scalar code.\n");
5114 return false;
5118 if (double_reduc && ncopies > 1)
5120 if (dump_enabled_p ())
5121 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5122 "multiple types in double reduction\n");
5124 return false;
5127 /* In case of widening multiplication by a constant, we update the type
5128 of the constant to be the type of the other operand. We check that the
5129 constant fits the type in the pattern recognition pass. */
5130 if (code == DOT_PROD_EXPR
5131 && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5133 if (TREE_CODE (ops[0]) == INTEGER_CST)
5134 ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5135 else if (TREE_CODE (ops[1]) == INTEGER_CST)
5136 ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5137 else
5139 if (dump_enabled_p ())
5140 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5141 "invalid types in dot-prod\n");
5143 return false;
5147 if (!vec_stmt) /* transformation not required. */
5149 if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5150 return false;
5151 STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5152 return true;
5155 /** Transform. **/
5157 if (dump_enabled_p ())
5158 dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5160 /* FORNOW: Multiple types are not supported for condition. */
5161 if (code == COND_EXPR)
5162 gcc_assert (ncopies == 1);
5164 /* Create the destination vector */
5165 vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5167 /* In case the vectorization factor (VF) is bigger than the number
5168 of elements that we can fit in a vectype (nunits), we have to generate
5169 more than one vector stmt - i.e., we need to "unroll" the
5170 vector stmt by a factor VF/nunits. For more details see documentation
5171 in vectorizable_operation. */
5173 /* If the reduction is used in an outer loop we need to generate
5174 VF intermediate results, like so (e.g. for ncopies=2):
5175 r0 = phi (init, r0)
5176 r1 = phi (init, r1)
5177 r0 = x0 + r0;
5178 r1 = x1 + r1;
5179 (i.e. we generate VF results in 2 registers).
5180 In this case we have a separate def-use cycle for each copy, and therefore
5181 for each copy we get the vector def for the reduction variable from the
5182 respective phi node created for this copy.
5184 Otherwise (the reduction is unused in the loop nest), we can combine
5185 together intermediate results, like so (e.g. for ncopies=2):
5186 r = phi (init, r)
5187 r = x0 + r;
5188 r = x1 + r;
5189 (i.e. we generate VF/2 results in a single register).
5190 In this case for each copy we get the vector def for the reduction variable
5191 from the vectorized reduction operation generated in the previous iteration.
5194 if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5196 single_defuse_cycle = true;
5197 epilog_copies = 1;
5199 else
5200 epilog_copies = ncopies;
5202 prev_stmt_info = NULL;
5203 prev_phi_info = NULL;
5204 if (slp_node)
5206 vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5207 gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5208 == TYPE_VECTOR_SUBPARTS (vectype_in));
5210 else
5212 vec_num = 1;
5213 vec_oprnds0.create (1);
5214 if (op_type == ternary_op)
5215 vec_oprnds1.create (1);
5218 phis.create (vec_num);
5219 vect_defs.create (vec_num);
5220 if (!slp_node)
5221 vect_defs.quick_push (NULL_TREE);
5223 for (j = 0; j < ncopies; j++)
5225 if (j == 0 || !single_defuse_cycle)
5227 for (i = 0; i < vec_num; i++)
5229 /* Create the reduction-phi that defines the reduction
5230 operand. */
5231 new_phi = create_phi_node (vec_dest, loop->header);
5232 set_vinfo_for_stmt (new_phi,
5233 new_stmt_vec_info (new_phi, loop_vinfo,
5234 NULL));
5235 if (j == 0 || slp_node)
5236 phis.quick_push (new_phi);
5240 if (code == COND_EXPR)
5242 gcc_assert (!slp_node);
5243 vectorizable_condition (stmt, gsi, vec_stmt,
5244 PHI_RESULT (phis[0]),
5245 reduc_index, NULL);
5246 /* Multiple types are not supported for condition. */
5247 break;
5250 /* Handle uses. */
5251 if (j == 0)
5253 op0 = ops[!reduc_index];
5254 if (op_type == ternary_op)
5256 if (reduc_index == 0)
5257 op1 = ops[2];
5258 else
5259 op1 = ops[1];
5262 if (slp_node)
5263 vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5264 slp_node, -1);
5265 else
5267 loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5268 stmt, NULL);
5269 vec_oprnds0.quick_push (loop_vec_def0);
5270 if (op_type == ternary_op)
5272 loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5273 NULL);
5274 vec_oprnds1.quick_push (loop_vec_def1);
5278 else
5280 if (!slp_node)
5282 enum vect_def_type dt;
5283 gimple dummy_stmt;
5284 tree dummy;
5286 vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5287 &dummy_stmt, &dummy, &dt);
5288 loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5289 loop_vec_def0);
5290 vec_oprnds0[0] = loop_vec_def0;
5291 if (op_type == ternary_op)
5293 vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5294 &dummy, &dt);
5295 loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5296 loop_vec_def1);
5297 vec_oprnds1[0] = loop_vec_def1;
5301 if (single_defuse_cycle)
5302 reduc_def = gimple_assign_lhs (new_stmt);
5304 STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5307 FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5309 if (slp_node)
5310 reduc_def = PHI_RESULT (phis[i]);
5311 else
5313 if (!single_defuse_cycle || j == 0)
5314 reduc_def = PHI_RESULT (new_phi);
5317 def1 = ((op_type == ternary_op)
5318 ? vec_oprnds1[i] : NULL);
5319 if (op_type == binary_op)
5321 if (reduc_index == 0)
5322 expr = build2 (code, vectype_out, reduc_def, def0);
5323 else
5324 expr = build2 (code, vectype_out, def0, reduc_def);
5326 else
5328 if (reduc_index == 0)
5329 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5330 else
5332 if (reduc_index == 1)
5333 expr = build3 (code, vectype_out, def0, reduc_def, def1);
5334 else
5335 expr = build3 (code, vectype_out, def0, def1, reduc_def);
5339 new_stmt = gimple_build_assign (vec_dest, expr);
5340 new_temp = make_ssa_name (vec_dest, new_stmt);
5341 gimple_assign_set_lhs (new_stmt, new_temp);
5342 vect_finish_stmt_generation (stmt, new_stmt, gsi);
5344 if (slp_node)
5346 SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5347 vect_defs.quick_push (new_temp);
5349 else
5350 vect_defs[0] = new_temp;
5353 if (slp_node)
5354 continue;
5356 if (j == 0)
5357 STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5358 else
5359 STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5361 prev_stmt_info = vinfo_for_stmt (new_stmt);
5362 prev_phi_info = vinfo_for_stmt (new_phi);
5365 /* Finalize the reduction-phi (set its arguments) and create the
5366 epilog reduction code. */
5367 if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5369 new_temp = gimple_assign_lhs (*vec_stmt);
5370 vect_defs[0] = new_temp;
5373 vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5374 epilog_reduc_code, phis, reduc_index,
5375 double_reduc, slp_node);
5377 return true;
5380 /* Function vect_min_worthwhile_factor.
5382 For a loop where we could vectorize the operation indicated by CODE,
5383 return the minimum vectorization factor that makes it worthwhile
5384 to use generic vectors. */
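/* Illustration of how callers use this (see vectorizable_reduction above):
   when the target lacks a vector instruction and the operation would be
   emulated, a PLUS_EXPR reduction is only kept if the vectorization factor
   is at least 4, while a BIT_AND_EXPR reduction is kept from a factor of 2.  */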
5385 int
5386 vect_min_worthwhile_factor (enum tree_code code)
5388 switch (code)
5390 case PLUS_EXPR:
5391 case MINUS_EXPR:
5392 case NEGATE_EXPR:
5393 return 4;
5395 case BIT_AND_EXPR:
5396 case BIT_IOR_EXPR:
5397 case BIT_XOR_EXPR:
5398 case BIT_NOT_EXPR:
5399 return 2;
5401 default:
5402 return INT_MAX;
5407 /* Function vectorizable_induction
5409 Check if PHI performs an induction computation that can be vectorized.
5410 If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5411 phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5412 Return FALSE if not a vectorizable STMT, TRUE otherwise. */
5414 bool
5415 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5416 gimple *vec_stmt)
5418 stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5419 tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5420 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5421 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5422 int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5423 int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5424 tree vec_def;
5426 gcc_assert (ncopies >= 1);
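/* Worked arithmetic (not in the original source): with a vectorization
   factor of 8 (e.g. because the loop also operates on narrower types) and
   a V4SI vectype (nunits == 4), ncopies is 8 / 4 == 2, i.e. two vector
   induction results have to be generated per scalar induction PHI.  */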
5427 /* FORNOW. These restrictions should be relaxed. */
5428 if (nested_in_vect_loop_p (loop, phi))
5430 imm_use_iterator imm_iter;
5431 use_operand_p use_p;
5432 gimple exit_phi;
5433 edge latch_e;
5434 tree loop_arg;
5436 if (ncopies > 1)
5438 if (dump_enabled_p ())
5439 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5440 "multiple types in nested loop.\n");
5441 return false;
5444 exit_phi = NULL;
5445 latch_e = loop_latch_edge (loop->inner);
5446 loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5447 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5449 gimple use_stmt = USE_STMT (use_p);
5450 if (is_gimple_debug (use_stmt))
5451 continue;
5453 if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5455 exit_phi = use_stmt;
5456 break;
5459 if (exit_phi)
5461 stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
5462 if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5463 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5465 if (dump_enabled_p ())
5466 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5467 "inner-loop induction only used outside "
5468 "of the outer vectorized loop.\n");
5469 return false;
5474 if (!STMT_VINFO_RELEVANT_P (stmt_info))
5475 return false;
5477 /* FORNOW: SLP not supported. */
5478 if (STMT_SLP_TYPE (stmt_info))
5479 return false;
5481 gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5483 if (gimple_code (phi) != GIMPLE_PHI)
5484 return false;
5486 if (!vec_stmt) /* transformation not required. */
5488 STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5489 if (dump_enabled_p ())
5490 dump_printf_loc (MSG_NOTE, vect_location,
5491 "=== vectorizable_induction ===\n");
5492 vect_model_induction_cost (stmt_info, ncopies);
5493 return true;
5496 /** Transform. **/
5498 if (dump_enabled_p ())
5499 dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5501 vec_def = get_initial_def_for_induction (phi);
5502 *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5503 return true;
5506 /* Function vectorizable_live_operation.
5508 STMT computes a value that is used outside the loop. Check if
5509 it can be supported. */
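/* Illustrative example (not in the original source): a "live" operation is
   one whose scalar result is used after the loop, e.g.

     for (i = 0; i < n; i++)
       {
         b[i] = a[i] * c;
         last = c + 1;        <-- uses only the invariant `c'
       }
     ... = last;              <-- used after the loop

   The checks below only accept the case where every operand of the live
   statement is loop-invariant (an external or constant def), so the scalar
   statement can simply stay in place and its final value be used as is.  */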
5511 bool
5512 vectorizable_live_operation (gimple stmt,
5513 gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5514 gimple *vec_stmt)
5516 stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5517 loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5518 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5519 int i;
5520 int op_type;
5521 tree op;
5522 tree def;
5523 gimple def_stmt;
5524 enum vect_def_type dt;
5525 enum tree_code code;
5526 enum gimple_rhs_class rhs_class;
5528 gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5530 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5531 return false;
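/* Descriptive note (not in the original source): the block below appears to
   handle the special case of a live IFN_GOMP_SIMD_LANE call for this
   loop's simduid.  After vectorization the last lane executed in a chunk
   is vectorization_factor - 1, so any PHI in the exit merge block that
   uses the call's lhs simply receives the constant vf - 1 as its argument
   on the exit edge.  */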
5533 if (!is_gimple_assign (stmt))
5535 if (gimple_call_internal_p (stmt)
5536 && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5537 && gimple_call_lhs (stmt)
5538 && loop->simduid
5539 && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5540 && loop->simduid
5541 == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5543 edge e = single_exit (loop);
5544 basic_block merge_bb = e->dest;
5545 imm_use_iterator imm_iter;
5546 use_operand_p use_p;
5547 tree lhs = gimple_call_lhs (stmt);
5549 FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5551 gimple use_stmt = USE_STMT (use_p);
5552 if (gimple_code (use_stmt) == GIMPLE_PHI
5553 && gimple_bb (use_stmt) == merge_bb)
5555 if (vec_stmt)
5557 tree vfm1
5558 = build_int_cst (unsigned_type_node,
5559 loop_vinfo->vectorization_factor - 1);
5560 SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5562 return true;
5567 return false;
5570 if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5571 return false;
5573 /* FORNOW. CHECKME. */
5574 if (nested_in_vect_loop_p (loop, stmt))
5575 return false;
5577 code = gimple_assign_rhs_code (stmt);
5578 op_type = TREE_CODE_LENGTH (code);
5579 rhs_class = get_gimple_rhs_class (code);
5580 gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5581 gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5583 /* FORNOW: support only if all uses are invariant. This means
5584 that the scalar operations can remain in place, unvectorized.
5585 The original last scalar value that they compute will be used. */
5587 for (i = 0; i < op_type; i++)
5589 if (rhs_class == GIMPLE_SINGLE_RHS)
5590 op = TREE_OPERAND (gimple_op (stmt, 1), i);
5591 else
5592 op = gimple_op (stmt, i + 1);
5593 if (op
5594 && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5595 &dt))
5597 if (dump_enabled_p ())
5598 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5599 "use not simple.\n");
5600 return false;
5603 if (dt != vect_external_def && dt != vect_constant_def)
5604 return false;
5607 /* No transformation is required for the cases we currently support. */
5608 return true;
5611 /* Kill any debug uses outside LOOP of SSA names defined in STMT. */
5613 static void
5614 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5616 ssa_op_iter op_iter;
5617 imm_use_iterator imm_iter;
5618 def_operand_p def_p;
5619 gimple ustmt;
5621 FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5623 FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5625 basic_block bb;
5627 if (!is_gimple_debug (ustmt))
5628 continue;
5630 bb = gimple_bb (ustmt);
5632 if (!flow_bb_inside_loop_p (loop, bb))
5634 if (gimple_debug_bind_p (ustmt))
5636 if (dump_enabled_p ())
5637 dump_printf_loc (MSG_NOTE, vect_location,
5638 "killing debug use\n");
5640 gimple_debug_bind_reset_value (ustmt);
5641 update_stmt (ustmt);
5643 else
5644 gcc_unreachable ();
5651 /* This function builds ni_name = number of iterations. Statements
5652 are emitted on the loop preheader edge. */
5654 static tree
5655 vect_build_loop_niters (loop_vec_info loop_vinfo)
5657 tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5658 if (TREE_CODE (ni) == INTEGER_CST)
5659 return ni;
5660 else
5662 tree ni_name, var;
5663 gimple_seq stmts = NULL;
5664 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5666 var = create_tmp_var (TREE_TYPE (ni), "niters");
5667 ni_name = force_gimple_operand (ni, &stmts, false, var);
5668 if (stmts)
5669 gsi_insert_seq_on_edge_immediate (pe, stmts);
5671 return ni_name;
5676 /* This function generates the following statements:
5678 ni_name = number of iterations the loop executes
5679 ratio = ni_name / vf
5680 ratio_mult_vf_name = ratio * vf
5682 and places them on the loop preheader edge. */
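/* Worked example (not in the original source): with ni_name == 103 and a
   vectorization factor vf == 8, the statements emitted on the preheader
   compute ratio == 103 / 8 == 12 and ratio_mult_vf == 12 * 8 == 96; the
   vectorized loop then executes 12 iterations covering the first 96
   scalar iterations and the epilogue loop handles the remaining 7.  */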
5684 static void
5685 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5686 tree ni_name,
5687 tree *ratio_mult_vf_name_ptr,
5688 tree *ratio_name_ptr)
5690 tree ni_minus_gap_name;
5691 tree var;
5692 tree ratio_name;
5693 tree ratio_mult_vf_name;
5694 int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5695 edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5696 tree log_vf;
5698 log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5700 /* If an epilogue loop is required because of data accesses with gaps,
5701 subtract one iteration from the total number of iterations here so
5702 that RATIO is calculated correctly. */
5703 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5705 ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5706 ni_name,
5707 build_one_cst (TREE_TYPE (ni_name)));
5708 if (!is_gimple_val (ni_minus_gap_name))
5710 var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5711 gimple stmts = NULL;
5712 ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5713 true, var);
5714 gsi_insert_seq_on_edge_immediate (pe, stmts);
5717 else
5718 ni_minus_gap_name = ni_name;
5720 /* Create: ratio = ni >> log2(vf) */
5721 /* ??? As we have ni == number of latch executions + 1, ni could
5722 have overflowed to zero. So avoid computing the ratio directly
5723 from ni; instead use the fact that the ratio is known to be at
5724 least one and compute it as ((ni - vf) >> log2(vf)) + 1. */
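/* Worked illustration of the trick above (not in the original source):
   for ni == 103 and vf == 8, ((103 - 8) >> 3) + 1 == 11 + 1 == 12, the
   same as the naive 103 >> 3.  If instead the latch executed exactly
   TYPE_MAX times so that ni wrapped to 0, ((0 - 8) >> 3) + 1 in the
   unsigned type still yields the correct ratio, whereas 0 >> 3 would
   give 0.  */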
5725 ratio_name
5726 = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5727 fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5728 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5729 ni_minus_gap_name,
5730 build_int_cst
5731 (TREE_TYPE (ni_name), vf)),
5732 log_vf),
5733 build_int_cst (TREE_TYPE (ni_name), 1));
5734 if (!is_gimple_val (ratio_name))
5736 var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5737 gimple stmts = NULL;
5738 ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5739 gsi_insert_seq_on_edge_immediate (pe, stmts);
5741 *ratio_name_ptr = ratio_name;
5743 /* Create: ratio_mult_vf = ratio << log2 (vf). */
5745 if (ratio_mult_vf_name_ptr)
5747 ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5748 ratio_name, log_vf);
5749 if (!is_gimple_val (ratio_mult_vf_name))
5751 var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5752 gimple stmts = NULL;
5753 ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5754 true, var);
5755 gsi_insert_seq_on_edge_immediate (pe, stmts);
5757 *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5760 return;
5764 /* Function vect_transform_loop.
5766 The analysis phase has determined that the loop is vectorizable.
5767 Vectorize the loop - create vectorized stmts to replace the scalar
5768 stmts in the loop, and update the loop exit condition. */
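/* Descriptive overview (not in the original source): the transformation
   below proceeds roughly as follows: (1) version the loop if runtime
   alignment or alias checks are required, (2) peel a prologue for
   alignment if needed, (3) peel a scalar epilogue when the iteration
   count is unknown or is not a multiple of VF, (4) walk every basic block
   and replace each relevant scalar PHI and stmt by its vector form, and
   (5) make the vector loop iterate RATIO times and scale the profile.  */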
5770 void
5771 vect_transform_loop (loop_vec_info loop_vinfo)
5773 struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5774 basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5775 int nbbs = loop->num_nodes;
5776 gimple_stmt_iterator si;
5777 int i;
5778 tree ratio = NULL;
5779 int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5780 bool grouped_store;
5781 bool slp_scheduled = false;
5782 gimple stmt, pattern_stmt;
5783 gimple_seq pattern_def_seq = NULL;
5784 gimple_stmt_iterator pattern_def_si = gsi_none ();
5785 bool transform_pattern_stmt = false;
5786 bool check_profitability = false;
5787 int th;
5788 /* Record the number of iterations before we start tampering with the profile. */
5789 gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5791 if (dump_enabled_p ())
5792 dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5794 /* If the profile is imprecise, we have a chance to fix it up. */
5795 if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5796 expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5798 /* Use the more conservative vectorization threshold. If the number
5799 of iterations is constant, assume the cost check has been performed
5800 by our caller. If the threshold makes all loops profitable that
5801 run at least the vectorization factor number of times, checking
5802 is pointless, too. */
5803 th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5804 if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5805 && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5807 if (dump_enabled_p ())
5808 dump_printf_loc (MSG_NOTE, vect_location,
5809 "Profitability threshold is %d loop iterations.\n",
5810 th);
5811 check_profitability = true;
5814 /* Version the loop first, if required, so the profitability check
5815 comes first. */
5817 if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5818 || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5820 vect_loop_versioning (loop_vinfo, th, check_profitability);
5821 check_profitability = false;
5824 tree ni_name = vect_build_loop_niters (loop_vinfo);
5825 LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5827 /* Peel the loop if there are data refs with unknown alignment.
5828 Only one data ref with unknown alignment is allowed. */
5830 if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5832 vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5833 th, check_profitability);
5834 check_profitability = false;
5835 /* The above adjusts LOOP_VINFO_NITERS, so force ni_name to
5836 be re-computed. */
5837 ni_name = NULL_TREE;
5840 /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5841 compile-time constant), or it is a constant that is not divisible by the
5842 vectorization factor, then an epilog loop needs to be created.
5843 We therefore duplicate the loop: the original loop will be vectorized,
5844 and will compute the first (n/VF) iterations. The second copy of the loop
5845 will remain scalar and will compute the remaining (n%VF) iterations.
5846 (VF is the vectorization factor). */
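/* Illustrative sketch of the result (not in the original source), assuming
   VF == 4, a symbolic bound n, and ignoring the peeling-for-gaps
   adjustment:

     for (i = 0; i < (n / 4) * 4; i += 4)     vectorized copy, n/4 iterations
       ... vector stmts ...
     for (; i < n; i++)                       scalar epilogue, n%4 iterations
       ... original scalar stmts ...
*/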
5848 if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5849 || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5851 tree ratio_mult_vf;
5852 if (!ni_name)
5853 ni_name = vect_build_loop_niters (loop_vinfo);
5854 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5855 &ratio);
5856 vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5857 th, check_profitability);
5859 else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5860 ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5861 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5862 else
5864 if (!ni_name)
5865 ni_name = vect_build_loop_niters (loop_vinfo);
5866 vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5869 /* 1) Make sure the loop header has exactly two entries
5870 2) Make sure we have a preheader basic block. */
5872 gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5874 split_edge (loop_preheader_edge (loop));
5876 /* FORNOW: the vectorizer supports only loops whose body consists
5877 of one basic block (header + empty latch). When the vectorizer
5878 supports more involved loop forms, the order in which the BBs are
5879 traversed needs to be reconsidered. */
5881 for (i = 0; i < nbbs; i++)
5883 basic_block bb = bbs[i];
5884 stmt_vec_info stmt_info;
5885 gimple phi;
5887 for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5889 phi = gsi_stmt (si);
5890 if (dump_enabled_p ())
5892 dump_printf_loc (MSG_NOTE, vect_location,
5893 "------>vectorizing phi: ");
5894 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5895 dump_printf (MSG_NOTE, "\n");
5897 stmt_info = vinfo_for_stmt (phi);
5898 if (!stmt_info)
5899 continue;
5901 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5902 vect_loop_kill_debug_uses (loop, phi);
5904 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5905 && !STMT_VINFO_LIVE_P (stmt_info))
5906 continue;
5908 if (STMT_VINFO_VECTYPE (stmt_info)
5909 && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5910 != (unsigned HOST_WIDE_INT) vectorization_factor)
5911 && dump_enabled_p ())
5912 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5914 if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5916 if (dump_enabled_p ())
5917 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5918 vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5922 pattern_stmt = NULL;
5923 for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5925 bool is_store;
5927 if (transform_pattern_stmt)
5928 stmt = pattern_stmt;
5929 else
5931 stmt = gsi_stmt (si);
5932 /* During vectorization remove existing clobber stmts. */
5933 if (gimple_clobber_p (stmt))
5935 unlink_stmt_vdef (stmt);
5936 gsi_remove (&si, true);
5937 release_defs (stmt);
5938 continue;
5942 if (dump_enabled_p ())
5944 dump_printf_loc (MSG_NOTE, vect_location,
5945 "------>vectorizing statement: ");
5946 dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5947 dump_printf (MSG_NOTE, "\n");
5950 stmt_info = vinfo_for_stmt (stmt);
5952 /* vector stmts created in the outer-loop during vectorization of
5953 stmts in an inner-loop may not have a stmt_info, and do not
5954 need to be vectorized. */
5955 if (!stmt_info)
5957 gsi_next (&si);
5958 continue;
5961 if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5962 vect_loop_kill_debug_uses (loop, stmt);
5964 if (!STMT_VINFO_RELEVANT_P (stmt_info)
5965 && !STMT_VINFO_LIVE_P (stmt_info))
5967 if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5968 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5969 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5970 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5972 stmt = pattern_stmt;
5973 stmt_info = vinfo_for_stmt (stmt);
5975 else
5977 gsi_next (&si);
5978 continue;
5981 else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5982 && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5983 && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5984 || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5985 transform_pattern_stmt = true;
5987 /* If pattern statement has def stmts, vectorize them too. */
5988 if (is_pattern_stmt_p (stmt_info))
5990 if (pattern_def_seq == NULL)
5992 pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
5993 pattern_def_si = gsi_start (pattern_def_seq);
5995 else if (!gsi_end_p (pattern_def_si))
5996 gsi_next (&pattern_def_si);
5997 if (pattern_def_seq != NULL)
5999 gimple pattern_def_stmt = NULL;
6000 stmt_vec_info pattern_def_stmt_info = NULL;
6002 while (!gsi_end_p (pattern_def_si))
6004 pattern_def_stmt = gsi_stmt (pattern_def_si);
6005 pattern_def_stmt_info
6006 = vinfo_for_stmt (pattern_def_stmt);
6007 if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6008 || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6009 break;
6010 gsi_next (&pattern_def_si);
6013 if (!gsi_end_p (pattern_def_si))
6015 if (dump_enabled_p ())
6017 dump_printf_loc (MSG_NOTE, vect_location,
6018 "==> vectorizing pattern def "
6019 "stmt: ");
6020 dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6021 pattern_def_stmt, 0);
6022 dump_printf (MSG_NOTE, "\n");
6025 stmt = pattern_def_stmt;
6026 stmt_info = pattern_def_stmt_info;
6028 else
6030 pattern_def_si = gsi_none ();
6031 transform_pattern_stmt = false;
6034 else
6035 transform_pattern_stmt = false;
6038 if (STMT_VINFO_VECTYPE (stmt_info))
6040 unsigned int nunits
6041 = (unsigned int)
6042 TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6043 if (!STMT_SLP_TYPE (stmt_info)
6044 && nunits != (unsigned int) vectorization_factor
6045 && dump_enabled_p ())
6046 /* For SLP, VF is set according to the unrolling factor and not
6047 to the vector size, hence for SLP this diagnostic is not valid. */
6048 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6051 /* SLP. Schedule all the SLP instances when the first SLP stmt is
6052 reached. */
6053 if (STMT_SLP_TYPE (stmt_info))
6055 if (!slp_scheduled)
6057 slp_scheduled = true;
6059 if (dump_enabled_p ())
6060 dump_printf_loc (MSG_NOTE, vect_location,
6061 "=== scheduling SLP instances ===\n");
6063 vect_schedule_slp (loop_vinfo, NULL);
6066 /* Hybrid SLP stmts must be vectorized in addition to SLP. */
6067 if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6069 if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6071 pattern_def_seq = NULL;
6072 gsi_next (&si);
6074 continue;
6078 /* -------- vectorize statement ------------ */
6079 if (dump_enabled_p ())
6080 dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6082 grouped_store = false;
6083 is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6084 if (is_store)
6086 if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6088 /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6089 interleaving chain was completed - free all the stores in
6090 the chain. */
6091 gsi_next (&si);
6092 vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6094 else
6096 /* Free the attached stmt_vec_info and remove the stmt. */
6097 gimple store = gsi_stmt (si);
6098 free_stmt_vec_info (store);
6099 unlink_stmt_vdef (store);
6100 gsi_remove (&si, true);
6101 release_defs (store);
6104 /* Stores can only appear at the end of pattern statements. */
6105 gcc_assert (!transform_pattern_stmt);
6106 pattern_def_seq = NULL;
6108 else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6110 pattern_def_seq = NULL;
6111 gsi_next (&si);
6113 } /* stmts in BB */
6114 } /* BBs in loop */
6116 slpeel_make_loop_iterate_ntimes (loop, ratio);
6118 /* Reduce loop iterations by the vectorization factor. */
6119 scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6120 expected_iterations / vectorization_factor);
6121 loop->nb_iterations_upper_bound
6122 = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
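/* Worked example (not in the original source): if the recorded upper bound
   on latch executions was 103 and the vectorization factor is 4, the
   bound for the vector loop becomes floor (103 / 4) == 25; the profile is
   likewise scaled down by a factor of 4 above.  */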
6123 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6124 && loop->nb_iterations_upper_bound != 0)
6125 loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6126 if (loop->any_estimate)
6128 loop->nb_iterations_estimate
6129 = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6130 if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6131 && loop->nb_iterations_estimate != 0)
6132 loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6135 if (dump_enabled_p ())
6137 dump_printf_loc (MSG_NOTE, vect_location,
6138 "LOOP VECTORIZED\n");
6139 if (loop->inner)
6140 dump_printf_loc (MSG_NOTE, vect_location,
6141 "OUTER LOOP VECTORIZED\n");
6142 dump_printf (MSG_NOTE, "\n");