gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2014 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "tree.h"
  28 #include "stor-layout.h"
  29 #include "predict.h"
  30 #include "vec.h"
  31 #include "hashtab.h"
  32 #include "hash-set.h"
  33 #include "machmode.h"
  34 #include "hard-reg-set.h"
  35 #include "input.h"
  36 #include "function.h"
  37 #include "dominance.h"
  38 #include "cfg.h"
  39 #include "cfganal.h"
  40 #include "basic-block.h"
  41 #include "gimple-pretty-print.h"
  42 #include "tree-ssa-alias.h"
  43 #include "internal-fn.h"
  44 #include "gimple-expr.h"
  45 #include "is-a.h"
  46 #include "gimple.h"
  47 #include "gimplify.h"
  48 #include "gimple-iterator.h"
  49 #include "gimplify-me.h"
  50 #include "gimple-ssa.h"
  51 #include "tree-phinodes.h"
  52 #include "ssa-iterators.h"
  53 #include "stringpool.h"
  54 #include "tree-ssanames.h"
  55 #include "tree-ssa-loop-ivopts.h"
  56 #include "tree-ssa-loop-manip.h"
  57 #include "tree-ssa-loop-niter.h"
  58 #include "tree-pass.h"
  59 #include "cfgloop.h"
  60 #include "expr.h"
  61 #include "recog.h"
  62 #include "optabs.h"
  63 #include "params.h"
  64 #include "diagnostic-core.h"
  65 #include "tree-chrec.h"
  66 #include "tree-scalar-evolution.h"
  67 #include "tree-vectorizer.h"
  68 #include "target.h"
  69
  70 /* Loop Vectorization Pass.
  71
  72    This pass tries to vectorize loops.
  73
  74    For example, the vectorizer transforms the following simple loop:
  75
  76         short a[N]; short b[N]; short c[N]; int i;
  77
  78         for (i=0; i<N; i++){
  79           a[i] = b[i] + c[i];
  80         }
  81
  82    as if it was manually vectorized by rewriting the source code into:
  83
  84         typedef int __attribute__((mode(V8HI))) v8hi;
  85         short a[N];  short b[N]; short c[N];   int i;
  86         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  87         v8hi va, vb, vc;
  88
  89         for (i=0; i<N/8; i++){
  90           vb = pb[i];
  91           vc = pc[i];
  92           va = vb + vc;
  93           pa[i] = va;
  94         }
  95
  96         The main entry to this pass is vectorize_loops(), in which
  97    the vectorizer applies a set of analyses on a given set of loops,
  98    followed by the actual vectorization transformation for the loops that
  99    had successfully passed the analysis phase.
 100         Throughout this pass we make a distinction between two types of
 101    data: scalars (which are represented by SSA_NAMES), and memory references
 102    ("data-refs").  These two types of data require different handling both
 103    during analysis and transformation. The types of data-refs that the
 104    vectorizer currently supports are ARRAY_REFS which base is an array DECL
 105    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
 106    accesses are required to have a simple (consecutive) access pattern.
 107
 108    Analysis phase:
 109    ===============
 110         The driver for the analysis phase is vect_analyze_loop().
 111    It applies a set of analyses, some of which rely on the scalar evolution
 112    analyzer (scev) developed by Sebastian Pop.
 113
 114         During the analysis phase the vectorizer records some information
 115    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 116    loop, as well as general information about the loop as a whole, which is
 117    recorded in a "loop_vec_info" struct attached to each loop.
 118
 119    Transformation phase:
 120    =====================
 121         The loop transformation phase scans all the stmts in the loop, and
 122    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 123    the loop that needs to be vectorized.  It inserts the vector code sequence
 124    just before the scalar stmt S, and records a pointer to the vector code
 125    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 126    attached to S).  This pointer will be used for the vectorization of following
 127    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 128    otherwise, we rely on dead code elimination for removing it.
 129
 130         For example, say stmt S1 was vectorized into stmt VS1:
 131
 132    VS1: vb = px[i];
 133    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 134    S2:  a = b;
 135
 136    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 137    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 138    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 139    resulting sequence would be:
 140
 141    VS1: vb = px[i];
 142    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 143    VS2: va = vb;
 144    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 145
 146         Operands that are not SSA_NAMEs, are data-refs that appear in
 147    load/store operations (like 'x[i]' in S1), and are handled differently.
 148
 149    Target modeling:
 150    =================
 151         Currently the only target specific information that is used is the
 152    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 153    Targets that can support different sizes of vectors, for now will need
 154    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 155    flexibility will be added in the future.
 156
 157         Since we only vectorize operations which vector form can be
 158    expressed using existing tree codes, to verify that an operation is
 159    supported, the vectorizer checks the relevant optab at the relevant
 160    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 161    the value found is CODE_FOR_nothing, then there's no target support, and
 162    we can't vectorize the stmt.
 163
 164    For additional information on this project see:
 165    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 166 */
 167
 168 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 169
 170 /* Function vect_determine_vectorization_factor
 171
 172    Determine the vectorization factor (VF).  VF is the number of data elements
 173    that are operated upon in parallel in a single iteration of the vectorized
 174    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 175    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 176    elements can fit in a single vector register.
 177
 178    We currently support vectorization of loops in which all types operated upon
 179    are of the same size.  Therefore this function currently sets VF according to
 180    the size of the types operated upon, and fails if there are multiple sizes
 181    in the loop.
 182
 183    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 184    original loop:
 185         for (i=0; i<N; i++){
 186           a[i] = b[i] + c[i];
 187         }
 188
 189    vectorized loop:
 190         for (i=0; i<N; i+=VF){
 191           a[i:VF] = b[i:VF] + c[i:VF];
 192         }
 193 */
 194
 195 static bool
 196 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 197 {
 198   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 199   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 200   int nbbs = loop->num_nodes;
 201   gimple_stmt_iterator si;
 202   unsigned int vectorization_factor = 0;
 203   tree scalar_type;
 204   gimple phi;
 205   tree vectype;
 206   unsigned int nunits;
 207   stmt_vec_info stmt_info;
 208   int i;
 209   HOST_WIDE_INT dummy;
 210   gimple stmt, pattern_stmt = NULL;
 211   gimple_seq pattern_def_seq = NULL;
 212   gimple_stmt_iterator pattern_def_si = gsi_none ();
 213   bool analyze_pattern_stmt = false;
 214
 215   if (dump_enabled_p ())
 216     dump_printf_loc (MSG_NOTE, vect_location,
 217                      "=== vect_determine_vectorization_factor ===\n");
 218
 219   for (i = 0; i < nbbs; i++)
 220     {
 221       basic_block bb = bbs[i];
 222
 223       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 224         {
 225           phi = gsi_stmt (si);
 226           stmt_info = vinfo_for_stmt (phi);
 227           if (dump_enabled_p ())
 228             {
 229               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 230               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 231               dump_printf (MSG_NOTE, "\n");
 232             }
 233
 234           gcc_assert (stmt_info);
 235
 236           if (STMT_VINFO_RELEVANT_P (stmt_info))
 237             {
 238               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 239               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 240
 241               if (dump_enabled_p ())
 242                 {
 243                   dump_printf_loc (MSG_NOTE, vect_location,
 244                                    "get vectype for scalar type:  ");
 245                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 246                   dump_printf (MSG_NOTE, "\n");
 247                 }
 248
 249               vectype = get_vectype_for_scalar_type (scalar_type);
 250               if (!vectype)
 251                 {
 252                   if (dump_enabled_p ())
 253                     {
 254                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 255                                        "not vectorized: unsupported "
 256                                        "data-type ");
 257                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 258                                          scalar_type);
 259                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 260                     }
 261                   return false;
 262                 }
 263               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 264
 265               if (dump_enabled_p ())
 266                 {
 267                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 268                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 269                   dump_printf (MSG_NOTE, "\n");
 270                 }
 271
 272               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 273               if (dump_enabled_p ())
 274                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 275                                  nunits);
 276
 277               if (!vectorization_factor
 278                   || (nunits > vectorization_factor))
 279                 vectorization_factor = nunits;
 280             }
 281         }
 282
 283       for (si = gsi_start_bb (bb); !gsi_end_p (si) || analyze_pattern_stmt;)
 284         {
 285           tree vf_vectype;
 286
 287           if (analyze_pattern_stmt)
 288             stmt = pattern_stmt;
 289           else
 290             stmt = gsi_stmt (si);
 291
 292           stmt_info = vinfo_for_stmt (stmt);
 293
 294           if (dump_enabled_p ())
 295             {
 296               dump_printf_loc (MSG_NOTE, vect_location,
 297                                "==> examining statement: ");
 298               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 299               dump_printf (MSG_NOTE, "\n");
 300             }
 301
 302           gcc_assert (stmt_info);
 303
 304           /* Skip stmts which do not need to be vectorized.  */
 305           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 306                && !STMT_VINFO_LIVE_P (stmt_info))
 307               || gimple_clobber_p (stmt))
 308             {
 309               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 310                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 311                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 312                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 313                 {
 314                   stmt = pattern_stmt;
 315                   stmt_info = vinfo_for_stmt (pattern_stmt);
 316                   if (dump_enabled_p ())
 317                     {
 318                       dump_printf_loc (MSG_NOTE, vect_location,
 319                                        "==> examining pattern statement: ");
 320                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 321                       dump_printf (MSG_NOTE, "\n");
 322                     }
 323                 }
 324               else
 325                 {
 326                   if (dump_enabled_p ())
 327                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 328                   gsi_next (&si);
 329                   continue;
 330                 }
 331             }
 332           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 333                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 334                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 335                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 336             analyze_pattern_stmt = true;
 337
 338           /* If a pattern statement has def stmts, analyze them too.  */
 339           if (is_pattern_stmt_p (stmt_info))
 340             {
 341               if (pattern_def_seq == NULL)
 342                 {
 343                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 344                   pattern_def_si = gsi_start (pattern_def_seq);
 345                 }
 346               else if (!gsi_end_p (pattern_def_si))
 347                 gsi_next (&pattern_def_si);
 348               if (pattern_def_seq != NULL)
 349                 {
 350                   gimple pattern_def_stmt = NULL;
 351                   stmt_vec_info pattern_def_stmt_info = NULL;
 352
 353                   while (!gsi_end_p (pattern_def_si))
 354                     {
 355                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 356                       pattern_def_stmt_info
 357                         = vinfo_for_stmt (pattern_def_stmt);
 358                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 359                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 360                         break;
 361                       gsi_next (&pattern_def_si);
 362                     }
 363
 364                   if (!gsi_end_p (pattern_def_si))
 365                     {
 366                       if (dump_enabled_p ())
 367                         {
 368                           dump_printf_loc (MSG_NOTE, vect_location,
 369                                            "==> examining pattern def stmt: ");
 370                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 371                                             pattern_def_stmt, 0);
 372                           dump_printf (MSG_NOTE, "\n");
 373                         }
 374
 375                       stmt = pattern_def_stmt;
 376                       stmt_info = pattern_def_stmt_info;
 377                     }
 378                   else
 379                     {
 380                       pattern_def_si = gsi_none ();
 381                       analyze_pattern_stmt = false;
 382                     }
 383                 }
 384               else
 385                 analyze_pattern_stmt = false;
 386             }
 387
 388           if (gimple_get_lhs (stmt) == NULL_TREE
 389               /* MASK_STORE has no lhs, but is ok.  */
 390               && (!is_gimple_call (stmt)
 391                   || !gimple_call_internal_p (stmt)
 392                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 393             {
 394               if (is_gimple_call (stmt))
 395                 {
 396                   /* Ignore calls with no lhs.  These must be calls to
 397                      #pragma omp simd functions, and what vectorization factor
 398                      it really needs can't be determined until
 399                      vectorizable_simd_clone_call.  */
 400                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 401                     {
 402                       pattern_def_seq = NULL;
 403                       gsi_next (&si);
 404                     }
 405                   continue;
 406                 }
 407               if (dump_enabled_p ())
 408                 {
 409                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 410                                    "not vectorized: irregular stmt.");
 411                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 412                                     0);
 413                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 414                 }
 415               return false;
 416             }
 417
 418           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 419             {
 420               if (dump_enabled_p ())
 421                 {
 422                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 423                                    "not vectorized: vector stmt in loop:");
 424                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 425                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 426                 }
 427               return false;
 428             }
 429
 430           if (STMT_VINFO_VECTYPE (stmt_info))
 431             {
 432               /* The only case when a vectype had been already set is for stmts
 433                  that contain a dataref, or for "pattern-stmts" (stmts
 434                  generated by the vectorizer to represent/replace a certain
 435                  idiom).  */
 436               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 437                           || is_pattern_stmt_p (stmt_info)
 438                           || !gsi_end_p (pattern_def_si));
 439               vectype = STMT_VINFO_VECTYPE (stmt_info);
 440             }
 441           else
 442             {
 443               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 444               if (is_gimple_call (stmt)
 445                   && gimple_call_internal_p (stmt)
 446                   && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
 447                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 448               else
 449                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 450               if (dump_enabled_p ())
 451                 {
 452                   dump_printf_loc (MSG_NOTE, vect_location,
 453                                    "get vectype for scalar type:  ");
 454                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 455                   dump_printf (MSG_NOTE, "\n");
 456                 }
 457               vectype = get_vectype_for_scalar_type (scalar_type);
 458               if (!vectype)
 459                 {
 460                   if (dump_enabled_p ())
 461                     {
 462                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 463                                        "not vectorized: unsupported "
 464                                        "data-type ");
 465                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 466                                          scalar_type);
 467                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 468                     }
 469                   return false;
 470                 }
 471
 472               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 473
 474               if (dump_enabled_p ())
 475                 {
 476                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 477                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 478                   dump_printf (MSG_NOTE, "\n");
 479                 }
 480             }
 481
 482           /* The vectorization factor is according to the smallest
 483              scalar type (or the largest vector size, but we only
 484              support one vector size per loop).  */
 485           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 486                                                        &dummy);
 487           if (dump_enabled_p ())
 488             {
 489               dump_printf_loc (MSG_NOTE, vect_location,
 490                                "get vectype for scalar type:  ");
 491               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 492               dump_printf (MSG_NOTE, "\n");
 493             }
 494           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 495           if (!vf_vectype)
 496             {
 497               if (dump_enabled_p ())
 498                 {
 499                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 500                                    "not vectorized: unsupported data-type ");
 501                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 502                                      scalar_type);
 503                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 504                 }
 505               return false;
 506             }
 507
 508           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 509                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 510             {
 511               if (dump_enabled_p ())
 512                 {
 513                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 514                                    "not vectorized: different sized vector "
 515                                    "types in statement, ");
 516                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 517                                      vectype);
 518                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 519                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 520                                      vf_vectype);
 521                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 522                 }
 523               return false;
 524             }
 525
 526           if (dump_enabled_p ())
 527             {
 528               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 529               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 530               dump_printf (MSG_NOTE, "\n");
 531             }
 532
 533           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 534           if (dump_enabled_p ())
 535             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 536           if (!vectorization_factor
 537               || (nunits > vectorization_factor))
 538             vectorization_factor = nunits;
 539
 540           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 541             {
 542               pattern_def_seq = NULL;
 543               gsi_next (&si);
 544             }
 545         }
 546     }
 547
 548   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 549   if (dump_enabled_p ())
 550     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 551                      vectorization_factor);
 552   if (vectorization_factor <= 1)
 553     {
 554       if (dump_enabled_p ())
 555         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 556                          "not vectorized: unsupported data-type\n");
 557       return false;
 558     }
 559   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 560
 561   return true;
 562 }
 563
 564
 565 /* Function vect_is_simple_iv_evolution.
 566
 567    FORNOW: A simple evolution of an induction variables in the loop is
 568    considered a polynomial evolution.  */
 569
 570 static bool
 571 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 572                              tree * step)
 573 {
 574   tree init_expr;
 575   tree step_expr;
 576   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 577   basic_block bb;
 578
 579   /* When there is no evolution in this loop, the evolution function
 580      is not "simple".  */
 581   if (evolution_part == NULL_TREE)
 582     return false;
 583
 584   /* When the evolution is a polynomial of degree >= 2
 585      the evolution function is not "simple".  */
 586   if (tree_is_chrec (evolution_part))
 587     return false;
 588
 589   step_expr = evolution_part;
 590   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 591
 592   if (dump_enabled_p ())
 593     {
 594       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 595       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 596       dump_printf (MSG_NOTE, ",  init: ");
 597       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 598       dump_printf (MSG_NOTE, "\n");
 599     }
 600
 601   *init = init_expr;
 602   *step = step_expr;
 603
 604   if (TREE_CODE (step_expr) != INTEGER_CST
 605       && (TREE_CODE (step_expr) != SSA_NAME
 606           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 607               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 608           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 609               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 610                   || !flag_associative_math)))
 611       && (TREE_CODE (step_expr) != REAL_CST
 612           || !flag_associative_math))
 613     {
 614       if (dump_enabled_p ())
 615         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 616                          "step unknown.\n");
 617       return false;
 618     }
 619
 620   return true;
 621 }
 622
 623 /* Function vect_analyze_scalar_cycles_1.
 624
 625    Examine the cross iteration def-use cycles of scalar variables
 626    in LOOP.  LOOP_VINFO represents the loop that is now being
 627    considered for vectorization (can be LOOP, or an outer-loop
 628    enclosing LOOP).  */
 629
 630 static void
 631 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 632 {
 633   basic_block bb = loop->header;
 634   tree init, step;
 635   auto_vec<gimple, 64> worklist;
 636   gimple_stmt_iterator gsi;
 637   bool double_reduc;
 638
 639   if (dump_enabled_p ())
 640     dump_printf_loc (MSG_NOTE, vect_location,
 641                      "=== vect_analyze_scalar_cycles ===\n");
 642
 643   /* First - identify all inductions.  Reduction detection assumes that all the
 644      inductions have been identified, therefore, this order must not be
 645      changed.  */
 646   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 647     {
 648       gimple phi = gsi_stmt (gsi);
 649       tree access_fn = NULL;
 650       tree def = PHI_RESULT (phi);
 651       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 652
 653       if (dump_enabled_p ())
 654         {
 655           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 656           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 657           dump_printf (MSG_NOTE, "\n");
 658         }
 659
 660       /* Skip virtual phi's.  The data dependences that are associated with
 661          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 662       if (virtual_operand_p (def))
 663         continue;
 664
 665       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 666
 667       /* Analyze the evolution function.  */
 668       access_fn = analyze_scalar_evolution (loop, def);
 669       if (access_fn)
 670         {
 671           STRIP_NOPS (access_fn);
 672           if (dump_enabled_p ())
 673             {
 674               dump_printf_loc (MSG_NOTE, vect_location,
 675                                "Access function of PHI: ");
 676               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 677               dump_printf (MSG_NOTE, "\n");
 678             }
 679           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 680             = evolution_part_in_loop_num (access_fn, loop->num);
 681         }
 682
 683       if (!access_fn
 684           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 685           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 686               && TREE_CODE (step) != INTEGER_CST))
 687         {
 688           worklist.safe_push (phi);
 689           continue;
 690         }
 691
 692       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 693
 694       if (dump_enabled_p ())
 695         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 696       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 697     }
 698
 699
 700   /* Second - identify all reductions and nested cycles.  */
 701   while (worklist.length () > 0)
 702     {
 703       gimple phi = worklist.pop ();
 704       tree def = PHI_RESULT (phi);
 705       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 706       gimple reduc_stmt;
 707       bool nested_cycle;
 708
 709       if (dump_enabled_p ())
 710         {
 711           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 712           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 713           dump_printf (MSG_NOTE, "\n");
 714         }
 715
 716       gcc_assert (!virtual_operand_p (def)
 717                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 718
 719       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 720       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 721                                                 &double_reduc);
 722       if (reduc_stmt)
 723         {
 724           if (double_reduc)
 725             {
 726               if (dump_enabled_p ())
 727                 dump_printf_loc (MSG_NOTE, vect_location,
 728                                  "Detected double reduction.\n");
 729
 730               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 731               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 732                                                     vect_double_reduction_def;
 733             }
 734           else
 735             {
 736               if (nested_cycle)
 737                 {
 738                   if (dump_enabled_p ())
 739                     dump_printf_loc (MSG_NOTE, vect_location,
 740                                      "Detected vectorizable nested cycle.\n");
 741
 742                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 743                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 744                                                              vect_nested_cycle;
 745                 }
 746               else
 747                 {
 748                   if (dump_enabled_p ())
 749                     dump_printf_loc (MSG_NOTE, vect_location,
 750                                      "Detected reduction.\n");
 751
 752                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 753                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 754                                                            vect_reduction_def;
 755                   /* Store the reduction cycles for possible vectorization in
 756                      loop-aware SLP.  */
 757                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 758                 }
 759             }
 760         }
 761       else
 762         if (dump_enabled_p ())
 763           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 764                            "Unknown def-use cycle pattern.\n");
 765     }
 766 }
 767
 768
 769 /* Function vect_analyze_scalar_cycles.
 770
 771    Examine the cross iteration def-use cycles of scalar variables, by
 772    analyzing the loop-header PHIs of scalar variables.  Classify each
 773    cycle as one of the following: invariant, induction, reduction, unknown.
 774    We do that for the loop represented by LOOP_VINFO, and also to its
 775    inner-loop, if exists.
 776    Examples for scalar cycles:
 777
 778    Example1: reduction:
 779
 780               loop1:
 781               for (i=0; i<N; i++)
 782                  sum += a[i];
 783
 784    Example2: induction:
 785
 786               loop2:
 787               for (i=0; i<N; i++)
 788                  a[i] = i;  */
 789
 790 static void
 791 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 792 {
 793   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 794
 795   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 796
 797   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 798      Reductions in such inner-loop therefore have different properties than
 799      the reductions in the nest that gets vectorized:
 800      1. When vectorized, they are executed in the same order as in the original
 801         scalar loop, so we can't change the order of computation when
 802         vectorizing them.
 803      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 804         current checks are too strict.  */
 805
 806   if (loop->inner)
 807     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 808 }
 809
 810
 811 /* Function vect_get_loop_niters.
 812
 813    Determine how many iterations the loop is executed and place it
 814    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 815    in NUMBER_OF_ITERATIONSM1.
 816
 817    Return the loop exit condition.  */
 818
 819 static gimple
 820 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
 821                       tree *number_of_iterationsm1)
 822 {
 823   tree niters;
 824
 825   if (dump_enabled_p ())
 826     dump_printf_loc (MSG_NOTE, vect_location,
 827                      "=== get_loop_niters ===\n");
 828
 829   niters = number_of_latch_executions (loop);
 830   *number_of_iterationsm1 = niters;
 831
 832   /* We want the number of loop header executions which is the number
 833      of latch executions plus one.
 834      ???  For UINT_MAX latch executions this number overflows to zero
 835      for loops like do { n++; } while (n != 0);  */
 836   if (niters && !chrec_contains_undetermined (niters))
 837     niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
 838                           build_int_cst (TREE_TYPE (niters), 1));
 839   *number_of_iterations = niters;
 840
 841   return get_loop_exit_condition (loop);
 842 }
 843
 844
 845 /* Function bb_in_loop_p
 846
 847    Used as predicate for dfs order traversal of the loop bbs.  */
 848
 849 static bool
 850 bb_in_loop_p (const_basic_block bb, const void *data)
 851 {
 852   const struct loop *const loop = (const struct loop *)data;
 853   if (flow_bb_inside_loop_p (loop, bb))
 854     return true;
 855   return false;
 856 }
 857
 858
 859 /* Function new_loop_vec_info.
 860
 861    Create and initialize a new loop_vec_info struct for LOOP, as well as
 862    stmt_vec_info structs for all the stmts in LOOP.  */
 863
 864 static loop_vec_info
 865 new_loop_vec_info (struct loop *loop)
 866 {
 867   loop_vec_info res;
 868   basic_block *bbs;
 869   gimple_stmt_iterator si;
 870   unsigned int i, nbbs;
 871
 872   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 873   LOOP_VINFO_LOOP (res) = loop;
 874
 875   bbs = get_loop_body (loop);
 876
 877   /* Create/Update stmt_info for all stmts in the loop.  */
 878   for (i = 0; i < loop->num_nodes; i++)
 879     {
 880       basic_block bb = bbs[i];
 881
 882       /* BBs in a nested inner-loop will have been already processed (because
 883          we will have called vect_analyze_loop_form for any nested inner-loop).
 884          Therefore, for stmts in an inner-loop we just want to update the
 885          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 886          loop_info of the outer-loop we are currently considering to vectorize
 887          (instead of the loop_info of the inner-loop).
 888          For stmts in other BBs we need to create a stmt_info from scratch.  */
 889       if (bb->loop_father != loop)
 890         {
 891           /* Inner-loop bb.  */
 892           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 893           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 894             {
 895               gimple phi = gsi_stmt (si);
 896               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 897               loop_vec_info inner_loop_vinfo =
 898                 STMT_VINFO_LOOP_VINFO (stmt_info);
 899               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 900               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 901             }
 902           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 903            {
 904               gimple stmt = gsi_stmt (si);
 905               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 906               loop_vec_info inner_loop_vinfo =
 907                  STMT_VINFO_LOOP_VINFO (stmt_info);
 908               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 909               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 910            }
 911         }
 912       else
 913         {
 914           /* bb in current nest.  */
 915           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 916             {
 917               gimple phi = gsi_stmt (si);
 918               gimple_set_uid (phi, 0);
 919               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 920             }
 921
 922           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 923             {
 924               gimple stmt = gsi_stmt (si);
 925               gimple_set_uid (stmt, 0);
 926               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 927             }
 928         }
 929     }
 930
 931   /* CHECKME: We want to visit all BBs before their successors (except for
 932      latch blocks, for which this assertion wouldn't hold).  In the simple
 933      case of the loop forms we allow, a dfs order of the BBs would the same
 934      as reversed postorder traversal, so we are safe.  */
 935
 936    free (bbs);
 937    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 938    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 939                               bbs, loop->num_nodes, loop);
 940    gcc_assert (nbbs == loop->num_nodes);
 941
 942   LOOP_VINFO_BBS (res) = bbs;
 943   LOOP_VINFO_NITERSM1 (res) = NULL;
 944   LOOP_VINFO_NITERS (res) = NULL;
 945   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 946   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 947   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
 948   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 949   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
 950   LOOP_VINFO_VECT_FACTOR (res) = 0;
 951   LOOP_VINFO_LOOP_NEST (res).create (3);
 952   LOOP_VINFO_DATAREFS (res).create (10);
 953   LOOP_VINFO_DDRS (res).create (10 * 10);
 954   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 955   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
 956              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 957   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
 958              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 959   LOOP_VINFO_GROUPED_STORES (res).create (10);
 960   LOOP_VINFO_REDUCTIONS (res).create (10);
 961   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
 962   LOOP_VINFO_SLP_INSTANCES (res).create (10);
 963   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 964   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 965   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 966   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
 967   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 968
 969   return res;
 970 }
 971
 972
 973 /* Function destroy_loop_vec_info.
 974
 975    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 976    stmts in the loop.  */
 977
 978 void
 979 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 980 {
 981   struct loop *loop;
 982   basic_block *bbs;
 983   int nbbs;
 984   gimple_stmt_iterator si;
 985   int j;
 986   vec<slp_instance> slp_instances;
 987   slp_instance instance;
 988   bool swapped;
 989
 990   if (!loop_vinfo)
 991     return;
 992
 993   loop = LOOP_VINFO_LOOP (loop_vinfo);
 994
 995   bbs = LOOP_VINFO_BBS (loop_vinfo);
 996   nbbs = clean_stmts ? loop->num_nodes : 0;
 997   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
 998
 999   for (j = 0; j < nbbs; j++)
1000     {
1001       basic_block bb = bbs[j];
1002       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1003         free_stmt_vec_info (gsi_stmt (si));
1004
1005       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1006         {
1007           gimple stmt = gsi_stmt (si);
1008
1009           /* We may have broken canonical form by moving a constant
1010              into RHS1 of a commutative op.  Fix such occurrences.  */
1011           if (swapped && is_gimple_assign (stmt))
1012             {
1013               enum tree_code code = gimple_assign_rhs_code (stmt);
1014
1015               if ((code == PLUS_EXPR
1016                    || code == POINTER_PLUS_EXPR
1017                    || code == MULT_EXPR)
1018                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1019                 swap_ssa_operands (stmt,
1020                                    gimple_assign_rhs1_ptr (stmt),
1021                                    gimple_assign_rhs2_ptr (stmt));
1022             }
1023
1024           /* Free stmt_vec_info.  */
1025           free_stmt_vec_info (stmt);
1026           gsi_next (&si);
1027         }
1028     }
1029
1030   free (LOOP_VINFO_BBS (loop_vinfo));
1031   vect_destroy_datarefs (loop_vinfo, NULL);
1032   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1033   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1034   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1035   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1036   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1037   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1038     vect_free_slp_instance (instance);
1039
1040   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1041   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1042   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1043   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1044
1045   delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1046   LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1047
1048   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1049
1050   free (loop_vinfo);
1051   loop->aux = NULL;
1052 }
1053
1054
1055 /* Function vect_analyze_loop_1.
1056
1057    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1058    for it. The different analyses will record information in the
1059    loop_vec_info struct.  This is a subset of the analyses applied in
1060    vect_analyze_loop, to be applied on an inner-loop nested in the loop
1061    that is now considered for (outer-loop) vectorization.  */
1062
1063 static loop_vec_info
1064 vect_analyze_loop_1 (struct loop *loop)
1065 {
1066   loop_vec_info loop_vinfo;
1067
1068   if (dump_enabled_p ())
1069     dump_printf_loc (MSG_NOTE, vect_location,
1070                      "===== analyze_loop_nest_1 =====\n");
1071
1072   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1073
1074   loop_vinfo = vect_analyze_loop_form (loop);
1075   if (!loop_vinfo)
1076     {
1077       if (dump_enabled_p ())
1078         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1079                          "bad inner-loop form.\n");
1080       return NULL;
1081     }
1082
1083   return loop_vinfo;
1084 }
1085
1086
1087 /* Function vect_analyze_loop_form.
1088
1089    Verify that certain CFG restrictions hold, including:
1090    - the loop has a pre-header
1091    - the loop has a single entry and exit
1092    - the loop exit condition is simple enough, and the number of iterations
1093      can be analyzed (a countable loop).  */
1094
1095 loop_vec_info
1096 vect_analyze_loop_form (struct loop *loop)
1097 {
1098   loop_vec_info loop_vinfo;
1099   gimple loop_cond;
1100   tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1101   loop_vec_info inner_loop_vinfo = NULL;
1102
1103   if (dump_enabled_p ())
1104     dump_printf_loc (MSG_NOTE, vect_location,
1105                      "=== vect_analyze_loop_form ===\n");
1106
1107   /* Different restrictions apply when we are considering an inner-most loop,
1108      vs. an outer (nested) loop.
1109      (FORNOW. May want to relax some of these restrictions in the future).  */
1110
1111   if (!loop->inner)
1112     {
1113       /* Inner-most loop.  We currently require that the number of BBs is
1114          exactly 2 (the header and latch).  Vectorizable inner-most loops
1115          look like this:
1116
1117                         (pre-header)
1118                            |
1119                           header <--------+
1120                            | |            |
1121                            | +--> latch --+
1122                            |
1123                         (exit-bb)  */
1124
1125       if (loop->num_nodes != 2)
1126         {
1127           if (dump_enabled_p ())
1128             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1129                              "not vectorized: control flow in loop.\n");
1130           return NULL;
1131         }
1132
1133       if (empty_block_p (loop->header))
1134         {
1135           if (dump_enabled_p ())
1136             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1137                              "not vectorized: empty loop.\n");
1138           return NULL;
1139         }
1140     }
1141   else
1142     {
1143       struct loop *innerloop = loop->inner;
1144       edge entryedge;
1145
1146       /* Nested loop. We currently require that the loop is doubly-nested,
1147          contains a single inner loop, and the number of BBs is exactly 5.
1148          Vectorizable outer-loops look like this:
1149
1150                         (pre-header)
1151                            |
1152                           header <---+
1153                            |         |
1154                           inner-loop |
1155                            |         |
1156                           tail ------+
1157                            |
1158                         (exit-bb)
1159
1160          The inner-loop has the properties expected of inner-most loops
1161          as described above.  */
1162
1163       if ((loop->inner)->inner || (loop->inner)->next)
1164         {
1165           if (dump_enabled_p ())
1166             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1167                              "not vectorized: multiple nested loops.\n");
1168           return NULL;
1169         }
1170
1171       /* Analyze the inner-loop.  */
1172       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1173       if (!inner_loop_vinfo)
1174         {
1175           if (dump_enabled_p ())
1176             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1177                              "not vectorized: Bad inner loop.\n");
1178           return NULL;
1179         }
1180
1181       if (!expr_invariant_in_loop_p (loop,
1182                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1183         {
1184           if (dump_enabled_p ())
1185             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186                              "not vectorized: inner-loop count not"
1187                              " invariant.\n");
1188           destroy_loop_vec_info (inner_loop_vinfo, true);
1189           return NULL;
1190         }
1191
1192       if (loop->num_nodes != 5)
1193         {
1194           if (dump_enabled_p ())
1195             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1196                              "not vectorized: control flow in loop.\n");
1197           destroy_loop_vec_info (inner_loop_vinfo, true);
1198           return NULL;
1199         }
1200
1201       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1202       entryedge = EDGE_PRED (innerloop->header, 0);
1203       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1204         entryedge = EDGE_PRED (innerloop->header, 1);
1205
1206       if (entryedge->src != loop->header
1207           || !single_exit (innerloop)
1208           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1209         {
1210           if (dump_enabled_p ())
1211             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1212                              "not vectorized: unsupported outerloop form.\n");
1213           destroy_loop_vec_info (inner_loop_vinfo, true);
1214           return NULL;
1215         }
1216
1217       if (dump_enabled_p ())
1218         dump_printf_loc (MSG_NOTE, vect_location,
1219                          "Considering outer-loop vectorization.\n");
1220     }
1221
1222   if (!single_exit (loop)
1223       || EDGE_COUNT (loop->header->preds) != 2)
1224     {
1225       if (dump_enabled_p ())
1226         {
1227           if (!single_exit (loop))
1228             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1229                              "not vectorized: multiple exits.\n");
1230           else if (EDGE_COUNT (loop->header->preds) != 2)
1231             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1232                              "not vectorized: too many incoming edges.\n");
1233         }
1234       if (inner_loop_vinfo)
1235         destroy_loop_vec_info (inner_loop_vinfo, true);
1236       return NULL;
1237     }
1238
1239   /* We assume that the loop exit condition is at the end of the loop. i.e,
1240      that the loop is represented as a do-while (with a proper if-guard
1241      before the loop if needed), where the loop header contains all the
1242      executable statements, and the latch is empty.  */
1243   if (!empty_block_p (loop->latch)
1244       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1245     {
1246       if (dump_enabled_p ())
1247         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1248                          "not vectorized: latch block not empty.\n");
1249       if (inner_loop_vinfo)
1250         destroy_loop_vec_info (inner_loop_vinfo, true);
1251       return NULL;
1252     }
1253
1254   /* Make sure there exists a single-predecessor exit bb:  */
1255   if (!single_pred_p (single_exit (loop)->dest))
1256     {
1257       edge e = single_exit (loop);
1258       if (!(e->flags & EDGE_ABNORMAL))
1259         {
1260           split_loop_exit_edge (e);
1261           if (dump_enabled_p ())
1262             dump_printf (MSG_NOTE, "split exit edge.\n");
1263         }
1264       else
1265         {
1266           if (dump_enabled_p ())
1267             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1268                              "not vectorized: abnormal loop exit edge.\n");
1269           if (inner_loop_vinfo)
1270             destroy_loop_vec_info (inner_loop_vinfo, true);
1271           return NULL;
1272         }
1273     }
1274
1275   loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1276                                     &number_of_iterationsm1);
1277   if (!loop_cond)
1278     {
1279       if (dump_enabled_p ())
1280         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1281                          "not vectorized: complicated exit condition.\n");
1282       if (inner_loop_vinfo)
1283         destroy_loop_vec_info (inner_loop_vinfo, true);
1284       return NULL;
1285     }
1286
1287   if (!number_of_iterations
1288       || chrec_contains_undetermined (number_of_iterations))
1289     {
1290       if (dump_enabled_p ())
1291         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1292                          "not vectorized: number of iterations cannot be "
1293                          "computed.\n");
1294       if (inner_loop_vinfo)
1295         destroy_loop_vec_info (inner_loop_vinfo, true);
1296       return NULL;
1297     }
1298
1299   if (integer_zerop (number_of_iterations))
1300     {
1301       if (dump_enabled_p ())
1302         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303                          "not vectorized: number of iterations = 0.\n");
1304       if (inner_loop_vinfo)
1305         destroy_loop_vec_info (inner_loop_vinfo, true);
1306       return NULL;
1307     }
1308
1309   loop_vinfo = new_loop_vec_info (loop);
1310   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1311   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1312   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1313
1314   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1315     {
1316       if (dump_enabled_p ())
1317         {
1318           dump_printf_loc (MSG_NOTE, vect_location,
1319                            "Symbolic number of iterations is ");
1320           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1321           dump_printf (MSG_NOTE, "\n");
1322         }
1323     }
1324
1325   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1326
1327   /* CHECKME: May want to keep it around it in the future.  */
1328   if (inner_loop_vinfo)
1329     destroy_loop_vec_info (inner_loop_vinfo, false);
1330
1331   gcc_assert (!loop->aux);
1332   loop->aux = loop_vinfo;
1333   return loop_vinfo;
1334 }
1335
1336
1337 /* Function vect_analyze_loop_operations.
1338
1339    Scan the loop stmts and make sure they are all vectorizable.  */
1340
1341 static bool
1342 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1343 {
1344   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1345   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1346   int nbbs = loop->num_nodes;
1347   gimple_stmt_iterator si;
1348   unsigned int vectorization_factor = 0;
1349   int i;
1350   gimple phi;
1351   stmt_vec_info stmt_info;
1352   bool need_to_vectorize = false;
1353   int min_profitable_iters;
1354   int min_scalar_loop_bound;
1355   unsigned int th;
1356   bool only_slp_in_loop = true, ok;
1357   HOST_WIDE_INT max_niter;
1358   HOST_WIDE_INT estimated_niter;
1359   int min_profitable_estimate;
1360
1361   if (dump_enabled_p ())
1362     dump_printf_loc (MSG_NOTE, vect_location,
1363                      "=== vect_analyze_loop_operations ===\n");
1364
1365   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1366   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1367   if (slp)
1368     {
1369       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1370          vectorization factor of the loop is the unrolling factor required by
1371          the SLP instances.  If that unrolling factor is 1, we say, that we
1372          perform pure SLP on loop - cross iteration parallelism is not
1373          exploited.  */
1374       for (i = 0; i < nbbs; i++)
1375         {
1376           basic_block bb = bbs[i];
1377           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1378             {
1379               gimple stmt = gsi_stmt (si);
1380               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1381               gcc_assert (stmt_info);
1382               if ((STMT_VINFO_RELEVANT_P (stmt_info)
1383                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1384                   && !PURE_SLP_STMT (stmt_info))
1385                 /* STMT needs both SLP and loop-based vectorization.  */
1386                 only_slp_in_loop = false;
1387             }
1388         }
1389
1390       if (only_slp_in_loop)
1391         vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1392       else
1393         vectorization_factor = least_common_multiple (vectorization_factor,
1394                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1395
1396       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1397       if (dump_enabled_p ())
1398         dump_printf_loc (MSG_NOTE, vect_location,
1399                          "Updating vectorization factor to %d\n",
1400                          vectorization_factor);
1401     }
1402
1403   for (i = 0; i < nbbs; i++)
1404     {
1405       basic_block bb = bbs[i];
1406
1407       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1408         {
1409           phi = gsi_stmt (si);
1410           ok = true;
1411
1412           stmt_info = vinfo_for_stmt (phi);
1413           if (dump_enabled_p ())
1414             {
1415               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1416               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1417               dump_printf (MSG_NOTE, "\n");
1418             }
1419
1420           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1421              (i.e., a phi in the tail of the outer-loop).  */
1422           if (! is_loop_header_bb_p (bb))
1423             {
1424               /* FORNOW: we currently don't support the case that these phis
1425                  are not used in the outerloop (unless it is double reduction,
1426                  i.e., this phi is vect_reduction_def), cause this case
1427                  requires to actually do something here.  */
1428               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1429                    || STMT_VINFO_LIVE_P (stmt_info))
1430                   && STMT_VINFO_DEF_TYPE (stmt_info)
1431                      != vect_double_reduction_def)
1432                 {
1433                   if (dump_enabled_p ())
1434                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1435                                      "Unsupported loop-closed phi in "
1436                                      "outer-loop.\n");
1437                   return false;
1438                 }
1439
1440               /* If PHI is used in the outer loop, we check that its operand
1441                  is defined in the inner loop.  */
1442               if (STMT_VINFO_RELEVANT_P (stmt_info))
1443                 {
1444                   tree phi_op;
1445                   gimple op_def_stmt;
1446
1447                   if (gimple_phi_num_args (phi) != 1)
1448                     return false;
1449
1450                   phi_op = PHI_ARG_DEF (phi, 0);
1451                   if (TREE_CODE (phi_op) != SSA_NAME)
1452                     return false;
1453
1454                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1455                   if (gimple_nop_p (op_def_stmt)
1456                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1457                       || !vinfo_for_stmt (op_def_stmt))
1458                     return false;
1459
1460                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1461                         != vect_used_in_outer
1462                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1463                            != vect_used_in_outer_by_reduction)
1464                     return false;
1465                 }
1466
1467               continue;
1468             }
1469
1470           gcc_assert (stmt_info);
1471
1472           if (STMT_VINFO_LIVE_P (stmt_info))
1473             {
1474               /* FORNOW: not yet supported.  */
1475               if (dump_enabled_p ())
1476                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1477                                  "not vectorized: value used after loop.\n");
1478               return false;
1479             }
1480
1481           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1482               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1483             {
1484               /* A scalar-dependence cycle that we don't support.  */
1485               if (dump_enabled_p ())
1486                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1487                                  "not vectorized: scalar dependence cycle.\n");
1488               return false;
1489             }
1490
1491           if (STMT_VINFO_RELEVANT_P (stmt_info))
1492             {
1493               need_to_vectorize = true;
1494               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1495                 ok = vectorizable_induction (phi, NULL, NULL);
1496             }
1497
1498           if (!ok)
1499             {
1500               if (dump_enabled_p ())
1501                 {
1502                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1503                                    "not vectorized: relevant phi not "
1504                                    "supported: ");
1505                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1506                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1507                 }
1508               return false;
1509             }
1510         }
1511
1512       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1513         {
1514           gimple stmt = gsi_stmt (si);
1515           if (!gimple_clobber_p (stmt)
1516               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1517             return false;
1518         }
1519     } /* bbs */
1520
1521   /* All operations in the loop are either irrelevant (deal with loop
1522      control, or dead), or only used outside the loop and can be moved
1523      out of the loop (e.g. invariants, inductions).  The loop can be
1524      optimized away by scalar optimizations.  We're better off not
1525      touching this loop.  */
1526   if (!need_to_vectorize)
1527     {
1528       if (dump_enabled_p ())
1529         dump_printf_loc (MSG_NOTE, vect_location,
1530                          "All the computation can be taken out of the loop.\n");
1531       if (dump_enabled_p ())
1532         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1533                          "not vectorized: redundant loop. no profit to "
1534                          "vectorize.\n");
1535       return false;
1536     }
1537
1538   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1539     dump_printf_loc (MSG_NOTE, vect_location,
1540                      "vectorization_factor = %d, niters = "
1541                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1542                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1543
1544   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1545        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1546       || ((max_niter = max_stmt_executions_int (loop)) != -1
1547           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1548     {
1549       if (dump_enabled_p ())
1550         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1551                          "not vectorized: iteration count too small.\n");
1552       if (dump_enabled_p ())
1553         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1554                          "not vectorized: iteration count smaller than "
1555                          "vectorization factor.\n");
1556       return false;
1557     }
1558
1559   /* Analyze cost.  Decide if worth while to vectorize.  */
1560
1561   /* Once VF is set, SLP costs should be updated since the number of created
1562      vector stmts depends on VF.  */
1563   vect_update_slp_costs_according_to_vf (loop_vinfo);
1564
1565   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1566                                       &min_profitable_estimate);
1567   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1568
1569   if (min_profitable_iters < 0)
1570     {
1571       if (dump_enabled_p ())
1572         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1573                          "not vectorized: vectorization not profitable.\n");
1574       if (dump_enabled_p ())
1575         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1576                          "not vectorized: vector version will never be "
1577                          "profitable.\n");
1578       return false;
1579     }
1580
1581   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1582                             * vectorization_factor) - 1);
1583
1584
1585   /* Use the cost model only if it is more conservative than user specified
1586      threshold.  */
1587
1588   th = (unsigned) min_scalar_loop_bound;
1589   if (min_profitable_iters
1590       && (!min_scalar_loop_bound
1591           || min_profitable_iters > min_scalar_loop_bound))
1592     th = (unsigned) min_profitable_iters;
1593
1594   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1595
1596   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1597       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1598     {
1599       if (dump_enabled_p ())
1600         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1601                          "not vectorized: vectorization not profitable.\n");
1602       if (dump_enabled_p ())
1603         dump_printf_loc (MSG_NOTE, vect_location,
1604                          "not vectorized: iteration count smaller than user "
1605                          "specified loop bound parameter or minimum profitable "
1606                          "iterations (whichever is more conservative).\n");
1607       return false;
1608     }
1609
1610   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1611       && ((unsigned HOST_WIDE_INT) estimated_niter
1612           <= MAX (th, (unsigned)min_profitable_estimate)))
1613     {
1614       if (dump_enabled_p ())
1615         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1616                          "not vectorized: estimated iteration count too "
1617                          "small.\n");
1618       if (dump_enabled_p ())
1619         dump_printf_loc (MSG_NOTE, vect_location,
1620                          "not vectorized: estimated iteration count smaller "
1621                          "than specified loop bound parameter or minimum "
1622                          "profitable iterations (whichever is more "
1623                          "conservative).\n");
1624       return false;
1625     }
1626
1627   return true;
1628 }
1629
1630
1631 /* Function vect_analyze_loop_2.
1632
1633    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1634    for it.  The different analyses will record information in the
1635    loop_vec_info struct.  */
1636 static bool
1637 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1638 {
1639   bool ok, slp = false;
1640   int max_vf = MAX_VECTORIZATION_FACTOR;
1641   int min_vf = 2;
1642   unsigned int th;
1643   unsigned int n_stmts = 0;
1644
1645   /* Find all data references in the loop (which correspond to vdefs/vuses)
1646      and analyze their evolution in the loop.  Also adjust the minimal
1647      vectorization factor according to the loads and stores.
1648
1649      FORNOW: Handle only simple, array references, which
1650      alignment can be forced, and aligned pointer-references.  */
1651
1652   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1653   if (!ok)
1654     {
1655       if (dump_enabled_p ())
1656         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1657                          "bad data references.\n");
1658       return false;
1659     }
1660
1661   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1662      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1663
1664   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1665   if (!ok)
1666     {
1667       if (dump_enabled_p ())
1668         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1669                          "bad data access.\n");
1670       return false;
1671     }
1672
1673   /* Classify all cross-iteration scalar data-flow cycles.
1674      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1675
1676   vect_analyze_scalar_cycles (loop_vinfo);
1677
1678   vect_pattern_recog (loop_vinfo, NULL);
1679
1680   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1681
1682   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1683   if (!ok)
1684     {
1685       if (dump_enabled_p ())
1686         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1687                          "unexpected pattern.\n");
1688       return false;
1689     }
1690
1691   /* Analyze data dependences between the data-refs in the loop
1692      and adjust the maximum vectorization factor according to
1693      the dependences.
1694      FORNOW: fail at the first data dependence that we encounter.  */
1695
1696   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1697   if (!ok
1698       || max_vf < min_vf)
1699     {
1700       if (dump_enabled_p ())
1701             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1702                              "bad data dependence.\n");
1703       return false;
1704     }
1705
1706   ok = vect_determine_vectorization_factor (loop_vinfo);
1707   if (!ok)
1708     {
1709       if (dump_enabled_p ())
1710         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1711                          "can't determine vectorization factor.\n");
1712       return false;
1713     }
1714   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1715     {
1716       if (dump_enabled_p ())
1717         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1718                          "bad data dependence.\n");
1719       return false;
1720     }
1721
1722   /* Analyze the alignment of the data-refs in the loop.
1723      Fail if a data reference is found that cannot be vectorized.  */
1724
1725   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1726   if (!ok)
1727     {
1728       if (dump_enabled_p ())
1729         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1730                          "bad data alignment.\n");
1731       return false;
1732     }
1733
1734   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1735      It is important to call pruning after vect_analyze_data_ref_accesses,
1736      since we use grouping information gathered by interleaving analysis.  */
1737   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1738   if (!ok)
1739     {
1740       if (dump_enabled_p ())
1741         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1742                          "number of versioning for alias "
1743                          "run-time tests exceeds %d "
1744                          "(--param vect-max-version-for-alias-checks)\n",
1745                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1746       return false;
1747     }
1748
1749   /* This pass will decide on using loop versioning and/or loop peeling in
1750      order to enhance the alignment of data references in the loop.  */
1751
1752   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1753   if (!ok)
1754     {
1755       if (dump_enabled_p ())
1756         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1757                          "bad data alignment.\n");
1758       return false;
1759     }
1760
1761   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1762   ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1763   if (ok)
1764     {
1765       /* Decide which possible SLP instances to SLP.  */
1766       slp = vect_make_slp_decision (loop_vinfo);
1767
1768       /* Find stmts that need to be both vectorized and SLPed.  */
1769       vect_detect_hybrid_slp (loop_vinfo);
1770     }
1771   else
1772     return false;
1773
1774   /* Scan all the operations in the loop and make sure they are
1775      vectorizable.  */
1776
1777   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1778   if (!ok)
1779     {
1780       if (dump_enabled_p ())
1781         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1782                          "bad operation or unsupported loop bound.\n");
1783       return false;
1784     }
1785
1786   /* Decide whether we need to create an epilogue loop to handle
1787      remaining scalar iterations.  */
1788   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1789         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1790        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1791
1792   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1793       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1794     {
1795       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1796                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1797           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1798         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1799     }
1800   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1801            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1802                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1803                /* In case of versioning, check if the maximum number of
1804                   iterations is greater than th.  If they are identical,
1805                   the epilogue is unnecessary.  */
1806                && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1807                     && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1808                    || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1809                         (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1810     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1811
1812   /* If an epilogue loop is required make sure we can create one.  */
1813   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1814       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1815     {
1816       if (dump_enabled_p ())
1817         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1818       if (!vect_can_advance_ivs_p (loop_vinfo)
1819           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1820                                            single_exit (LOOP_VINFO_LOOP
1821                                                          (loop_vinfo))))
1822         {
1823           if (dump_enabled_p ())
1824             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1825                              "not vectorized: can't create required "
1826                              "epilog loop\n");
1827           return false;
1828         }
1829     }
1830
1831   return true;
1832 }
1833
1834 /* Function vect_analyze_loop.
1835
1836    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1837    for it.  The different analyses will record information in the
1838    loop_vec_info struct.  */
1839 loop_vec_info
1840 vect_analyze_loop (struct loop *loop)
1841 {
1842   loop_vec_info loop_vinfo;
1843   unsigned int vector_sizes;
1844
1845   /* Autodetect first vector size we try.  */
1846   current_vector_size = 0;
1847   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1848
1849   if (dump_enabled_p ())
1850     dump_printf_loc (MSG_NOTE, vect_location,
1851                      "===== analyze_loop_nest =====\n");
1852
1853   if (loop_outer (loop)
1854       && loop_vec_info_for_loop (loop_outer (loop))
1855       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1856     {
1857       if (dump_enabled_p ())
1858         dump_printf_loc (MSG_NOTE, vect_location,
1859                          "outer-loop already vectorized.\n");
1860       return NULL;
1861     }
1862
1863   while (1)
1864     {
1865       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1866       loop_vinfo = vect_analyze_loop_form (loop);
1867       if (!loop_vinfo)
1868         {
1869           if (dump_enabled_p ())
1870             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1871                              "bad loop form.\n");
1872           return NULL;
1873         }
1874
1875       if (vect_analyze_loop_2 (loop_vinfo))
1876         {
1877           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1878
1879           return loop_vinfo;
1880         }
1881
1882       destroy_loop_vec_info (loop_vinfo, true);
1883
1884       vector_sizes &= ~current_vector_size;
1885       if (vector_sizes == 0
1886           || current_vector_size == 0)
1887         return NULL;
1888
1889       /* Try the next biggest vector size.  */
1890       current_vector_size = 1 << floor_log2 (vector_sizes);
1891       if (dump_enabled_p ())
1892         dump_printf_loc (MSG_NOTE, vect_location,
1893                          "***** Re-trying analysis with "
1894                          "vector size %d\n", current_vector_size);
1895     }
1896 }
1897
1898
1899 /* Function reduction_code_for_scalar_code
1900
1901    Input:
1902    CODE - tree_code of a reduction operations.
1903
1904    Output:
1905    REDUC_CODE - the corresponding tree-code to be used to reduce the
1906       vector of partial results into a single scalar result, or ERROR_MARK
1907       if the operation is a supported reduction operation, but does not have
1908       such a tree-code.
1909
1910    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1911
1912 static bool
1913 reduction_code_for_scalar_code (enum tree_code code,
1914                                 enum tree_code *reduc_code)
1915 {
1916   switch (code)
1917     {
1918       case MAX_EXPR:
1919         *reduc_code = REDUC_MAX_EXPR;
1920         return true;
1921
1922       case MIN_EXPR:
1923         *reduc_code = REDUC_MIN_EXPR;
1924         return true;
1925
1926       case PLUS_EXPR:
1927         *reduc_code = REDUC_PLUS_EXPR;
1928         return true;
1929
1930       case MULT_EXPR:
1931       case MINUS_EXPR:
1932       case BIT_IOR_EXPR:
1933       case BIT_XOR_EXPR:
1934       case BIT_AND_EXPR:
1935         *reduc_code = ERROR_MARK;
1936         return true;
1937
1938       default:
1939        return false;
1940     }
1941 }
1942
1943
1944 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1945    STMT is printed with a message MSG. */
1946
1947 static void
1948 report_vect_op (int msg_type, gimple stmt, const char *msg)
1949 {
1950   dump_printf_loc (msg_type, vect_location, "%s", msg);
1951   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1952   dump_printf (msg_type, "\n");
1953 }
1954
1955
1956 /* Detect SLP reduction of the form:
1957
1958    #a1 = phi <a5, a0>
1959    a2 = operation (a1)
1960    a3 = operation (a2)
1961    a4 = operation (a3)
1962    a5 = operation (a4)
1963
1964    #a = phi <a5>
1965
1966    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1967    FIRST_STMT is the first reduction stmt in the chain
1968    (a2 = operation (a1)).
1969
1970    Return TRUE if a reduction chain was detected.  */
1971
1972 static bool
1973 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1974 {
1975   struct loop *loop = (gimple_bb (phi))->loop_father;
1976   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1977   enum tree_code code;
1978   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1979   stmt_vec_info use_stmt_info, current_stmt_info;
1980   tree lhs;
1981   imm_use_iterator imm_iter;
1982   use_operand_p use_p;
1983   int nloop_uses, size = 0, n_out_of_loop_uses;
1984   bool found = false;
1985
1986   if (loop != vect_loop)
1987     return false;
1988
1989   lhs = PHI_RESULT (phi);
1990   code = gimple_assign_rhs_code (first_stmt);
1991   while (1)
1992     {
1993       nloop_uses = 0;
1994       n_out_of_loop_uses = 0;
1995       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
1996         {
1997           gimple use_stmt = USE_STMT (use_p);
1998           if (is_gimple_debug (use_stmt))
1999             continue;
2000
2001           /* Check if we got back to the reduction phi.  */
2002           if (use_stmt == phi)
2003             {
2004               loop_use_stmt = use_stmt;
2005               found = true;
2006               break;
2007             }
2008
2009           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2010             {
2011               if (vinfo_for_stmt (use_stmt)
2012                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
2013                 {
2014                   loop_use_stmt = use_stmt;
2015                   nloop_uses++;
2016                 }
2017             }
2018            else
2019              n_out_of_loop_uses++;
2020
2021            /* There are can be either a single use in the loop or two uses in
2022               phi nodes.  */
2023            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2024              return false;
2025         }
2026
2027       if (found)
2028         break;
2029
2030       /* We reached a statement with no loop uses.  */
2031       if (nloop_uses == 0)
2032         return false;
2033
2034       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2035       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2036         return false;
2037
2038       if (!is_gimple_assign (loop_use_stmt)
2039           || code != gimple_assign_rhs_code (loop_use_stmt)
2040           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2041         return false;
2042
2043       /* Insert USE_STMT into reduction chain.  */
2044       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2045       if (current_stmt)
2046         {
2047           current_stmt_info = vinfo_for_stmt (current_stmt);
2048           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2049           GROUP_FIRST_ELEMENT (use_stmt_info)
2050             = GROUP_FIRST_ELEMENT (current_stmt_info);
2051         }
2052       else
2053         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2054
2055       lhs = gimple_assign_lhs (loop_use_stmt);
2056       current_stmt = loop_use_stmt;
2057       size++;
2058    }
2059
2060   if (!found || loop_use_stmt != phi || size < 2)
2061     return false;
2062
2063   /* Swap the operands, if needed, to make the reduction operand be the second
2064      operand.  */
2065   lhs = PHI_RESULT (phi);
2066   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2067   while (next_stmt)
2068     {
2069       if (gimple_assign_rhs2 (next_stmt) == lhs)
2070         {
2071           tree op = gimple_assign_rhs1 (next_stmt);
2072           gimple def_stmt = NULL;
2073
2074           if (TREE_CODE (op) == SSA_NAME)
2075             def_stmt = SSA_NAME_DEF_STMT (op);
2076
2077           /* Check that the other def is either defined in the loop
2078              ("vect_internal_def"), or it's an induction (defined by a
2079              loop-header phi-node).  */
2080           if (def_stmt
2081               && gimple_bb (def_stmt)
2082               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2083               && (is_gimple_assign (def_stmt)
2084                   || is_gimple_call (def_stmt)
2085                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2086                            == vect_induction_def
2087                   || (gimple_code (def_stmt) == GIMPLE_PHI
2088                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2089                                   == vect_internal_def
2090                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2091             {
2092               lhs = gimple_assign_lhs (next_stmt);
2093               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2094               continue;
2095             }
2096
2097           return false;
2098         }
2099       else
2100         {
2101           tree op = gimple_assign_rhs2 (next_stmt);
2102           gimple def_stmt = NULL;
2103
2104           if (TREE_CODE (op) == SSA_NAME)
2105             def_stmt = SSA_NAME_DEF_STMT (op);
2106
2107           /* Check that the other def is either defined in the loop
2108             ("vect_internal_def"), or it's an induction (defined by a
2109             loop-header phi-node).  */
2110           if (def_stmt
2111               && gimple_bb (def_stmt)
2112               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2113               && (is_gimple_assign (def_stmt)
2114                   || is_gimple_call (def_stmt)
2115                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2116                               == vect_induction_def
2117                   || (gimple_code (def_stmt) == GIMPLE_PHI
2118                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2119                                   == vect_internal_def
2120                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2121             {
2122               if (dump_enabled_p ())
2123                 {
2124                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2125                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2126                   dump_printf (MSG_NOTE, "\n");
2127                 }
2128
2129               swap_ssa_operands (next_stmt,
2130                                  gimple_assign_rhs1_ptr (next_stmt),
2131                                  gimple_assign_rhs2_ptr (next_stmt));
2132               update_stmt (next_stmt);
2133
2134               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2135                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2136             }
2137           else
2138             return false;
2139         }
2140
2141       lhs = gimple_assign_lhs (next_stmt);
2142       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2143     }
2144
2145   /* Save the chain for further analysis in SLP detection.  */
2146   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2147   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2148   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2149
2150   return true;
2151 }
2152
2153
2154 /* Function vect_is_simple_reduction_1
2155
2156    (1) Detect a cross-iteration def-use cycle that represents a simple
2157    reduction computation.  We look for the following pattern:
2158
2159    loop_header:
2160      a1 = phi < a0, a2 >
2161      a3 = ...
2162      a2 = operation (a3, a1)
2163
2164    or
2165
2166    a3 = ...
2167    loop_header:
2168      a1 = phi < a0, a2 >
2169      a2 = operation (a3, a1)
2170
2171    such that:
2172    1. operation is commutative and associative and it is safe to
2173       change the order of the computation (if CHECK_REDUCTION is true)
2174    2. no uses for a2 in the loop (a2 is used out of the loop)
2175    3. no uses of a1 in the loop besides the reduction operation
2176    4. no uses of a1 outside the loop.
2177
2178    Conditions 1,4 are tested here.
2179    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2180
2181    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2182    nested cycles, if CHECK_REDUCTION is false.
2183
2184    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2185    reductions:
2186
2187      a1 = phi < a0, a2 >
2188      inner loop (def of a3)
2189      a2 = phi < a3 >
2190
2191    If MODIFY is true it tries also to rework the code in-place to enable
2192    detection of more reduction patterns.  For the time being we rewrite
2193    "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2194 */
2195
2196 static gimple
2197 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2198                             bool check_reduction, bool *double_reduc,
2199                             bool modify)
2200 {
2201   struct loop *loop = (gimple_bb (phi))->loop_father;
2202   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2203   edge latch_e = loop_latch_edge (loop);
2204   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2205   gimple def_stmt, def1 = NULL, def2 = NULL;
2206   enum tree_code orig_code, code;
2207   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2208   tree type;
2209   int nloop_uses;
2210   tree name;
2211   imm_use_iterator imm_iter;
2212   use_operand_p use_p;
2213   bool phi_def;
2214
2215   *double_reduc = false;
2216
2217   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2218      otherwise, we assume outer loop vectorization.  */
2219   gcc_assert ((check_reduction && loop == vect_loop)
2220               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2221
2222   name = PHI_RESULT (phi);
2223   /* ???  If there are no uses of the PHI result the inner loop reduction
2224      won't be detected as possibly double-reduction by vectorizable_reduction
2225      because that tries to walk the PHI arg from the preheader edge which
2226      can be constant.  See PR60382.  */
2227   if (has_zero_uses (name))
2228     return NULL;
2229   nloop_uses = 0;
2230   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2231     {
2232       gimple use_stmt = USE_STMT (use_p);
2233       if (is_gimple_debug (use_stmt))
2234         continue;
2235
2236       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2237         {
2238           if (dump_enabled_p ())
2239             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2240                              "intermediate value used outside loop.\n");
2241
2242           return NULL;
2243         }
2244
2245       if (vinfo_for_stmt (use_stmt)
2246           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2247         nloop_uses++;
2248       if (nloop_uses > 1)
2249         {
2250           if (dump_enabled_p ())
2251             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2252                              "reduction used in loop.\n");
2253           return NULL;
2254         }
2255     }
2256
2257   if (TREE_CODE (loop_arg) != SSA_NAME)
2258     {
2259       if (dump_enabled_p ())
2260         {
2261           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2262                            "reduction: not ssa_name: ");
2263           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2264           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2265         }
2266       return NULL;
2267     }
2268
2269   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2270   if (!def_stmt)
2271     {
2272       if (dump_enabled_p ())
2273         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2274                          "reduction: no def_stmt.\n");
2275       return NULL;
2276     }
2277
2278   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2279     {
2280       if (dump_enabled_p ())
2281         {
2282           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2283           dump_printf (MSG_NOTE, "\n");
2284         }
2285       return NULL;
2286     }
2287
2288   if (is_gimple_assign (def_stmt))
2289     {
2290       name = gimple_assign_lhs (def_stmt);
2291       phi_def = false;
2292     }
2293   else
2294     {
2295       name = PHI_RESULT (def_stmt);
2296       phi_def = true;
2297     }
2298
2299   nloop_uses = 0;
2300   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2301     {
2302       gimple use_stmt = USE_STMT (use_p);
2303       if (is_gimple_debug (use_stmt))
2304         continue;
2305       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2306           && vinfo_for_stmt (use_stmt)
2307           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2308         nloop_uses++;
2309       if (nloop_uses > 1)
2310         {
2311           if (dump_enabled_p ())
2312             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2313                              "reduction used in loop.\n");
2314           return NULL;
2315         }
2316     }
2317
2318   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2319      defined in the inner loop.  */
2320   if (phi_def)
2321     {
2322       op1 = PHI_ARG_DEF (def_stmt, 0);
2323
2324       if (gimple_phi_num_args (def_stmt) != 1
2325           || TREE_CODE (op1) != SSA_NAME)
2326         {
2327           if (dump_enabled_p ())
2328             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2329                              "unsupported phi node definition.\n");
2330
2331           return NULL;
2332         }
2333
2334       def1 = SSA_NAME_DEF_STMT (op1);
2335       if (gimple_bb (def1)
2336           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2337           && loop->inner
2338           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2339           && is_gimple_assign (def1))
2340         {
2341           if (dump_enabled_p ())
2342             report_vect_op (MSG_NOTE, def_stmt,
2343                             "detected double reduction: ");
2344
2345           *double_reduc = true;
2346           return def_stmt;
2347         }
2348
2349       return NULL;
2350     }
2351
2352   code = orig_code = gimple_assign_rhs_code (def_stmt);
2353
2354   /* We can handle "res -= x[i]", which is non-associative by
2355      simply rewriting this into "res += -x[i]".  Avoid changing
2356      gimple instruction for the first simple tests and only do this
2357      if we're allowed to change code at all.  */
2358   if (code == MINUS_EXPR
2359       && modify
2360       && (op1 = gimple_assign_rhs1 (def_stmt))
2361       && TREE_CODE (op1) == SSA_NAME
2362       && SSA_NAME_DEF_STMT (op1) == phi)
2363     code = PLUS_EXPR;
2364
2365   if (check_reduction
2366       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2367     {
2368       if (dump_enabled_p ())
2369         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2370                         "reduction: not commutative/associative: ");
2371       return NULL;
2372     }
2373
2374   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2375     {
2376       if (code != COND_EXPR)
2377         {
2378           if (dump_enabled_p ())
2379             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2380                             "reduction: not binary operation: ");
2381
2382           return NULL;
2383         }
2384
2385       op3 = gimple_assign_rhs1 (def_stmt);
2386       if (COMPARISON_CLASS_P (op3))
2387         {
2388           op4 = TREE_OPERAND (op3, 1);
2389           op3 = TREE_OPERAND (op3, 0);
2390         }
2391
2392       op1 = gimple_assign_rhs2 (def_stmt);
2393       op2 = gimple_assign_rhs3 (def_stmt);
2394
2395       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2396         {
2397           if (dump_enabled_p ())
2398             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2399                             "reduction: uses not ssa_names: ");
2400
2401           return NULL;
2402         }
2403     }
2404   else
2405     {
2406       op1 = gimple_assign_rhs1 (def_stmt);
2407       op2 = gimple_assign_rhs2 (def_stmt);
2408
2409       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2410         {
2411           if (dump_enabled_p ())
2412             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2413                             "reduction: uses not ssa_names: ");
2414
2415           return NULL;
2416         }
2417    }
2418
2419   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2420   if ((TREE_CODE (op1) == SSA_NAME
2421        && !types_compatible_p (type,TREE_TYPE (op1)))
2422       || (TREE_CODE (op2) == SSA_NAME
2423           && !types_compatible_p (type, TREE_TYPE (op2)))
2424       || (op3 && TREE_CODE (op3) == SSA_NAME
2425           && !types_compatible_p (type, TREE_TYPE (op3)))
2426       || (op4 && TREE_CODE (op4) == SSA_NAME
2427           && !types_compatible_p (type, TREE_TYPE (op4))))
2428     {
2429       if (dump_enabled_p ())
2430         {
2431           dump_printf_loc (MSG_NOTE, vect_location,
2432                            "reduction: multiple types: operation type: ");
2433           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2434           dump_printf (MSG_NOTE, ", operands types: ");
2435           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2436                              TREE_TYPE (op1));
2437           dump_printf (MSG_NOTE, ",");
2438           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2439                              TREE_TYPE (op2));
2440           if (op3)
2441             {
2442               dump_printf (MSG_NOTE, ",");
2443               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2444                                  TREE_TYPE (op3));
2445             }
2446
2447           if (op4)
2448             {
2449               dump_printf (MSG_NOTE, ",");
2450               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2451                                  TREE_TYPE (op4));
2452             }
2453           dump_printf (MSG_NOTE, "\n");
2454         }
2455
2456       return NULL;
2457     }
2458
2459   /* Check that it's ok to change the order of the computation.
2460      Generally, when vectorizing a reduction we change the order of the
2461      computation.  This may change the behavior of the program in some
2462      cases, so we need to check that this is ok.  One exception is when
2463      vectorizing an outer-loop: the inner-loop is executed sequentially,
2464      and therefore vectorizing reductions in the inner-loop during
2465      outer-loop vectorization is safe.  */
2466
2467   /* CHECKME: check for !flag_finite_math_only too?  */
2468   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2469       && check_reduction)
2470     {
2471       /* Changing the order of operations changes the semantics.  */
2472       if (dump_enabled_p ())
2473         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2474                         "reduction: unsafe fp math optimization: ");
2475       return NULL;
2476     }
2477   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2478            && check_reduction)
2479     {
2480       /* Changing the order of operations changes the semantics.  */
2481       if (dump_enabled_p ())
2482         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2483                         "reduction: unsafe int math optimization: ");
2484       return NULL;
2485     }
2486   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2487     {
2488       /* Changing the order of operations changes the semantics.  */
2489       if (dump_enabled_p ())
2490         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2491                         "reduction: unsafe fixed-point math optimization: ");
2492       return NULL;
2493     }
2494
2495   /* If we detected "res -= x[i]" earlier, rewrite it into
2496      "res += -x[i]" now.  If this turns out to be useless reassoc
2497      will clean it up again.  */
2498   if (orig_code == MINUS_EXPR)
2499     {
2500       tree rhs = gimple_assign_rhs2 (def_stmt);
2501       tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2502       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2503                                                          rhs, NULL);
2504       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2505       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2506                                                           loop_info, NULL));
2507       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2508       gimple_assign_set_rhs2 (def_stmt, negrhs);
2509       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2510       update_stmt (def_stmt);
2511     }
2512
2513   /* Reduction is safe. We're dealing with one of the following:
2514      1) integer arithmetic and no trapv
2515      2) floating point arithmetic, and special flags permit this optimization
2516      3) nested cycle (i.e., outer loop vectorization).  */
2517   if (TREE_CODE (op1) == SSA_NAME)
2518     def1 = SSA_NAME_DEF_STMT (op1);
2519
2520   if (TREE_CODE (op2) == SSA_NAME)
2521     def2 = SSA_NAME_DEF_STMT (op2);
2522
2523   if (code != COND_EXPR
2524       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2525     {
2526       if (dump_enabled_p ())
2527         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2528       return NULL;
2529     }
2530
2531   /* Check that one def is the reduction def, defined by PHI,
2532      the other def is either defined in the loop ("vect_internal_def"),
2533      or it's an induction (defined by a loop-header phi-node).  */
2534
2535   if (def2 && def2 == phi
2536       && (code == COND_EXPR
2537           || !def1 || gimple_nop_p (def1)
2538           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2539           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2540               && (is_gimple_assign (def1)
2541                   || is_gimple_call (def1)
2542                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2543                       == vect_induction_def
2544                   || (gimple_code (def1) == GIMPLE_PHI
2545                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2546                           == vect_internal_def
2547                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2548     {
2549       if (dump_enabled_p ())
2550         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2551       return def_stmt;
2552     }
2553
2554   if (def1 && def1 == phi
2555       && (code == COND_EXPR
2556           || !def2 || gimple_nop_p (def2)
2557           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2558           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2559               && (is_gimple_assign (def2)
2560                   || is_gimple_call (def2)
2561                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2562                       == vect_induction_def
2563                   || (gimple_code (def2) == GIMPLE_PHI
2564                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2565                           == vect_internal_def
2566                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2567     {
2568       if (check_reduction)
2569         {
2570           /* Swap operands (just for simplicity - so that the rest of the code
2571              can assume that the reduction variable is always the last (second)
2572              argument).  */
2573           if (dump_enabled_p ())
2574             report_vect_op (MSG_NOTE, def_stmt,
2575                             "detected reduction: need to swap operands: ");
2576
2577           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2578                              gimple_assign_rhs2_ptr (def_stmt));
2579
2580           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2581             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2582         }
2583       else
2584         {
2585           if (dump_enabled_p ())
2586             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2587         }
2588
2589       return def_stmt;
2590     }
2591
2592   /* Try to find SLP reduction chain.  */
2593   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2594     {
2595       if (dump_enabled_p ())
2596         report_vect_op (MSG_NOTE, def_stmt,
2597                         "reduction: detected reduction chain: ");
2598
2599       return def_stmt;
2600     }
2601
2602   if (dump_enabled_p ())
2603     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2604                     "reduction: unknown pattern: ");
2605
2606   return NULL;
2607 }
2608
2609 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2610    in-place.  Arguments as there.  */
2611
2612 static gimple
2613 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2614                           bool check_reduction, bool *double_reduc)
2615 {
2616   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2617                                      double_reduc, false);
2618 }
2619
2620 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2621    in-place if it enables detection of more reductions.  Arguments
2622    as there.  */
2623
2624 gimple
2625 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2626                           bool check_reduction, bool *double_reduc)
2627 {
2628   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2629                                      double_reduc, true);
2630 }
2631
2632 /* Calculate the cost of one scalar iteration of the loop.  */
2633 int
2634 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2635 {
2636   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2637   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2638   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2639   int innerloop_iters, i, stmt_cost;
2640
2641   /* Count statements in scalar loop.  Using this as scalar cost for a single
2642      iteration for now.
2643
2644      TODO: Add outer loop support.
2645
2646      TODO: Consider assigning different costs to different scalar
2647      statements.  */
2648
2649   /* FORNOW.  */
2650   innerloop_iters = 1;
2651   if (loop->inner)
2652     innerloop_iters = 50; /* FIXME */
2653
2654   for (i = 0; i < nbbs; i++)
2655     {
2656       gimple_stmt_iterator si;
2657       basic_block bb = bbs[i];
2658
2659       if (bb->loop_father == loop->inner)
2660         factor = innerloop_iters;
2661       else
2662         factor = 1;
2663
2664       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2665         {
2666           gimple stmt = gsi_stmt (si);
2667           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2668
2669           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2670             continue;
2671
2672           /* Skip stmts that are not vectorized inside the loop.  */
2673           if (stmt_info
2674               && !STMT_VINFO_RELEVANT_P (stmt_info)
2675               && (!STMT_VINFO_LIVE_P (stmt_info)
2676                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2677               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2678             continue;
2679
2680           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2681             {
2682               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2683                stmt_cost = vect_get_stmt_cost (scalar_load);
2684              else
2685                stmt_cost = vect_get_stmt_cost (scalar_store);
2686             }
2687           else
2688             stmt_cost = vect_get_stmt_cost (scalar_stmt);
2689
2690           scalar_single_iter_cost += stmt_cost * factor;
2691         }
2692     }
2693   return scalar_single_iter_cost;
2694 }
2695
2696 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2697 int
2698 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2699                              int *peel_iters_epilogue,
2700                              int scalar_single_iter_cost,
2701                              stmt_vector_for_cost *prologue_cost_vec,
2702                              stmt_vector_for_cost *epilogue_cost_vec)
2703 {
2704   int retval = 0;
2705   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2706
2707   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2708     {
2709       *peel_iters_epilogue = vf/2;
2710       if (dump_enabled_p ())
2711         dump_printf_loc (MSG_NOTE, vect_location,
2712                          "cost model: epilogue peel iters set to vf/2 "
2713                          "because loop iterations are unknown .\n");
2714
2715       /* If peeled iterations are known but number of scalar loop
2716          iterations are unknown, count a taken branch per peeled loop.  */
2717       retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2718                                  NULL, 0, vect_prologue);
2719     }
2720   else
2721     {
2722       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2723       peel_iters_prologue = niters < peel_iters_prologue ?
2724                             niters : peel_iters_prologue;
2725       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2726       /* If we need to peel for gaps, but no peeling is required, we have to
2727          peel VF iterations.  */
2728       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2729         *peel_iters_epilogue = vf;
2730     }
2731
2732   if (peel_iters_prologue)
2733     retval += record_stmt_cost (prologue_cost_vec,
2734                                 peel_iters_prologue * scalar_single_iter_cost,
2735                                 scalar_stmt, NULL, 0, vect_prologue);
2736   if (*peel_iters_epilogue)
2737     retval += record_stmt_cost (epilogue_cost_vec,
2738                                 *peel_iters_epilogue * scalar_single_iter_cost,
2739                                 scalar_stmt, NULL, 0, vect_epilogue);
2740   return retval;
2741 }
2742
2743 /* Function vect_estimate_min_profitable_iters
2744
2745    Return the number of iterations required for the vector version of the
2746    loop to be profitable relative to the cost of the scalar version of the
2747    loop.  */
2748
2749 static void
2750 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2751                                     int *ret_min_profitable_niters,
2752                                     int *ret_min_profitable_estimate)
2753 {
2754   int min_profitable_iters;
2755   int min_profitable_estimate;
2756   int peel_iters_prologue;
2757   int peel_iters_epilogue;
2758   unsigned vec_inside_cost = 0;
2759   int vec_outside_cost = 0;
2760   unsigned vec_prologue_cost = 0;
2761   unsigned vec_epilogue_cost = 0;
2762   int scalar_single_iter_cost = 0;
2763   int scalar_outside_cost = 0;
2764   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2765   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2766   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2767
2768   /* Cost model disabled.  */
2769   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2770     {
2771       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2772       *ret_min_profitable_niters = 0;
2773       *ret_min_profitable_estimate = 0;
2774       return;
2775     }
2776
2777   /* Requires loop versioning tests to handle misalignment.  */
2778   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2779     {
2780       /*  FIXME: Make cost depend on complexity of individual check.  */
2781       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2782       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2783                             vect_prologue);
2784       dump_printf (MSG_NOTE,
2785                    "cost model: Adding cost of checks for loop "
2786                    "versioning to treat misalignment.\n");
2787     }
2788
2789   /* Requires loop versioning with alias checks.  */
2790   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2791     {
2792       /*  FIXME: Make cost depend on complexity of individual check.  */
2793       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2794       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2795                             vect_prologue);
2796       dump_printf (MSG_NOTE,
2797                    "cost model: Adding cost of checks for loop "
2798                    "versioning aliasing.\n");
2799     }
2800
2801   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2802       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2803     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2804                           vect_prologue);
2805
2806   /* Count statements in scalar loop.  Using this as scalar cost for a single
2807      iteration for now.
2808
2809      TODO: Add outer loop support.
2810
2811      TODO: Consider assigning different costs to different scalar
2812      statements.  */
2813
2814   scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2815
2816   /* Add additional cost for the peeled instructions in prologue and epilogue
2817      loop.
2818
2819      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2820      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2821
2822      TODO: Build an expression that represents peel_iters for prologue and
2823      epilogue to be used in a run-time test.  */
2824
2825   if (npeel  < 0)
2826     {
2827       peel_iters_prologue = vf/2;
2828       dump_printf (MSG_NOTE, "cost model: "
2829                    "prologue peel iters set to vf/2.\n");
2830
2831       /* If peeling for alignment is unknown, loop bound of main loop becomes
2832          unknown.  */
2833       peel_iters_epilogue = vf/2;
2834       dump_printf (MSG_NOTE, "cost model: "
2835                    "epilogue peel iters set to vf/2 because "
2836                    "peeling for alignment is unknown.\n");
2837
2838       /* If peeled iterations are unknown, count a taken branch and a not taken
2839          branch per peeled loop. Even if scalar loop iterations are known,
2840          vector iterations are not known since peeled prologue iterations are
2841          not known. Hence guards remain the same.  */
2842       (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2843                             NULL, 0, vect_prologue);
2844       (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2845                             NULL, 0, vect_prologue);
2846       /* FORNOW: Don't attempt to pass individual scalar instructions to
2847          the model; just assume linear cost for scalar iterations.  */
2848       (void) add_stmt_cost (target_cost_data,
2849                             peel_iters_prologue * scalar_single_iter_cost,
2850                             scalar_stmt, NULL, 0, vect_prologue);
2851       (void) add_stmt_cost (target_cost_data,
2852                             peel_iters_epilogue * scalar_single_iter_cost,
2853                             scalar_stmt, NULL, 0, vect_epilogue);
2854     }
2855   else
2856     {
2857       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2858       stmt_info_for_cost *si;
2859       int j;
2860       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2861
2862       prologue_cost_vec.create (2);
2863       epilogue_cost_vec.create (2);
2864       peel_iters_prologue = npeel;
2865
2866       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2867                                           &peel_iters_epilogue,
2868                                           scalar_single_iter_cost,
2869                                           &prologue_cost_vec,
2870                                           &epilogue_cost_vec);
2871
2872       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2873         {
2874           struct _stmt_vec_info *stmt_info
2875             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2876           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2877                                 si->misalign, vect_prologue);
2878         }
2879
2880       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2881         {
2882           struct _stmt_vec_info *stmt_info
2883             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2884           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2885                                 si->misalign, vect_epilogue);
2886         }
2887
2888       prologue_cost_vec.release ();
2889       epilogue_cost_vec.release ();
2890     }
2891
2892   /* FORNOW: The scalar outside cost is incremented in one of the
2893      following ways:
2894
2895      1. The vectorizer checks for alignment and aliasing and generates
2896      a condition that allows dynamic vectorization.  A cost model
2897      check is ANDED with the versioning condition.  Hence scalar code
2898      path now has the added cost of the versioning check.
2899
2900        if (cost > th & versioning_check)
2901          jmp to vector code
2902
2903      Hence run-time scalar is incremented by not-taken branch cost.
2904
2905      2. The vectorizer then checks if a prologue is required.  If the
2906      cost model check was not done before during versioning, it has to
2907      be done before the prologue check.
2908
2909        if (cost <= th)
2910          prologue = scalar_iters
2911        if (prologue == 0)
2912          jmp to vector code
2913        else
2914          execute prologue
2915        if (prologue == num_iters)
2916          go to exit
2917
2918      Hence the run-time scalar cost is incremented by a taken branch,
2919      plus a not-taken branch, plus a taken branch cost.
2920
2921      3. The vectorizer then checks if an epilogue is required.  If the
2922      cost model check was not done before during prologue check, it
2923      has to be done with the epilogue check.
2924
2925        if (prologue == 0)
2926          jmp to vector code
2927        else
2928          execute prologue
2929        if (prologue == num_iters)
2930          go to exit
2931        vector code:
2932          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2933            jmp to epilogue
2934
2935      Hence the run-time scalar cost should be incremented by 2 taken
2936      branches.
2937
2938      TODO: The back end may reorder the BBS's differently and reverse
2939      conditions/branch directions.  Change the estimates below to
2940      something more reasonable.  */
2941
2942   /* If the number of iterations is known and we do not do versioning, we can
2943      decide whether to vectorize at compile time.  Hence the scalar version
2944      do not carry cost model guard costs.  */
2945   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2946       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2947       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2948     {
2949       /* Cost model check occurs at versioning.  */
2950       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2951           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2952         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2953       else
2954         {
2955           /* Cost model check occurs at prologue generation.  */
2956           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2957             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2958               + vect_get_stmt_cost (cond_branch_not_taken);
2959           /* Cost model check occurs at epilogue generation.  */
2960           else
2961             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2962         }
2963     }
2964
2965   /* Complete the target-specific cost calculations.  */
2966   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2967                &vec_inside_cost, &vec_epilogue_cost);
2968
2969   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2970
2971   /* Calculate number of iterations required to make the vector version
2972      profitable, relative to the loop bodies only.  The following condition
2973      must hold true:
2974      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2975      where
2976      SIC = scalar iteration cost, VIC = vector iteration cost,
2977      VOC = vector outside cost, VF = vectorization factor,
2978      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2979      SOC = scalar outside cost for run time cost model check.  */
2980
2981   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2982     {
2983       if (vec_outside_cost <= 0)
2984         min_profitable_iters = 1;
2985       else
2986         {
2987           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2988                                   - vec_inside_cost * peel_iters_prologue
2989                                   - vec_inside_cost * peel_iters_epilogue)
2990                                  / ((scalar_single_iter_cost * vf)
2991                                     - vec_inside_cost);
2992
2993           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2994               <= (((int) vec_inside_cost * min_profitable_iters)
2995                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2996             min_profitable_iters++;
2997         }
2998     }
2999   /* vector version will never be profitable.  */
3000   else
3001     {
3002       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3003         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3004                     "did not happen for a simd loop");
3005
3006       if (dump_enabled_p ())
3007         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3008                          "cost model: the vector iteration cost = %d "
3009                          "divided by the scalar iteration cost = %d "
3010                          "is greater or equal to the vectorization factor = %d"
3011                          ".\n",
3012                          vec_inside_cost, scalar_single_iter_cost, vf);
3013       *ret_min_profitable_niters = -1;
3014       *ret_min_profitable_estimate = -1;
3015       return;
3016     }
3017
3018   if (dump_enabled_p ())
3019     {
3020       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3021       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3022                    vec_inside_cost);
3023       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3024                    vec_prologue_cost);
3025       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3026                    vec_epilogue_cost);
3027       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3028                    scalar_single_iter_cost);
3029       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3030                    scalar_outside_cost);
3031       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3032                    vec_outside_cost);
3033       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3034                    peel_iters_prologue);
3035       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3036                    peel_iters_epilogue);
3037       dump_printf (MSG_NOTE,
3038                    "  Calculated minimum iters for profitability: %d\n",
3039                    min_profitable_iters);
3040       dump_printf (MSG_NOTE, "\n");
3041     }
3042
3043   min_profitable_iters =
3044         min_profitable_iters < vf ? vf : min_profitable_iters;
3045
3046   /* Because the condition we create is:
3047      if (niters <= min_profitable_iters)
3048        then skip the vectorized loop.  */
3049   min_profitable_iters--;
3050
3051   if (dump_enabled_p ())
3052     dump_printf_loc (MSG_NOTE, vect_location,
3053                      "  Runtime profitability threshold = %d\n",
3054                      min_profitable_iters);
3055
3056   *ret_min_profitable_niters = min_profitable_iters;
3057
3058   /* Calculate number of iterations required to make the vector version
3059      profitable, relative to the loop bodies only.
3060
3061      Non-vectorized variant is SIC * niters and it must win over vector
3062      variant on the expected loop trip count.  The following condition must hold true:
3063      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3064
3065   if (vec_outside_cost <= 0)
3066     min_profitable_estimate = 1;
3067   else
3068     {
3069       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3070                                  - vec_inside_cost * peel_iters_prologue
3071                                  - vec_inside_cost * peel_iters_epilogue)
3072                                  / ((scalar_single_iter_cost * vf)
3073                                    - vec_inside_cost);
3074     }
3075   min_profitable_estimate --;
3076   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3077   if (dump_enabled_p ())
3078     dump_printf_loc (MSG_NOTE, vect_location,
3079                      "  Static estimate profitability threshold = %d\n",
3080                       min_profitable_iters);
3081
3082   *ret_min_profitable_estimate = min_profitable_estimate;
3083 }
3084
3085
3086 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3087    functions. Design better to avoid maintenance issues.  */
3088
3089 /* Function vect_model_reduction_cost.
3090
3091    Models cost for a reduction operation, including the vector ops
3092    generated within the strip-mine loop, the initial definition before
3093    the loop, and the epilogue code that must be generated.  */
3094
3095 static bool
3096 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3097                            int ncopies)
3098 {
3099   int prologue_cost = 0, epilogue_cost = 0;
3100   enum tree_code code;
3101   optab optab;
3102   tree vectype;
3103   gimple stmt, orig_stmt;
3104   tree reduction_op;
3105   machine_mode mode;
3106   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3107   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3108   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3109
3110   /* Cost of reduction op inside loop.  */
3111   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3112                                         stmt_info, 0, vect_body);
3113   stmt = STMT_VINFO_STMT (stmt_info);
3114
3115   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3116     {
3117     case GIMPLE_SINGLE_RHS:
3118       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3119       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3120       break;
3121     case GIMPLE_UNARY_RHS:
3122       reduction_op = gimple_assign_rhs1 (stmt);
3123       break;
3124     case GIMPLE_BINARY_RHS:
3125       reduction_op = gimple_assign_rhs2 (stmt);
3126       break;
3127     case GIMPLE_TERNARY_RHS:
3128       reduction_op = gimple_assign_rhs3 (stmt);
3129       break;
3130     default:
3131       gcc_unreachable ();
3132     }
3133
3134   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3135   if (!vectype)
3136     {
3137       if (dump_enabled_p ())
3138         {
3139           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3140                            "unsupported data-type ");
3141           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3142                              TREE_TYPE (reduction_op));
3143           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3144         }
3145       return false;
3146    }
3147
3148   mode = TYPE_MODE (vectype);
3149   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3150
3151   if (!orig_stmt)
3152     orig_stmt = STMT_VINFO_STMT (stmt_info);
3153
3154   code = gimple_assign_rhs_code (orig_stmt);
3155
3156   /* Add in cost for initial definition.  */
3157   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3158                                   stmt_info, 0, vect_prologue);
3159
3160   /* Determine cost of epilogue code.
3161
3162      We have a reduction operator that will reduce the vector in one statement.
3163      Also requires scalar extract.  */
3164
3165   if (!nested_in_vect_loop_p (loop, orig_stmt))
3166     {
3167       if (reduc_code != ERROR_MARK)
3168         {
3169           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3170                                           stmt_info, 0, vect_epilogue);
3171           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3172                                           stmt_info, 0, vect_epilogue);
3173         }
3174       else
3175         {
3176           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3177           tree bitsize =
3178             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3179           int element_bitsize = tree_to_uhwi (bitsize);
3180           int nelements = vec_size_in_bits / element_bitsize;
3181
3182           optab = optab_for_tree_code (code, vectype, optab_default);
3183
3184           /* We have a whole vector shift available.  */
3185           if (VECTOR_MODE_P (mode)
3186               && optab_handler (optab, mode) != CODE_FOR_nothing
3187               && optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3188             {
3189               /* Final reduction via vector shifts and the reduction operator.
3190                  Also requires scalar extract.  */
3191               epilogue_cost += add_stmt_cost (target_cost_data,
3192                                               exact_log2 (nelements) * 2,
3193                                               vector_stmt, stmt_info, 0,
3194                                               vect_epilogue);
3195               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3196                                               vec_to_scalar, stmt_info, 0,
3197                                               vect_epilogue);
3198             }
3199           else
3200             /* Use extracts and reduction op for final reduction.  For N
3201                elements, we have N extracts and N-1 reduction ops.  */
3202             epilogue_cost += add_stmt_cost (target_cost_data,
3203                                             nelements + nelements - 1,
3204                                             vector_stmt, stmt_info, 0,
3205                                             vect_epilogue);
3206         }
3207     }
3208
3209   if (dump_enabled_p ())
3210     dump_printf (MSG_NOTE,
3211                  "vect_model_reduction_cost: inside_cost = %d, "
3212                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3213                  prologue_cost, epilogue_cost);
3214
3215   return true;
3216 }
3217
3218
3219 /* Function vect_model_induction_cost.
3220
3221    Models cost for induction operations.  */
3222
3223 static void
3224 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3225 {
3226   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3227   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3228   unsigned inside_cost, prologue_cost;
3229
3230   /* loop cost for vec_loop.  */
3231   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3232                                stmt_info, 0, vect_body);
3233
3234   /* prologue cost for vec_init and vec_step.  */
3235   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3236                                  stmt_info, 0, vect_prologue);
3237
3238   if (dump_enabled_p ())
3239     dump_printf_loc (MSG_NOTE, vect_location,
3240                      "vect_model_induction_cost: inside_cost = %d, "
3241                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3242 }
3243
3244
3245 /* Function get_initial_def_for_induction
3246
3247    Input:
3248    STMT - a stmt that performs an induction operation in the loop.
3249    IV_PHI - the initial value of the induction variable
3250
3251    Output:
3252    Return a vector variable, initialized with the first VF values of
3253    the induction variable.  E.g., for an iv with IV_PHI='X' and
3254    evolution S, for a vector of 4 units, we want to return:
3255    [X, X + S, X + 2*S, X + 3*S].  */
3256
3257 static tree
3258 get_initial_def_for_induction (gimple iv_phi)
3259 {
3260   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3261   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3262   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3263   tree vectype;
3264   int nunits;
3265   edge pe = loop_preheader_edge (loop);
3266   struct loop *iv_loop;
3267   basic_block new_bb;
3268   tree new_vec, vec_init, vec_step, t;
3269   tree new_var;
3270   tree new_name;
3271   gimple init_stmt, induction_phi, new_stmt;
3272   tree induc_def, vec_def, vec_dest;
3273   tree init_expr, step_expr;
3274   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3275   int i;
3276   int ncopies;
3277   tree expr;
3278   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3279   bool nested_in_vect_loop = false;
3280   gimple_seq stmts = NULL;
3281   imm_use_iterator imm_iter;
3282   use_operand_p use_p;
3283   gimple exit_phi;
3284   edge latch_e;
3285   tree loop_arg;
3286   gimple_stmt_iterator si;
3287   basic_block bb = gimple_bb (iv_phi);
3288   tree stepvectype;
3289   tree resvectype;
3290
3291   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3292   if (nested_in_vect_loop_p (loop, iv_phi))
3293     {
3294       nested_in_vect_loop = true;
3295       iv_loop = loop->inner;
3296     }
3297   else
3298     iv_loop = loop;
3299   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3300
3301   latch_e = loop_latch_edge (iv_loop);
3302   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3303
3304   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3305   gcc_assert (step_expr != NULL_TREE);
3306
3307   pe = loop_preheader_edge (iv_loop);
3308   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3309                                      loop_preheader_edge (iv_loop));
3310
3311   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3312   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3313   gcc_assert (vectype);
3314   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3315   ncopies = vf / nunits;
3316
3317   gcc_assert (phi_info);
3318   gcc_assert (ncopies >= 1);
3319
3320   /* Convert the step to the desired type.  */
3321   step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3322                                                   step_expr),
3323                                     &stmts, true, NULL_TREE);
3324   if (stmts)
3325     {
3326       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3327       gcc_assert (!new_bb);
3328     }
3329
3330   /* Find the first insertion point in the BB.  */
3331   si = gsi_after_labels (bb);
3332
3333   /* Create the vector that holds the initial_value of the induction.  */
3334   if (nested_in_vect_loop)
3335     {
3336       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3337          been created during vectorization of previous stmts.  We obtain it
3338          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3339       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3340       /* If the initial value is not of proper type, convert it.  */
3341       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3342         {
3343           new_stmt = gimple_build_assign_with_ops
3344               (VIEW_CONVERT_EXPR,
3345                vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3346                build1 (VIEW_CONVERT_EXPR, vectype, vec_init), NULL_TREE);
3347           vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3348           gimple_assign_set_lhs (new_stmt, vec_init);
3349           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3350                                                  new_stmt);
3351           gcc_assert (!new_bb);
3352           set_vinfo_for_stmt (new_stmt,
3353                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3354         }
3355     }
3356   else
3357     {
3358       vec<constructor_elt, va_gc> *v;
3359
3360       /* iv_loop is the loop to be vectorized. Create:
3361          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3362       new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3363                                        vect_scalar_var, "var_");
3364       new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3365                                                      init_expr),
3366                                        &stmts, false, new_var);
3367       if (stmts)
3368         {
3369           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3370           gcc_assert (!new_bb);
3371         }
3372
3373       vec_alloc (v, nunits);
3374       bool constant_p = is_gimple_min_invariant (new_name);
3375       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3376       for (i = 1; i < nunits; i++)
3377         {
3378           /* Create: new_name_i = new_name + step_expr  */
3379           new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3380                                   new_name, step_expr);
3381           if (!is_gimple_min_invariant (new_name))
3382             {
3383               init_stmt = gimple_build_assign (new_var, new_name);
3384               new_name = make_ssa_name (new_var, init_stmt);
3385               gimple_assign_set_lhs (init_stmt, new_name);
3386               new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3387               gcc_assert (!new_bb);
3388               if (dump_enabled_p ())
3389                 {
3390                   dump_printf_loc (MSG_NOTE, vect_location,
3391                                    "created new init_stmt: ");
3392                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3393                   dump_printf (MSG_NOTE, "\n");
3394                 }
3395               constant_p = false;
3396             }
3397           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3398         }
3399       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3400       if (constant_p)
3401         new_vec = build_vector_from_ctor (vectype, v);
3402       else
3403         new_vec = build_constructor (vectype, v);
3404       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3405     }
3406
3407
3408   /* Create the vector that holds the step of the induction.  */
3409   if (nested_in_vect_loop)
3410     /* iv_loop is nested in the loop to be vectorized. Generate:
3411        vec_step = [S, S, S, S]  */
3412     new_name = step_expr;
3413   else
3414     {
3415       /* iv_loop is the loop to be vectorized. Generate:
3416           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3417       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3418         {
3419           expr = build_int_cst (integer_type_node, vf);
3420           expr = fold_convert (TREE_TYPE (step_expr), expr);
3421         }
3422       else
3423         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3424       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3425                               expr, step_expr);
3426       if (TREE_CODE (step_expr) == SSA_NAME)
3427         new_name = vect_init_vector (iv_phi, new_name,
3428                                      TREE_TYPE (step_expr), NULL);
3429     }
3430
3431   t = unshare_expr (new_name);
3432   gcc_assert (CONSTANT_CLASS_P (new_name)
3433               || TREE_CODE (new_name) == SSA_NAME);
3434   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3435   gcc_assert (stepvectype);
3436   new_vec = build_vector_from_val (stepvectype, t);
3437   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3438
3439
3440   /* Create the following def-use cycle:
3441      loop prolog:
3442          vec_init = ...
3443          vec_step = ...
3444      loop:
3445          vec_iv = PHI <vec_init, vec_loop>
3446          ...
3447          STMT
3448          ...
3449          vec_loop = vec_iv + vec_step;  */
3450
3451   /* Create the induction-phi that defines the induction-operand.  */
3452   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3453   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3454   set_vinfo_for_stmt (induction_phi,
3455                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3456   induc_def = PHI_RESULT (induction_phi);
3457
3458   /* Create the iv update inside the loop  */
3459   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3460                                            induc_def, vec_step);
3461   vec_def = make_ssa_name (vec_dest, new_stmt);
3462   gimple_assign_set_lhs (new_stmt, vec_def);
3463   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3464   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3465                                                    NULL));
3466
3467   /* Set the arguments of the phi node:  */
3468   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3469   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3470                UNKNOWN_LOCATION);
3471
3472
3473   /* In case that vectorization factor (VF) is bigger than the number
3474      of elements that we can fit in a vectype (nunits), we have to generate
3475      more than one vector stmt - i.e - we need to "unroll" the
3476      vector stmt by a factor VF/nunits.  For more details see documentation
3477      in vectorizable_operation.  */
3478
3479   if (ncopies > 1)
3480     {
3481       stmt_vec_info prev_stmt_vinfo;
3482       /* FORNOW. This restriction should be relaxed.  */
3483       gcc_assert (!nested_in_vect_loop);
3484
3485       /* Create the vector that holds the step of the induction.  */
3486       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3487         {
3488           expr = build_int_cst (integer_type_node, nunits);
3489           expr = fold_convert (TREE_TYPE (step_expr), expr);
3490         }
3491       else
3492         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3493       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3494                               expr, step_expr);
3495       if (TREE_CODE (step_expr) == SSA_NAME)
3496         new_name = vect_init_vector (iv_phi, new_name,
3497                                      TREE_TYPE (step_expr), NULL);
3498       t = unshare_expr (new_name);
3499       gcc_assert (CONSTANT_CLASS_P (new_name)
3500                   || TREE_CODE (new_name) == SSA_NAME);
3501       new_vec = build_vector_from_val (stepvectype, t);
3502       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3503
3504       vec_def = induc_def;
3505       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3506       for (i = 1; i < ncopies; i++)
3507         {
3508           /* vec_i = vec_prev + vec_step  */
3509           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3510                                                    vec_def, vec_step);
3511           vec_def = make_ssa_name (vec_dest, new_stmt);
3512           gimple_assign_set_lhs (new_stmt, vec_def);
3513
3514           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3515           if (!useless_type_conversion_p (resvectype, vectype))
3516             {
3517               new_stmt = gimple_build_assign_with_ops
3518                   (VIEW_CONVERT_EXPR,
3519                    vect_get_new_vect_var (resvectype, vect_simple_var,
3520                                           "vec_iv_"),
3521                    build1 (VIEW_CONVERT_EXPR, resvectype,
3522                            gimple_assign_lhs (new_stmt)), NULL_TREE);
3523               gimple_assign_set_lhs (new_stmt,
3524                                      make_ssa_name
3525                                        (gimple_assign_lhs (new_stmt), new_stmt));
3526               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3527             }
3528           set_vinfo_for_stmt (new_stmt,
3529                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3530           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3531           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3532         }
3533     }
3534
3535   if (nested_in_vect_loop)
3536     {
3537       /* Find the loop-closed exit-phi of the induction, and record
3538          the final vector of induction results:  */
3539       exit_phi = NULL;
3540       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3541         {
3542           gimple use_stmt = USE_STMT (use_p);
3543           if (is_gimple_debug (use_stmt))
3544             continue;
3545
3546           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3547             {
3548               exit_phi = use_stmt;
3549               break;
3550             }
3551         }
3552       if (exit_phi)
3553         {
3554           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3555           /* FORNOW. Currently not supporting the case that an inner-loop induction
3556              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3557           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3558                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3559
3560           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3561           if (dump_enabled_p ())
3562             {
3563               dump_printf_loc (MSG_NOTE, vect_location,
3564                                "vector of inductions after inner-loop:");
3565               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3566               dump_printf (MSG_NOTE, "\n");
3567             }
3568         }
3569     }
3570
3571
3572   if (dump_enabled_p ())
3573     {
3574       dump_printf_loc (MSG_NOTE, vect_location,
3575                        "transform induction: created def-use cycle: ");
3576       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3577       dump_printf (MSG_NOTE, "\n");
3578       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3579                         SSA_NAME_DEF_STMT (vec_def), 0);
3580       dump_printf (MSG_NOTE, "\n");
3581     }
3582
3583   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3584   if (!useless_type_conversion_p (resvectype, vectype))
3585     {
3586       new_stmt = gimple_build_assign_with_ops
3587          (VIEW_CONVERT_EXPR,
3588           vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3589           build1 (VIEW_CONVERT_EXPR, resvectype, induc_def), NULL_TREE);
3590       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3591       gimple_assign_set_lhs (new_stmt, induc_def);
3592       si = gsi_after_labels (bb);
3593       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3594       set_vinfo_for_stmt (new_stmt,
3595                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3596       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3597         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3598     }
3599
3600   return induc_def;
3601 }
3602
3603
3604 /* Function get_initial_def_for_reduction
3605
3606    Input:
3607    STMT - a stmt that performs a reduction operation in the loop.
3608    INIT_VAL - the initial value of the reduction variable
3609
3610    Output:
3611    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3612         of the reduction (used for adjusting the epilog - see below).
3613    Return a vector variable, initialized according to the operation that STMT
3614         performs. This vector will be used as the initial value of the
3615         vector of partial results.
3616
3617    Option1 (adjust in epilog): Initialize the vector as follows:
3618      add/bit or/xor:    [0,0,...,0,0]
3619      mult/bit and:      [1,1,...,1,1]
3620      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3621    and when necessary (e.g. add/mult case) let the caller know
3622    that it needs to adjust the result by init_val.
3623
3624    Option2: Initialize the vector as follows:
3625      add/bit or/xor:    [init_val,0,0,...,0]
3626      mult/bit and:      [init_val,1,1,...,1]
3627      min/max/cond_expr: [init_val,init_val,...,init_val]
3628    and no adjustments are needed.
3629
3630    For example, for the following code:
3631
3632    s = init_val;
3633    for (i=0;i<n;i++)
3634      s = s + a[i];
3635
3636    STMT is 's = s + a[i]', and the reduction variable is 's'.
3637    For a vector of 4 units, we want to return either [0,0,0,init_val],
3638    or [0,0,0,0] and let the caller know that it needs to adjust
3639    the result at the end by 'init_val'.
3640
3641    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3642    initialization vector is simpler (same element in all entries), if
3643    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3644
3645    A cost model should help decide between these two schemes.  */
3646
3647 tree
3648 get_initial_def_for_reduction (gimple stmt, tree init_val,
3649                                tree *adjustment_def)
3650 {
3651   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3652   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3653   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3654   tree scalar_type = TREE_TYPE (init_val);
3655   tree vectype = get_vectype_for_scalar_type (scalar_type);
3656   int nunits;
3657   enum tree_code code = gimple_assign_rhs_code (stmt);
3658   tree def_for_init;
3659   tree init_def;
3660   tree *elts;
3661   int i;
3662   bool nested_in_vect_loop = false;
3663   tree init_value;
3664   REAL_VALUE_TYPE real_init_val = dconst0;
3665   int int_init_val = 0;
3666   gimple def_stmt = NULL;
3667
3668   gcc_assert (vectype);
3669   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3670
3671   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3672               || SCALAR_FLOAT_TYPE_P (scalar_type));
3673
3674   if (nested_in_vect_loop_p (loop, stmt))
3675     nested_in_vect_loop = true;
3676   else
3677     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3678
3679   /* In case of double reduction we only create a vector variable to be put
3680      in the reduction phi node.  The actual statement creation is done in
3681      vect_create_epilog_for_reduction.  */
3682   if (adjustment_def && nested_in_vect_loop
3683       && TREE_CODE (init_val) == SSA_NAME
3684       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3685       && gimple_code (def_stmt) == GIMPLE_PHI
3686       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3687       && vinfo_for_stmt (def_stmt)
3688       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3689           == vect_double_reduction_def)
3690     {
3691       *adjustment_def = NULL;
3692       return vect_create_destination_var (init_val, vectype);
3693     }
3694
3695   if (TREE_CONSTANT (init_val))
3696     {
3697       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3698         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3699       else
3700         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3701     }
3702   else
3703     init_value = init_val;
3704
3705   switch (code)
3706     {
3707       case WIDEN_SUM_EXPR:
3708       case DOT_PROD_EXPR:
3709       case SAD_EXPR:
3710       case PLUS_EXPR:
3711       case MINUS_EXPR:
3712       case BIT_IOR_EXPR:
3713       case BIT_XOR_EXPR:
3714       case MULT_EXPR:
3715       case BIT_AND_EXPR:
3716         /* ADJUSMENT_DEF is NULL when called from
3717            vect_create_epilog_for_reduction to vectorize double reduction.  */
3718         if (adjustment_def)
3719           {
3720             if (nested_in_vect_loop)
3721               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3722                                                               NULL);
3723             else
3724               *adjustment_def = init_val;
3725           }
3726
3727         if (code == MULT_EXPR)
3728           {
3729             real_init_val = dconst1;
3730             int_init_val = 1;
3731           }
3732
3733         if (code == BIT_AND_EXPR)
3734           int_init_val = -1;
3735
3736         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3737           def_for_init = build_real (scalar_type, real_init_val);
3738         else
3739           def_for_init = build_int_cst (scalar_type, int_init_val);
3740
3741         /* Create a vector of '0' or '1' except the first element.  */
3742         elts = XALLOCAVEC (tree, nunits);
3743         for (i = nunits - 2; i >= 0; --i)
3744           elts[i + 1] = def_for_init;
3745
3746         /* Option1: the first element is '0' or '1' as well.  */
3747         if (adjustment_def)
3748           {
3749             elts[0] = def_for_init;
3750             init_def = build_vector (vectype, elts);
3751             break;
3752           }
3753
3754         /* Option2: the first element is INIT_VAL.  */
3755         elts[0] = init_val;
3756         if (TREE_CONSTANT (init_val))
3757           init_def = build_vector (vectype, elts);
3758         else
3759           {
3760             vec<constructor_elt, va_gc> *v;
3761             vec_alloc (v, nunits);
3762             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3763             for (i = 1; i < nunits; ++i)
3764               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3765             init_def = build_constructor (vectype, v);
3766           }
3767
3768         break;
3769
3770       case MIN_EXPR:
3771       case MAX_EXPR:
3772       case COND_EXPR:
3773         if (adjustment_def)
3774           {
3775             *adjustment_def = NULL_TREE;
3776             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3777             break;
3778           }
3779
3780         init_def = build_vector_from_val (vectype, init_value);
3781         break;
3782
3783       default:
3784         gcc_unreachable ();
3785     }
3786
3787   return init_def;
3788 }
3789
3790
3791 /* Function vect_create_epilog_for_reduction
3792
3793    Create code at the loop-epilog to finalize the result of a reduction
3794    computation.
3795
3796    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3797      reduction statements.
3798    STMT is the scalar reduction stmt that is being vectorized.
3799    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3800      number of elements that we can fit in a vectype (nunits).  In this case
3801      we have to generate more than one vector stmt - i.e - we need to "unroll"
3802      the vector stmt by a factor VF/nunits.  For more details see documentation
3803      in vectorizable_operation.
3804    REDUC_CODE is the tree-code for the epilog reduction.
3805    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3806      computation.
3807    REDUC_INDEX is the index of the operand in the right hand side of the
3808      statement that is defined by REDUCTION_PHI.
3809    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3810    SLP_NODE is an SLP node containing a group of reduction statements. The
3811      first one in this group is STMT.
3812
3813    This function:
3814    1. Creates the reduction def-use cycles: sets the arguments for
3815       REDUCTION_PHIS:
3816       The loop-entry argument is the vectorized initial-value of the reduction.
3817       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3818       sums.
3819    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3820       by applying the operation specified by REDUC_CODE if available, or by
3821       other means (whole-vector shifts or a scalar loop).
3822       The function also creates a new phi node at the loop exit to preserve
3823       loop-closed form, as illustrated below.
3824
3825      The flow at the entry to this function:
3826
3827         loop:
3828           vec_def = phi <null, null>            # REDUCTION_PHI
3829           VECT_DEF = vector_stmt                # vectorized form of STMT
3830           s_loop = scalar_stmt                  # (scalar) STMT
3831         loop_exit:
3832           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3833           use <s_out0>
3834           use <s_out0>
3835
3836      The above is transformed by this function into:
3837
3838         loop:
3839           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3840           VECT_DEF = vector_stmt                # vectorized form of STMT
3841           s_loop = scalar_stmt                  # (scalar) STMT
3842         loop_exit:
3843           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3844           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3845           v_out2 = reduce <v_out1>
3846           s_out3 = extract_field <v_out2, 0>
3847           s_out4 = adjust_result <s_out3>
3848           use <s_out4>
3849           use <s_out4>
3850 */
3851
3852 static void
3853 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3854                                   int ncopies, enum tree_code reduc_code,
3855                                   vec<gimple> reduction_phis,
3856                                   int reduc_index, bool double_reduc,
3857                                   slp_tree slp_node)
3858 {
3859   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3860   stmt_vec_info prev_phi_info;
3861   tree vectype;
3862   machine_mode mode;
3863   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3864   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3865   basic_block exit_bb;
3866   tree scalar_dest;
3867   tree scalar_type;
3868   gimple new_phi = NULL, phi;
3869   gimple_stmt_iterator exit_gsi;
3870   tree vec_dest;
3871   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3872   gimple epilog_stmt = NULL;
3873   enum tree_code code = gimple_assign_rhs_code (stmt);
3874   gimple exit_phi;
3875   tree bitsize, bitpos;
3876   tree adjustment_def = NULL;
3877   tree vec_initial_def = NULL;
3878   tree reduction_op, expr, def;
3879   tree orig_name, scalar_result;
3880   imm_use_iterator imm_iter, phi_imm_iter;
3881   use_operand_p use_p, phi_use_p;
3882   bool extract_scalar_result = false;
3883   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3884   bool nested_in_vect_loop = false;
3885   auto_vec<gimple> new_phis;
3886   auto_vec<gimple> inner_phis;
3887   enum vect_def_type dt = vect_unknown_def_type;
3888   int j, i;
3889   auto_vec<tree> scalar_results;
3890   unsigned int group_size = 1, k, ratio;
3891   auto_vec<tree> vec_initial_defs;
3892   auto_vec<gimple> phis;
3893   bool slp_reduc = false;
3894   tree new_phi_result;
3895   gimple inner_phi = NULL;
3896
3897   if (slp_node)
3898     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3899
3900   if (nested_in_vect_loop_p (loop, stmt))
3901     {
3902       outer_loop = loop;
3903       loop = loop->inner;
3904       nested_in_vect_loop = true;
3905       gcc_assert (!slp_node);
3906     }
3907
3908   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3909     {
3910     case GIMPLE_SINGLE_RHS:
3911       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3912                   == ternary_op);
3913       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3914       break;
3915     case GIMPLE_UNARY_RHS:
3916       reduction_op = gimple_assign_rhs1 (stmt);
3917       break;
3918     case GIMPLE_BINARY_RHS:
3919       reduction_op = reduc_index ?
3920                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3921       break;
3922     case GIMPLE_TERNARY_RHS:
3923       reduction_op = gimple_op (stmt, reduc_index + 1);
3924       break;
3925     default:
3926       gcc_unreachable ();
3927     }
3928
3929   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3930   gcc_assert (vectype);
3931   mode = TYPE_MODE (vectype);
3932
3933   /* 1. Create the reduction def-use cycle:
3934      Set the arguments of REDUCTION_PHIS, i.e., transform
3935
3936         loop:
3937           vec_def = phi <null, null>            # REDUCTION_PHI
3938           VECT_DEF = vector_stmt                # vectorized form of STMT
3939           ...
3940
3941      into:
3942
3943         loop:
3944           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3945           VECT_DEF = vector_stmt                # vectorized form of STMT
3946           ...
3947
3948      (in case of SLP, do it for all the phis). */
3949
3950   /* Get the loop-entry arguments.  */
3951   if (slp_node)
3952     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3953                        NULL, slp_node, reduc_index);
3954   else
3955     {
3956       vec_initial_defs.create (1);
3957      /* For the case of reduction, vect_get_vec_def_for_operand returns
3958         the scalar def before the loop, that defines the initial value
3959         of the reduction variable.  */
3960       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3961                                                       &adjustment_def);
3962       vec_initial_defs.quick_push (vec_initial_def);
3963     }
3964
3965   /* Set phi nodes arguments.  */
3966   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
3967     {
3968       tree vec_init_def, def;
3969       gimple_seq stmts;
3970       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
3971                                            true, NULL_TREE);
3972       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
3973       def = vect_defs[i];
3974       for (j = 0; j < ncopies; j++)
3975         {
3976           /* Set the loop-entry arg of the reduction-phi.  */
3977           add_phi_arg (phi, vec_init_def, loop_preheader_edge (loop),
3978                        UNKNOWN_LOCATION);
3979
3980           /* Set the loop-latch arg for the reduction-phi.  */
3981           if (j > 0)
3982             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
3983
3984           add_phi_arg (phi, def, loop_latch_edge (loop), UNKNOWN_LOCATION);
3985
3986           if (dump_enabled_p ())
3987             {
3988               dump_printf_loc (MSG_NOTE, vect_location,
3989                                "transform reduction: created def-use cycle: ");
3990               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
3991               dump_printf (MSG_NOTE, "\n");
3992               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
3993               dump_printf (MSG_NOTE, "\n");
3994             }
3995
3996           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
3997         }
3998     }
3999
4000   /* 2. Create epilog code.
4001         The reduction epilog code operates across the elements of the vector
4002         of partial results computed by the vectorized loop.
4003         The reduction epilog code consists of:
4004
4005         step 1: compute the scalar result in a vector (v_out2)
4006         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4007         step 3: adjust the scalar result (s_out3) if needed.
4008
4009         Step 1 can be accomplished using one the following three schemes:
4010           (scheme 1) using reduc_code, if available.
4011           (scheme 2) using whole-vector shifts, if available.
4012           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4013                      combined.
4014
4015           The overall epilog code looks like this:
4016
4017           s_out0 = phi <s_loop>         # original EXIT_PHI
4018           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4019           v_out2 = reduce <v_out1>              # step 1
4020           s_out3 = extract_field <v_out2, 0>    # step 2
4021           s_out4 = adjust_result <s_out3>       # step 3
4022
4023           (step 3 is optional, and steps 1 and 2 may be combined).
4024           Lastly, the uses of s_out0 are replaced by s_out4.  */
4025
4026
4027   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4028          v_out1 = phi <VECT_DEF>
4029          Store them in NEW_PHIS.  */
4030
4031   exit_bb = single_exit (loop)->dest;
4032   prev_phi_info = NULL;
4033   new_phis.create (vect_defs.length ());
4034   FOR_EACH_VEC_ELT (vect_defs, i, def)
4035     {
4036       for (j = 0; j < ncopies; j++)
4037         {
4038           tree new_def = copy_ssa_name (def, NULL);
4039           phi = create_phi_node (new_def, exit_bb);
4040           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4041           if (j == 0)
4042             new_phis.quick_push (phi);
4043           else
4044             {
4045               def = vect_get_vec_def_for_stmt_copy (dt, def);
4046               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4047             }
4048
4049           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4050           prev_phi_info = vinfo_for_stmt (phi);
4051         }
4052     }
4053
4054   /* The epilogue is created for the outer-loop, i.e., for the loop being
4055      vectorized.  Create exit phis for the outer loop.  */
4056   if (double_reduc)
4057     {
4058       loop = outer_loop;
4059       exit_bb = single_exit (loop)->dest;
4060       inner_phis.create (vect_defs.length ());
4061       FOR_EACH_VEC_ELT (new_phis, i, phi)
4062         {
4063           tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4064           gimple outer_phi = create_phi_node (new_result, exit_bb);
4065           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4066                            PHI_RESULT (phi));
4067           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4068                                                             loop_vinfo, NULL));
4069           inner_phis.quick_push (phi);
4070           new_phis[i] = outer_phi;
4071           prev_phi_info = vinfo_for_stmt (outer_phi);
4072           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4073             {
4074               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4075               new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4076               outer_phi = create_phi_node (new_result, exit_bb);
4077               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4078                                PHI_RESULT (phi));
4079               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4080                                                         loop_vinfo, NULL));
4081               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4082               prev_phi_info = vinfo_for_stmt (outer_phi);
4083             }
4084         }
4085     }
4086
4087   exit_gsi = gsi_after_labels (exit_bb);
4088
4089   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4090          (i.e. when reduc_code is not available) and in the final adjustment
4091          code (if needed).  Also get the original scalar reduction variable as
4092          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4093          represents a reduction pattern), the tree-code and scalar-def are
4094          taken from the original stmt that the pattern-stmt (STMT) replaces.
4095          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4096          are taken from STMT.  */
4097
4098   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4099   if (!orig_stmt)
4100     {
4101       /* Regular reduction  */
4102       orig_stmt = stmt;
4103     }
4104   else
4105     {
4106       /* Reduction pattern  */
4107       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4108       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4109       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4110     }
4111
4112   code = gimple_assign_rhs_code (orig_stmt);
4113   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4114      partial results are added and not subtracted.  */
4115   if (code == MINUS_EXPR)
4116     code = PLUS_EXPR;
4117
4118   scalar_dest = gimple_assign_lhs (orig_stmt);
4119   scalar_type = TREE_TYPE (scalar_dest);
4120   scalar_results.create (group_size);
4121   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4122   bitsize = TYPE_SIZE (scalar_type);
4123
4124   /* In case this is a reduction in an inner-loop while vectorizing an outer
4125      loop - we don't need to extract a single scalar result at the end of the
4126      inner-loop (unless it is double reduction, i.e., the use of reduction is
4127      outside the outer-loop).  The final vector of partial results will be used
4128      in the vectorized outer-loop, or reduced to a scalar result at the end of
4129      the outer-loop.  */
4130   if (nested_in_vect_loop && !double_reduc)
4131     goto vect_finalize_reduction;
4132
4133   /* SLP reduction without reduction chain, e.g.,
4134      # a1 = phi <a2, a0>
4135      # b1 = phi <b2, b0>
4136      a2 = operation (a1)
4137      b2 = operation (b1)  */
4138   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4139
4140   /* In case of reduction chain, e.g.,
4141      # a1 = phi <a3, a0>
4142      a2 = operation (a1)
4143      a3 = operation (a2),
4144
4145      we may end up with more than one vector result.  Here we reduce them to
4146      one vector.  */
4147   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4148     {
4149       tree first_vect = PHI_RESULT (new_phis[0]);
4150       tree tmp;
4151       gimple new_vec_stmt = NULL;
4152
4153       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4154       for (k = 1; k < new_phis.length (); k++)
4155         {
4156           gimple next_phi = new_phis[k];
4157           tree second_vect = PHI_RESULT (next_phi);
4158
4159           tmp = build2 (code, vectype,  first_vect, second_vect);
4160           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4161           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4162           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4163           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4164         }
4165
4166       new_phi_result = first_vect;
4167       if (new_vec_stmt)
4168         {
4169           new_phis.truncate (0);
4170           new_phis.safe_push (new_vec_stmt);
4171         }
4172     }
4173   else
4174     new_phi_result = PHI_RESULT (new_phis[0]);
4175
4176   /* 2.3 Create the reduction code, using one of the three schemes described
4177          above. In SLP we simply need to extract all the elements from the
4178          vector (without reducing them), so we use scalar shifts.  */
4179   if (reduc_code != ERROR_MARK && !slp_reduc)
4180     {
4181       tree tmp;
4182       tree vec_elem_type;
4183
4184       /*** Case 1:  Create:
4185            v_out2 = reduc_expr <v_out1>  */
4186
4187       if (dump_enabled_p ())
4188         dump_printf_loc (MSG_NOTE, vect_location,
4189                          "Reduce using direct vector reduction.\n");
4190
4191       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4192       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4193         {
4194           tree tmp_dest =
4195               vect_create_destination_var (scalar_dest, vec_elem_type);
4196           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4197           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4198           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4199           gimple_assign_set_lhs (epilog_stmt, new_temp);
4200           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4201
4202           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4203         }
4204       else
4205         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4206       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4207       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4208       gimple_assign_set_lhs (epilog_stmt, new_temp);
4209       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4210       scalar_results.safe_push (new_temp);
4211     }
4212   else
4213     {
4214       enum tree_code shift_code = ERROR_MARK;
4215       bool have_whole_vector_shift = true;
4216       int bit_offset;
4217       int element_bitsize = tree_to_uhwi (bitsize);
4218       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4219       tree vec_temp;
4220
4221       if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
4222         shift_code = VEC_RSHIFT_EXPR;
4223       else
4224         have_whole_vector_shift = false;
4225
4226       /* Regardless of whether we have a whole vector shift, if we're
4227          emulating the operation via tree-vect-generic, we don't want
4228          to use it.  Only the first round of the reduction is likely
4229          to still be profitable via emulation.  */
4230       /* ??? It might be better to emit a reduction tree code here, so that
4231          tree-vect-generic can expand the first round via bit tricks.  */
4232       if (!VECTOR_MODE_P (mode))
4233         have_whole_vector_shift = false;
4234       else
4235         {
4236           optab optab = optab_for_tree_code (code, vectype, optab_default);
4237           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4238             have_whole_vector_shift = false;
4239         }
4240
4241       if (have_whole_vector_shift && !slp_reduc)
4242         {
4243           /*** Case 2: Create:
4244              for (offset = VS/2; offset >= element_size; offset/=2)
4245                 {
4246                   Create:  va' = vec_shift <va, offset>
4247                   Create:  va = vop <va, va'>
4248                 }  */
4249
4250           if (dump_enabled_p ())
4251             dump_printf_loc (MSG_NOTE, vect_location,
4252                              "Reduce using vector shifts\n");
4253
4254           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4255           new_temp = new_phi_result;
4256           for (bit_offset = vec_size_in_bits/2;
4257                bit_offset >= element_bitsize;
4258                bit_offset /= 2)
4259             {
4260               tree bitpos = size_int (bit_offset);
4261
4262               epilog_stmt = gimple_build_assign_with_ops (shift_code,
4263                                                vec_dest, new_temp, bitpos);
4264               new_name = make_ssa_name (vec_dest, epilog_stmt);
4265               gimple_assign_set_lhs (epilog_stmt, new_name);
4266               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4267
4268               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4269                                                           new_name, new_temp);
4270               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4271               gimple_assign_set_lhs (epilog_stmt, new_temp);
4272               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4273             }
4274
4275           extract_scalar_result = true;
4276         }
4277       else
4278         {
4279           tree rhs;
4280
4281           /*** Case 3: Create:
4282              s = extract_field <v_out2, 0>
4283              for (offset = element_size;
4284                   offset < vector_size;
4285                   offset += element_size;)
4286                {
4287                  Create:  s' = extract_field <v_out2, offset>
4288                  Create:  s = op <s, s'>  // For non SLP cases
4289                }  */
4290
4291           if (dump_enabled_p ())
4292             dump_printf_loc (MSG_NOTE, vect_location,
4293                              "Reduce using scalar code.\n");
4294
4295           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4296           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4297             {
4298               if (gimple_code (new_phi) == GIMPLE_PHI)
4299                 vec_temp = PHI_RESULT (new_phi);
4300               else
4301                 vec_temp = gimple_assign_lhs (new_phi);
4302               rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4303                             bitsize_zero_node);
4304               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4305               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4306               gimple_assign_set_lhs (epilog_stmt, new_temp);
4307               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4308
4309               /* In SLP we don't need to apply reduction operation, so we just
4310                  collect s' values in SCALAR_RESULTS.  */
4311               if (slp_reduc)
4312                 scalar_results.safe_push (new_temp);
4313
4314               for (bit_offset = element_bitsize;
4315                    bit_offset < vec_size_in_bits;
4316                    bit_offset += element_bitsize)
4317                 {
4318                   tree bitpos = bitsize_int (bit_offset);
4319                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4320                                      bitsize, bitpos);
4321
4322                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4323                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4324                   gimple_assign_set_lhs (epilog_stmt, new_name);
4325                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4326
4327                   if (slp_reduc)
4328                     {
4329                       /* In SLP we don't need to apply reduction operation, so
4330                          we just collect s' values in SCALAR_RESULTS.  */
4331                       new_temp = new_name;
4332                       scalar_results.safe_push (new_name);
4333                     }
4334                   else
4335                     {
4336                       epilog_stmt = gimple_build_assign_with_ops (code,
4337                                           new_scalar_dest, new_name, new_temp);
4338                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4339                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4340                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4341                     }
4342                 }
4343             }
4344
4345           /* The only case where we need to reduce scalar results in SLP, is
4346              unrolling.  If the size of SCALAR_RESULTS is greater than
4347              GROUP_SIZE, we reduce them combining elements modulo
4348              GROUP_SIZE.  */
4349           if (slp_reduc)
4350             {
4351               tree res, first_res, new_res;
4352               gimple new_stmt;
4353
4354               /* Reduce multiple scalar results in case of SLP unrolling.  */
4355               for (j = group_size; scalar_results.iterate (j, &res);
4356                    j++)
4357                 {
4358                   first_res = scalar_results[j % group_size];
4359                   new_stmt = gimple_build_assign_with_ops (code,
4360                                               new_scalar_dest, first_res, res);
4361                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4362                   gimple_assign_set_lhs (new_stmt, new_res);
4363                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4364                   scalar_results[j % group_size] = new_res;
4365                 }
4366             }
4367           else
4368             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4369             scalar_results.safe_push (new_temp);
4370
4371           extract_scalar_result = false;
4372         }
4373     }
4374
4375   /* 2.4  Extract the final scalar result.  Create:
4376           s_out3 = extract_field <v_out2, bitpos>  */
4377
4378   if (extract_scalar_result)
4379     {
4380       tree rhs;
4381
4382       if (dump_enabled_p ())
4383         dump_printf_loc (MSG_NOTE, vect_location,
4384                          "extract scalar result\n");
4385
4386       if (BYTES_BIG_ENDIAN)
4387         bitpos = size_binop (MULT_EXPR,
4388                              bitsize_int (TYPE_VECTOR_SUBPARTS (vectype) - 1),
4389                              TYPE_SIZE (scalar_type));
4390       else
4391         bitpos = bitsize_zero_node;
4392
4393       rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp, bitsize, bitpos);
4394       epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4395       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4396       gimple_assign_set_lhs (epilog_stmt, new_temp);
4397       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4398       scalar_results.safe_push (new_temp);
4399     }
4400
4401 vect_finalize_reduction:
4402
4403   if (double_reduc)
4404     loop = loop->inner;
4405
4406   /* 2.5 Adjust the final result by the initial value of the reduction
4407          variable. (When such adjustment is not needed, then
4408          'adjustment_def' is zero).  For example, if code is PLUS we create:
4409          new_temp = loop_exit_def + adjustment_def  */
4410
4411   if (adjustment_def)
4412     {
4413       gcc_assert (!slp_reduc);
4414       if (nested_in_vect_loop)
4415         {
4416           new_phi = new_phis[0];
4417           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4418           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4419           new_dest = vect_create_destination_var (scalar_dest, vectype);
4420         }
4421       else
4422         {
4423           new_temp = scalar_results[0];
4424           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4425           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4426           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4427         }
4428
4429       epilog_stmt = gimple_build_assign (new_dest, expr);
4430       new_temp = make_ssa_name (new_dest, epilog_stmt);
4431       gimple_assign_set_lhs (epilog_stmt, new_temp);
4432       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4433       if (nested_in_vect_loop)
4434         {
4435           set_vinfo_for_stmt (epilog_stmt,
4436                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4437                                                  NULL));
4438           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4439                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4440
4441           if (!double_reduc)
4442             scalar_results.quick_push (new_temp);
4443           else
4444             scalar_results[0] = new_temp;
4445         }
4446       else
4447         scalar_results[0] = new_temp;
4448
4449       new_phis[0] = epilog_stmt;
4450     }
4451
4452   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4453           phis with new adjusted scalar results, i.e., replace use <s_out0>
4454           with use <s_out4>.
4455
4456      Transform:
4457         loop_exit:
4458           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4459           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4460           v_out2 = reduce <v_out1>
4461           s_out3 = extract_field <v_out2, 0>
4462           s_out4 = adjust_result <s_out3>
4463           use <s_out0>
4464           use <s_out0>
4465
4466      into:
4467
4468         loop_exit:
4469           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4470           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4471           v_out2 = reduce <v_out1>
4472           s_out3 = extract_field <v_out2, 0>
4473           s_out4 = adjust_result <s_out3>
4474           use <s_out4>
4475           use <s_out4> */
4476
4477
4478   /* In SLP reduction chain we reduce vector results into one vector if
4479      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4480      the last stmt in the reduction chain, since we are looking for the loop
4481      exit phi node.  */
4482   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4483     {
4484       scalar_dest = gimple_assign_lhs (
4485                         SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4486       group_size = 1;
4487     }
4488
4489   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4490      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4491      need to match SCALAR_RESULTS with corresponding statements.  The first
4492      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4493      the first vector stmt, etc.
4494      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4495   if (group_size > new_phis.length ())
4496     {
4497       ratio = group_size / new_phis.length ();
4498       gcc_assert (!(group_size % new_phis.length ()));
4499     }
4500   else
4501     ratio = 1;
4502
4503   for (k = 0; k < group_size; k++)
4504     {
4505       if (k % ratio == 0)
4506         {
4507           epilog_stmt = new_phis[k / ratio];
4508           reduction_phi = reduction_phis[k / ratio];
4509           if (double_reduc)
4510             inner_phi = inner_phis[k / ratio];
4511         }
4512
4513       if (slp_reduc)
4514         {
4515           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4516
4517           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4518           /* SLP statements can't participate in patterns.  */
4519           gcc_assert (!orig_stmt);
4520           scalar_dest = gimple_assign_lhs (current_stmt);
4521         }
4522
4523       phis.create (3);
4524       /* Find the loop-closed-use at the loop exit of the original scalar
4525          result.  (The reduction result is expected to have two immediate uses -
4526          one at the latch block, and one at the loop exit).  */
4527       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4528         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4529             && !is_gimple_debug (USE_STMT (use_p)))
4530           phis.safe_push (USE_STMT (use_p));
4531
4532       /* While we expect to have found an exit_phi because of loop-closed-ssa
4533          form we can end up without one if the scalar cycle is dead.  */
4534
4535       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4536         {
4537           if (outer_loop)
4538             {
4539               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4540               gimple vect_phi;
4541
4542               /* FORNOW. Currently not supporting the case that an inner-loop
4543                  reduction is not used in the outer-loop (but only outside the
4544                  outer-loop), unless it is double reduction.  */
4545               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4546                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4547                           || double_reduc);
4548
4549               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4550               if (!double_reduc
4551                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4552                       != vect_double_reduction_def)
4553                 continue;
4554
4555               /* Handle double reduction:
4556
4557                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4558                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4559                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4560                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4561
4562                  At that point the regular reduction (stmt2 and stmt3) is
4563                  already vectorized, as well as the exit phi node, stmt4.
4564                  Here we vectorize the phi node of double reduction, stmt1, and
4565                  update all relevant statements.  */
4566
4567               /* Go through all the uses of s2 to find double reduction phi
4568                  node, i.e., stmt1 above.  */
4569               orig_name = PHI_RESULT (exit_phi);
4570               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4571                 {
4572                   stmt_vec_info use_stmt_vinfo;
4573                   stmt_vec_info new_phi_vinfo;
4574                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4575                   basic_block bb = gimple_bb (use_stmt);
4576                   gimple use;
4577
4578                   /* Check that USE_STMT is really double reduction phi
4579                      node.  */
4580                   if (gimple_code (use_stmt) != GIMPLE_PHI
4581                       || gimple_phi_num_args (use_stmt) != 2
4582                       || bb->loop_father != outer_loop)
4583                     continue;
4584                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4585                   if (!use_stmt_vinfo
4586                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4587                           != vect_double_reduction_def)
4588                     continue;
4589
4590                   /* Create vector phi node for double reduction:
4591                      vs1 = phi <vs0, vs2>
4592                      vs1 was created previously in this function by a call to
4593                        vect_get_vec_def_for_operand and is stored in
4594                        vec_initial_def;
4595                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4596                      vs0 is created here.  */
4597
4598                   /* Create vector phi node.  */
4599                   vect_phi = create_phi_node (vec_initial_def, bb);
4600                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4601                                     loop_vec_info_for_loop (outer_loop), NULL);
4602                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4603
4604                   /* Create vs0 - initial def of the double reduction phi.  */
4605                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4606                                              loop_preheader_edge (outer_loop));
4607                   init_def = get_initial_def_for_reduction (stmt,
4608                                                           preheader_arg, NULL);
4609                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4610                                                     vectype, NULL);
4611
4612                   /* Update phi node arguments with vs0 and vs2.  */
4613                   add_phi_arg (vect_phi, vect_phi_init,
4614                                loop_preheader_edge (outer_loop),
4615                                UNKNOWN_LOCATION);
4616                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4617                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4618                   if (dump_enabled_p ())
4619                     {
4620                       dump_printf_loc (MSG_NOTE, vect_location,
4621                                        "created double reduction phi node: ");
4622                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4623                       dump_printf (MSG_NOTE, "\n");
4624                     }
4625
4626                   vect_phi_res = PHI_RESULT (vect_phi);
4627
4628                   /* Replace the use, i.e., set the correct vs1 in the regular
4629                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4630                      loop is redundant.  */
4631                   use = reduction_phi;
4632                   for (j = 0; j < ncopies; j++)
4633                     {
4634                       edge pr_edge = loop_preheader_edge (loop);
4635                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4636                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4637                     }
4638                 }
4639             }
4640         }
4641
4642       phis.release ();
4643       if (nested_in_vect_loop)
4644         {
4645           if (double_reduc)
4646             loop = outer_loop;
4647           else
4648             continue;
4649         }
4650
4651       phis.create (3);
4652       /* Find the loop-closed-use at the loop exit of the original scalar
4653          result.  (The reduction result is expected to have two immediate uses,
4654          one at the latch block, and one at the loop exit).  For double
4655          reductions we are looking for exit phis of the outer loop.  */
4656       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4657         {
4658           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4659             {
4660               if (!is_gimple_debug (USE_STMT (use_p)))
4661                 phis.safe_push (USE_STMT (use_p));
4662             }
4663           else
4664             {
4665               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4666                 {
4667                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4668
4669                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4670                     {
4671                       if (!flow_bb_inside_loop_p (loop,
4672                                              gimple_bb (USE_STMT (phi_use_p)))
4673                           && !is_gimple_debug (USE_STMT (phi_use_p)))
4674                         phis.safe_push (USE_STMT (phi_use_p));
4675                     }
4676                 }
4677             }
4678         }
4679
4680       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4681         {
4682           /* Replace the uses:  */
4683           orig_name = PHI_RESULT (exit_phi);
4684           scalar_result = scalar_results[k];
4685           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4686             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4687               SET_USE (use_p, scalar_result);
4688         }
4689
4690       phis.release ();
4691     }
4692 }
4693
4694
4695 /* Function vectorizable_reduction.
4696
4697    Check if STMT performs a reduction operation that can be vectorized.
4698    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4699    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4700    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4701
4702    This function also handles reduction idioms (patterns) that have been
4703    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4704    of this form:
4705      X = pattern_expr (arg0, arg1, ..., X)
4706    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4707    sequence that had been detected and replaced by the pattern-stmt (STMT).
4708
4709    In some cases of reduction patterns, the type of the reduction variable X is
4710    different than the type of the other arguments of STMT.
4711    In such cases, the vectype that is used when transforming STMT into a vector
4712    stmt is different than the vectype that is used to determine the
4713    vectorization factor, because it consists of a different number of elements
4714    than the actual number of elements that are being operated upon in parallel.
4715
4716    For example, consider an accumulation of shorts into an int accumulator.
4717    On some targets it's possible to vectorize this pattern operating on 8
4718    shorts at a time (hence, the vectype for purposes of determining the
4719    vectorization factor should be V8HI); on the other hand, the vectype that
4720    is used to create the vector form is actually V4SI (the type of the result).
4721
4722    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4723    indicates what is the actual level of parallelism (V8HI in the example), so
4724    that the right vectorization factor would be derived.  This vectype
4725    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4726    be used to create the vectorized stmt.  The right vectype for the vectorized
4727    stmt is obtained from the type of the result X:
4728         get_vectype_for_scalar_type (TREE_TYPE (X))
4729
4730    This means that, contrary to "regular" reductions (or "regular" stmts in
4731    general), the following equation:
4732       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4733    does *NOT* necessarily hold for reduction patterns.  */
4734
4735 bool
4736 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4737                         gimple *vec_stmt, slp_tree slp_node)
4738 {
4739   tree vec_dest;
4740   tree scalar_dest;
4741   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4742   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4743   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4744   tree vectype_in = NULL_TREE;
4745   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4746   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4747   enum tree_code code, orig_code, epilog_reduc_code;
4748   machine_mode vec_mode;
4749   int op_type;
4750   optab optab, reduc_optab;
4751   tree new_temp = NULL_TREE;
4752   tree def;
4753   gimple def_stmt;
4754   enum vect_def_type dt;
4755   gimple new_phi = NULL;
4756   tree scalar_type;
4757   bool is_simple_use;
4758   gimple orig_stmt;
4759   stmt_vec_info orig_stmt_info;
4760   tree expr = NULL_TREE;
4761   int i;
4762   int ncopies;
4763   int epilog_copies;
4764   stmt_vec_info prev_stmt_info, prev_phi_info;
4765   bool single_defuse_cycle = false;
4766   tree reduc_def = NULL_TREE;
4767   gimple new_stmt = NULL;
4768   int j;
4769   tree ops[3];
4770   bool nested_cycle = false, found_nested_cycle_def = false;
4771   gimple reduc_def_stmt = NULL;
4772   /* The default is that the reduction variable is the last in statement.  */
4773   int reduc_index = 2;
4774   bool double_reduc = false, dummy;
4775   basic_block def_bb;
4776   struct loop * def_stmt_loop, *outer_loop = NULL;
4777   tree def_arg;
4778   gimple def_arg_stmt;
4779   auto_vec<tree> vec_oprnds0;
4780   auto_vec<tree> vec_oprnds1;
4781   auto_vec<tree> vect_defs;
4782   auto_vec<gimple> phis;
4783   int vec_num;
4784   tree def0, def1, tem, op0, op1 = NULL_TREE;
4785
4786   /* In case of reduction chain we switch to the first stmt in the chain, but
4787      we don't update STMT_INFO, since only the last stmt is marked as reduction
4788      and has reduction properties.  */
4789   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4790     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4791
4792   if (nested_in_vect_loop_p (loop, stmt))
4793     {
4794       outer_loop = loop;
4795       loop = loop->inner;
4796       nested_cycle = true;
4797     }
4798
4799   /* 1. Is vectorizable reduction?  */
4800   /* Not supportable if the reduction variable is used in the loop, unless
4801      it's a reduction chain.  */
4802   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4803       && !GROUP_FIRST_ELEMENT (stmt_info))
4804     return false;
4805
4806   /* Reductions that are not used even in an enclosing outer-loop,
4807      are expected to be "live" (used out of the loop).  */
4808   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4809       && !STMT_VINFO_LIVE_P (stmt_info))
4810     return false;
4811
4812   /* Make sure it was already recognized as a reduction computation.  */
4813   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4814       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4815     return false;
4816
4817   /* 2. Has this been recognized as a reduction pattern?
4818
4819      Check if STMT represents a pattern that has been recognized
4820      in earlier analysis stages.  For stmts that represent a pattern,
4821      the STMT_VINFO_RELATED_STMT field records the last stmt in
4822      the original sequence that constitutes the pattern.  */
4823
4824   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4825   if (orig_stmt)
4826     {
4827       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4828       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4829       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4830     }
4831
4832   /* 3. Check the operands of the operation.  The first operands are defined
4833         inside the loop body. The last operand is the reduction variable,
4834         which is defined by the loop-header-phi.  */
4835
4836   gcc_assert (is_gimple_assign (stmt));
4837
4838   /* Flatten RHS.  */
4839   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4840     {
4841     case GIMPLE_SINGLE_RHS:
4842       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4843       if (op_type == ternary_op)
4844         {
4845           tree rhs = gimple_assign_rhs1 (stmt);
4846           ops[0] = TREE_OPERAND (rhs, 0);
4847           ops[1] = TREE_OPERAND (rhs, 1);
4848           ops[2] = TREE_OPERAND (rhs, 2);
4849           code = TREE_CODE (rhs);
4850         }
4851       else
4852         return false;
4853       break;
4854
4855     case GIMPLE_BINARY_RHS:
4856       code = gimple_assign_rhs_code (stmt);
4857       op_type = TREE_CODE_LENGTH (code);
4858       gcc_assert (op_type == binary_op);
4859       ops[0] = gimple_assign_rhs1 (stmt);
4860       ops[1] = gimple_assign_rhs2 (stmt);
4861       break;
4862
4863     case GIMPLE_TERNARY_RHS:
4864       code = gimple_assign_rhs_code (stmt);
4865       op_type = TREE_CODE_LENGTH (code);
4866       gcc_assert (op_type == ternary_op);
4867       ops[0] = gimple_assign_rhs1 (stmt);
4868       ops[1] = gimple_assign_rhs2 (stmt);
4869       ops[2] = gimple_assign_rhs3 (stmt);
4870       break;
4871
4872     case GIMPLE_UNARY_RHS:
4873       return false;
4874
4875     default:
4876       gcc_unreachable ();
4877     }
4878
4879   if (code == COND_EXPR && slp_node)
4880     return false;
4881
4882   scalar_dest = gimple_assign_lhs (stmt);
4883   scalar_type = TREE_TYPE (scalar_dest);
4884   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4885       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4886     return false;
4887
4888   /* Do not try to vectorize bit-precision reductions.  */
4889   if ((TYPE_PRECISION (scalar_type)
4890        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4891     return false;
4892
4893   /* All uses but the last are expected to be defined in the loop.
4894      The last use is the reduction variable.  In case of nested cycle this
4895      assumption is not true: we use reduc_index to record the index of the
4896      reduction variable.  */
4897   for (i = 0; i < op_type - 1; i++)
4898     {
4899       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4900       if (i == 0 && code == COND_EXPR)
4901         continue;
4902
4903       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4904                                             &def_stmt, &def, &dt, &tem);
4905       if (!vectype_in)
4906         vectype_in = tem;
4907       gcc_assert (is_simple_use);
4908
4909       if (dt != vect_internal_def
4910           && dt != vect_external_def
4911           && dt != vect_constant_def
4912           && dt != vect_induction_def
4913           && !(dt == vect_nested_cycle && nested_cycle))
4914         return false;
4915
4916       if (dt == vect_nested_cycle)
4917         {
4918           found_nested_cycle_def = true;
4919           reduc_def_stmt = def_stmt;
4920           reduc_index = i;
4921         }
4922     }
4923
4924   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4925                                         &def_stmt, &def, &dt, &tem);
4926   if (!vectype_in)
4927     vectype_in = tem;
4928   gcc_assert (is_simple_use);
4929   if (!(dt == vect_reduction_def
4930         || dt == vect_nested_cycle
4931         || ((dt == vect_internal_def || dt == vect_external_def
4932              || dt == vect_constant_def || dt == vect_induction_def)
4933             && nested_cycle && found_nested_cycle_def)))
4934     {
4935       /* For pattern recognized stmts, orig_stmt might be a reduction,
4936          but some helper statements for the pattern might not, or
4937          might be COND_EXPRs with reduction uses in the condition.  */
4938       gcc_assert (orig_stmt);
4939       return false;
4940     }
4941   if (!found_nested_cycle_def)
4942     reduc_def_stmt = def_stmt;
4943
4944   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4945   if (orig_stmt)
4946     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4947                                                        reduc_def_stmt,
4948                                                        !nested_cycle,
4949                                                        &dummy));
4950   else
4951     {
4952       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4953                                              !nested_cycle, &dummy);
4954       /* We changed STMT to be the first stmt in reduction chain, hence we
4955          check that in this case the first element in the chain is STMT.  */
4956       gcc_assert (stmt == tmp
4957                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4958     }
4959
4960   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4961     return false;
4962
4963   if (slp_node || PURE_SLP_STMT (stmt_info))
4964     ncopies = 1;
4965   else
4966     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4967                / TYPE_VECTOR_SUBPARTS (vectype_in));
4968
4969   gcc_assert (ncopies >= 1);
4970
4971   vec_mode = TYPE_MODE (vectype_in);
4972
4973   if (code == COND_EXPR)
4974     {
4975       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
4976         {
4977           if (dump_enabled_p ())
4978             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4979                              "unsupported condition in reduction\n");
4980
4981             return false;
4982         }
4983     }
4984   else
4985     {
4986       /* 4. Supportable by target?  */
4987
4988       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
4989           || code == LROTATE_EXPR || code == RROTATE_EXPR)
4990         {
4991           /* Shifts and rotates are only supported by vectorizable_shifts,
4992              not vectorizable_reduction.  */
4993           if (dump_enabled_p ())
4994             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
4995                              "unsupported shift or rotation.\n");
4996           return false;
4997         }
4998
4999       /* 4.1. check support for the operation in the loop  */
5000       optab = optab_for_tree_code (code, vectype_in, optab_default);
5001       if (!optab)
5002         {
5003           if (dump_enabled_p ())
5004             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5005                              "no optab.\n");
5006
5007           return false;
5008         }
5009
5010       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5011         {
5012           if (dump_enabled_p ())
5013             dump_printf (MSG_NOTE, "op not supported by target.\n");
5014
5015           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5016               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5017                   < vect_min_worthwhile_factor (code))
5018             return false;
5019
5020           if (dump_enabled_p ())
5021             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5022         }
5023
5024       /* Worthwhile without SIMD support?  */
5025       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5026           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5027              < vect_min_worthwhile_factor (code))
5028         {
5029           if (dump_enabled_p ())
5030             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5031                              "not worthwhile without SIMD support.\n");
5032
5033           return false;
5034         }
5035     }
5036
5037   /* 4.2. Check support for the epilog operation.
5038
5039           If STMT represents a reduction pattern, then the type of the
5040           reduction variable may be different than the type of the rest
5041           of the arguments.  For example, consider the case of accumulation
5042           of shorts into an int accumulator; The original code:
5043                         S1: int_a = (int) short_a;
5044           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5045
5046           was replaced with:
5047                         STMT: int_acc = widen_sum <short_a, int_acc>
5048
5049           This means that:
5050           1. The tree-code that is used to create the vector operation in the
5051              epilog code (that reduces the partial results) is not the
5052              tree-code of STMT, but is rather the tree-code of the original
5053              stmt from the pattern that STMT is replacing.  I.e, in the example
5054              above we want to use 'widen_sum' in the loop, but 'plus' in the
5055              epilog.
5056           2. The type (mode) we use to check available target support
5057              for the vector operation to be created in the *epilog*, is
5058              determined by the type of the reduction variable (in the example
5059              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5060              However the type (mode) we use to check available target support
5061              for the vector operation to be created *inside the loop*, is
5062              determined by the type of the other arguments to STMT (in the
5063              example we'd check this: optab_handler (widen_sum_optab,
5064              vect_short_mode)).
5065
5066           This is contrary to "regular" reductions, in which the types of all
5067           the arguments are the same as the type of the reduction variable.
5068           For "regular" reductions we can therefore use the same vector type
5069           (and also the same tree-code) when generating the epilog code and
5070           when generating the code inside the loop.  */
5071
5072   if (orig_stmt)
5073     {
5074       /* This is a reduction pattern: get the vectype from the type of the
5075          reduction variable, and get the tree-code from orig_stmt.  */
5076       orig_code = gimple_assign_rhs_code (orig_stmt);
5077       gcc_assert (vectype_out);
5078       vec_mode = TYPE_MODE (vectype_out);
5079     }
5080   else
5081     {
5082       /* Regular reduction: use the same vectype and tree-code as used for
5083          the vector code inside the loop can be used for the epilog code. */
5084       orig_code = code;
5085     }
5086
5087   if (nested_cycle)
5088     {
5089       def_bb = gimple_bb (reduc_def_stmt);
5090       def_stmt_loop = def_bb->loop_father;
5091       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5092                                        loop_preheader_edge (def_stmt_loop));
5093       if (TREE_CODE (def_arg) == SSA_NAME
5094           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5095           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5096           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5097           && vinfo_for_stmt (def_arg_stmt)
5098           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5099               == vect_double_reduction_def)
5100         double_reduc = true;
5101     }
5102
5103   epilog_reduc_code = ERROR_MARK;
5104   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5105     {
5106       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5107                                          optab_default);
5108       if (!reduc_optab)
5109         {
5110           if (dump_enabled_p ())
5111             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5112                              "no optab for reduction.\n");
5113
5114           epilog_reduc_code = ERROR_MARK;
5115         }
5116       else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5117         {
5118           optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5119           if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5120             {
5121               if (dump_enabled_p ())
5122                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5123                                  "reduc op not supported by target.\n");
5124
5125               epilog_reduc_code = ERROR_MARK;
5126             }
5127         }
5128     }
5129   else
5130     {
5131       if (!nested_cycle || double_reduc)
5132         {
5133           if (dump_enabled_p ())
5134             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5135                              "no reduc code for scalar code.\n");
5136
5137           return false;
5138         }
5139     }
5140
5141   if (double_reduc && ncopies > 1)
5142     {
5143       if (dump_enabled_p ())
5144         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5145                          "multiple types in double reduction\n");
5146
5147       return false;
5148     }
5149
5150   /* In case of widenning multiplication by a constant, we update the type
5151      of the constant to be the type of the other operand.  We check that the
5152      constant fits the type in the pattern recognition pass.  */
5153   if (code == DOT_PROD_EXPR
5154       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5155     {
5156       if (TREE_CODE (ops[0]) == INTEGER_CST)
5157         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5158       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5159         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5160       else
5161         {
5162           if (dump_enabled_p ())
5163             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5164                              "invalid types in dot-prod\n");
5165
5166           return false;
5167         }
5168     }
5169
5170   if (!vec_stmt) /* transformation not required.  */
5171     {
5172       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5173         return false;
5174       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5175       return true;
5176     }
5177
5178   /** Transform.  **/
5179
5180   if (dump_enabled_p ())
5181     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5182
5183   /* FORNOW: Multiple types are not supported for condition.  */
5184   if (code == COND_EXPR)
5185     gcc_assert (ncopies == 1);
5186
5187   /* Create the destination vector  */
5188   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5189
5190   /* In case the vectorization factor (VF) is bigger than the number
5191      of elements that we can fit in a vectype (nunits), we have to generate
5192      more than one vector stmt - i.e - we need to "unroll" the
5193      vector stmt by a factor VF/nunits.  For more details see documentation
5194      in vectorizable_operation.  */
5195
5196   /* If the reduction is used in an outer loop we need to generate
5197      VF intermediate results, like so (e.g. for ncopies=2):
5198         r0 = phi (init, r0)
5199         r1 = phi (init, r1)
5200         r0 = x0 + r0;
5201         r1 = x1 + r1;
5202     (i.e. we generate VF results in 2 registers).
5203     In this case we have a separate def-use cycle for each copy, and therefore
5204     for each copy we get the vector def for the reduction variable from the
5205     respective phi node created for this copy.
5206
5207     Otherwise (the reduction is unused in the loop nest), we can combine
5208     together intermediate results, like so (e.g. for ncopies=2):
5209         r = phi (init, r)
5210         r = x0 + r;
5211         r = x1 + r;
5212    (i.e. we generate VF/2 results in a single register).
5213    In this case for each copy we get the vector def for the reduction variable
5214    from the vectorized reduction operation generated in the previous iteration.
5215   */
5216
5217   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5218     {
5219       single_defuse_cycle = true;
5220       epilog_copies = 1;
5221     }
5222   else
5223     epilog_copies = ncopies;
5224
5225   prev_stmt_info = NULL;
5226   prev_phi_info = NULL;
5227   if (slp_node)
5228     {
5229       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5230       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5231                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5232     }
5233   else
5234     {
5235       vec_num = 1;
5236       vec_oprnds0.create (1);
5237       if (op_type == ternary_op)
5238         vec_oprnds1.create (1);
5239     }
5240
5241   phis.create (vec_num);
5242   vect_defs.create (vec_num);
5243   if (!slp_node)
5244     vect_defs.quick_push (NULL_TREE);
5245
5246   for (j = 0; j < ncopies; j++)
5247     {
5248       if (j == 0 || !single_defuse_cycle)
5249         {
5250           for (i = 0; i < vec_num; i++)
5251             {
5252               /* Create the reduction-phi that defines the reduction
5253                  operand.  */
5254               new_phi = create_phi_node (vec_dest, loop->header);
5255               set_vinfo_for_stmt (new_phi,
5256                                   new_stmt_vec_info (new_phi, loop_vinfo,
5257                                                      NULL));
5258                if (j == 0 || slp_node)
5259                  phis.quick_push (new_phi);
5260             }
5261         }
5262
5263       if (code == COND_EXPR)
5264         {
5265           gcc_assert (!slp_node);
5266           vectorizable_condition (stmt, gsi, vec_stmt,
5267                                   PHI_RESULT (phis[0]),
5268                                   reduc_index, NULL);
5269           /* Multiple types are not supported for condition.  */
5270           break;
5271         }
5272
5273       /* Handle uses.  */
5274       if (j == 0)
5275         {
5276           op0 = ops[!reduc_index];
5277           if (op_type == ternary_op)
5278             {
5279               if (reduc_index == 0)
5280                 op1 = ops[2];
5281               else
5282                 op1 = ops[1];
5283             }
5284
5285           if (slp_node)
5286             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5287                                slp_node, -1);
5288           else
5289             {
5290               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5291                                                             stmt, NULL);
5292               vec_oprnds0.quick_push (loop_vec_def0);
5293               if (op_type == ternary_op)
5294                {
5295                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5296                                                                NULL);
5297                  vec_oprnds1.quick_push (loop_vec_def1);
5298                }
5299             }
5300         }
5301       else
5302         {
5303           if (!slp_node)
5304             {
5305               enum vect_def_type dt;
5306               gimple dummy_stmt;
5307               tree dummy;
5308
5309               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5310                                   &dummy_stmt, &dummy, &dt);
5311               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5312                                                               loop_vec_def0);
5313               vec_oprnds0[0] = loop_vec_def0;
5314               if (op_type == ternary_op)
5315                 {
5316                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5317                                       &dummy, &dt);
5318                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5319                                                                 loop_vec_def1);
5320                   vec_oprnds1[0] = loop_vec_def1;
5321                 }
5322             }
5323
5324           if (single_defuse_cycle)
5325             reduc_def = gimple_assign_lhs (new_stmt);
5326
5327           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5328         }
5329
5330       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5331         {
5332           if (slp_node)
5333             reduc_def = PHI_RESULT (phis[i]);
5334           else
5335             {
5336               if (!single_defuse_cycle || j == 0)
5337                 reduc_def = PHI_RESULT (new_phi);
5338             }
5339
5340           def1 = ((op_type == ternary_op)
5341                   ? vec_oprnds1[i] : NULL);
5342           if (op_type == binary_op)
5343             {
5344               if (reduc_index == 0)
5345                 expr = build2 (code, vectype_out, reduc_def, def0);
5346               else
5347                 expr = build2 (code, vectype_out, def0, reduc_def);
5348             }
5349           else
5350             {
5351               if (reduc_index == 0)
5352                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5353               else
5354                 {
5355                   if (reduc_index == 1)
5356                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5357                   else
5358                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5359                 }
5360             }
5361
5362           new_stmt = gimple_build_assign (vec_dest, expr);
5363           new_temp = make_ssa_name (vec_dest, new_stmt);
5364           gimple_assign_set_lhs (new_stmt, new_temp);
5365           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5366
5367           if (slp_node)
5368             {
5369               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5370               vect_defs.quick_push (new_temp);
5371             }
5372           else
5373             vect_defs[0] = new_temp;
5374         }
5375
5376       if (slp_node)
5377         continue;
5378
5379       if (j == 0)
5380         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5381       else
5382         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5383
5384       prev_stmt_info = vinfo_for_stmt (new_stmt);
5385       prev_phi_info = vinfo_for_stmt (new_phi);
5386     }
5387
5388   /* Finalize the reduction-phi (set its arguments) and create the
5389      epilog reduction code.  */
5390   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5391     {
5392       new_temp = gimple_assign_lhs (*vec_stmt);
5393       vect_defs[0] = new_temp;
5394     }
5395
5396   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5397                                     epilog_reduc_code, phis, reduc_index,
5398                                     double_reduc, slp_node);
5399
5400   return true;
5401 }
5402
5403 /* Function vect_min_worthwhile_factor.
5404
5405    For a loop where we could vectorize the operation indicated by CODE,
5406    return the minimum vectorization factor that makes it worthwhile
5407    to use generic vectors.  */
5408 int
5409 vect_min_worthwhile_factor (enum tree_code code)
5410 {
5411   switch (code)
5412     {
5413     case PLUS_EXPR:
5414     case MINUS_EXPR:
5415     case NEGATE_EXPR:
5416       return 4;
5417
5418     case BIT_AND_EXPR:
5419     case BIT_IOR_EXPR:
5420     case BIT_XOR_EXPR:
5421     case BIT_NOT_EXPR:
5422       return 2;
5423
5424     default:
5425       return INT_MAX;
5426     }
5427 }
5428
5429
5430 /* Function vectorizable_induction
5431
5432    Check if PHI performs an induction computation that can be vectorized.
5433    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5434    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5435    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5436
5437 bool
5438 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5439                         gimple *vec_stmt)
5440 {
5441   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5442   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5443   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5444   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5445   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5446   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5447   tree vec_def;
5448
5449   gcc_assert (ncopies >= 1);
5450   /* FORNOW. These restrictions should be relaxed.  */
5451   if (nested_in_vect_loop_p (loop, phi))
5452     {
5453       imm_use_iterator imm_iter;
5454       use_operand_p use_p;
5455       gimple exit_phi;
5456       edge latch_e;
5457       tree loop_arg;
5458
5459       if (ncopies > 1)
5460         {
5461           if (dump_enabled_p ())
5462             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5463                              "multiple types in nested loop.\n");
5464           return false;
5465         }
5466
5467       exit_phi = NULL;
5468       latch_e = loop_latch_edge (loop->inner);
5469       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5470       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5471         {
5472           gimple use_stmt = USE_STMT (use_p);
5473           if (is_gimple_debug (use_stmt))
5474             continue;
5475
5476           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5477             {
5478               exit_phi = use_stmt;
5479               break;
5480             }
5481         }
5482       if (exit_phi)
5483         {
5484           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5485           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5486                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5487             {
5488               if (dump_enabled_p ())
5489                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5490                                  "inner-loop induction only used outside "
5491                                  "of the outer vectorized loop.\n");
5492               return false;
5493             }
5494         }
5495     }
5496
5497   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5498     return false;
5499
5500   /* FORNOW: SLP not supported.  */
5501   if (STMT_SLP_TYPE (stmt_info))
5502     return false;
5503
5504   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5505
5506   if (gimple_code (phi) != GIMPLE_PHI)
5507     return false;
5508
5509   if (!vec_stmt) /* transformation not required.  */
5510     {
5511       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5512       if (dump_enabled_p ())
5513         dump_printf_loc (MSG_NOTE, vect_location,
5514                          "=== vectorizable_induction ===\n");
5515       vect_model_induction_cost (stmt_info, ncopies);
5516       return true;
5517     }
5518
5519   /** Transform.  **/
5520
5521   if (dump_enabled_p ())
5522     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5523
5524   vec_def = get_initial_def_for_induction (phi);
5525   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5526   return true;
5527 }
5528
5529 /* Function vectorizable_live_operation.
5530
5531    STMT computes a value that is used outside the loop.  Check if
5532    it can be supported.  */
5533
5534 bool
5535 vectorizable_live_operation (gimple stmt,
5536                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5537                              gimple *vec_stmt)
5538 {
5539   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5540   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5541   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5542   int i;
5543   int op_type;
5544   tree op;
5545   tree def;
5546   gimple def_stmt;
5547   enum vect_def_type dt;
5548   enum tree_code code;
5549   enum gimple_rhs_class rhs_class;
5550
5551   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5552
5553   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5554     return false;
5555
5556   if (!is_gimple_assign (stmt))
5557     {
5558       if (gimple_call_internal_p (stmt)
5559           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5560           && gimple_call_lhs (stmt)
5561           && loop->simduid
5562           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5563           && loop->simduid
5564              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5565         {
5566           edge e = single_exit (loop);
5567           basic_block merge_bb = e->dest;
5568           imm_use_iterator imm_iter;
5569           use_operand_p use_p;
5570           tree lhs = gimple_call_lhs (stmt);
5571
5572           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5573             {
5574               gimple use_stmt = USE_STMT (use_p);
5575               if (gimple_code (use_stmt) == GIMPLE_PHI
5576                   && gimple_bb (use_stmt) == merge_bb)
5577                 {
5578                   if (vec_stmt)
5579                     {
5580                       tree vfm1
5581                         = build_int_cst (unsigned_type_node,
5582                                          loop_vinfo->vectorization_factor - 1);
5583                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5584                     }
5585                   return true;
5586                 }
5587             }
5588         }
5589
5590       return false;
5591     }
5592
5593   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5594     return false;
5595
5596   /* FORNOW. CHECKME. */
5597   if (nested_in_vect_loop_p (loop, stmt))
5598     return false;
5599
5600   code = gimple_assign_rhs_code (stmt);
5601   op_type = TREE_CODE_LENGTH (code);
5602   rhs_class = get_gimple_rhs_class (code);
5603   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5604   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5605
5606   /* FORNOW: support only if all uses are invariant.  This means
5607      that the scalar operations can remain in place, unvectorized.
5608      The original last scalar value that they compute will be used.  */
5609
5610   for (i = 0; i < op_type; i++)
5611     {
5612       if (rhs_class == GIMPLE_SINGLE_RHS)
5613         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5614       else
5615         op = gimple_op (stmt, i + 1);
5616       if (op
5617           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5618                                   &dt))
5619         {
5620           if (dump_enabled_p ())
5621             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5622                              "use not simple.\n");
5623           return false;
5624         }
5625
5626       if (dt != vect_external_def && dt != vect_constant_def)
5627         return false;
5628     }
5629
5630   /* No transformation is required for the cases we currently support.  */
5631   return true;
5632 }
5633
5634 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5635
5636 static void
5637 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5638 {
5639   ssa_op_iter op_iter;
5640   imm_use_iterator imm_iter;
5641   def_operand_p def_p;
5642   gimple ustmt;
5643
5644   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5645     {
5646       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5647         {
5648           basic_block bb;
5649
5650           if (!is_gimple_debug (ustmt))
5651             continue;
5652
5653           bb = gimple_bb (ustmt);
5654
5655           if (!flow_bb_inside_loop_p (loop, bb))
5656             {
5657               if (gimple_debug_bind_p (ustmt))
5658                 {
5659                   if (dump_enabled_p ())
5660                     dump_printf_loc (MSG_NOTE, vect_location,
5661                                      "killing debug use\n");
5662
5663                   gimple_debug_bind_reset_value (ustmt);
5664                   update_stmt (ustmt);
5665                 }
5666               else
5667                 gcc_unreachable ();
5668             }
5669         }
5670     }
5671 }
5672
5673
5674 /* This function builds ni_name = number of iterations.  Statements
5675    are emitted on the loop preheader edge.  */
5676
5677 static tree
5678 vect_build_loop_niters (loop_vec_info loop_vinfo)
5679 {
5680   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5681   if (TREE_CODE (ni) == INTEGER_CST)
5682     return ni;
5683   else
5684     {
5685       tree ni_name, var;
5686       gimple_seq stmts = NULL;
5687       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5688
5689       var = create_tmp_var (TREE_TYPE (ni), "niters");
5690       ni_name = force_gimple_operand (ni, &stmts, false, var);
5691       if (stmts)
5692         gsi_insert_seq_on_edge_immediate (pe, stmts);
5693
5694       return ni_name;
5695     }
5696 }
5697
5698
5699 /* This function generates the following statements:
5700
5701    ni_name = number of iterations loop executes
5702    ratio = ni_name / vf
5703    ratio_mult_vf_name = ratio * vf
5704
5705    and places them on the loop preheader edge.  */
5706
5707 static void
5708 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5709                                  tree ni_name,
5710                                  tree *ratio_mult_vf_name_ptr,
5711                                  tree *ratio_name_ptr)
5712 {
5713   tree ni_minus_gap_name;
5714   tree var;
5715   tree ratio_name;
5716   tree ratio_mult_vf_name;
5717   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5718   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5719   tree log_vf;
5720
5721   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5722
5723   /* If epilogue loop is required because of data accesses with gaps, we
5724      subtract one iteration from the total number of iterations here for
5725      correct calculation of RATIO.  */
5726   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5727     {
5728       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5729                                        ni_name,
5730                                        build_one_cst (TREE_TYPE (ni_name)));
5731       if (!is_gimple_val (ni_minus_gap_name))
5732         {
5733           var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5734           gimple stmts = NULL;
5735           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5736                                                     true, var);
5737           gsi_insert_seq_on_edge_immediate (pe, stmts);
5738         }
5739     }
5740   else
5741     ni_minus_gap_name = ni_name;
5742
5743   /* Create: ratio = ni >> log2(vf) */
5744   /* ???  As we have ni == number of latch executions + 1, ni could
5745      have overflown to zero.  So avoid computing ratio based on ni
5746      but compute it using the fact that we know ratio will be at least
5747      one, thus via (ni - vf) >> log2(vf) + 1.  */
5748   ratio_name
5749     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5750                    fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5751                                 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5752                                              ni_minus_gap_name,
5753                                              build_int_cst
5754                                                (TREE_TYPE (ni_name), vf)),
5755                                 log_vf),
5756                    build_int_cst (TREE_TYPE (ni_name), 1));
5757   if (!is_gimple_val (ratio_name))
5758     {
5759       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5760       gimple stmts = NULL;
5761       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5762       gsi_insert_seq_on_edge_immediate (pe, stmts);
5763     }
5764   *ratio_name_ptr = ratio_name;
5765
5766   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
5767
5768   if (ratio_mult_vf_name_ptr)
5769     {
5770       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5771                                         ratio_name, log_vf);
5772       if (!is_gimple_val (ratio_mult_vf_name))
5773         {
5774           var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5775           gimple stmts = NULL;
5776           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5777                                                      true, var);
5778           gsi_insert_seq_on_edge_immediate (pe, stmts);
5779         }
5780       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5781     }
5782
5783   return;
5784 }
5785
5786
5787 /* Function vect_transform_loop.
5788
5789    The analysis phase has determined that the loop is vectorizable.
5790    Vectorize the loop - created vectorized stmts to replace the scalar
5791    stmts in the loop, and update the loop exit condition.  */
5792
5793 void
5794 vect_transform_loop (loop_vec_info loop_vinfo)
5795 {
5796   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5797   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5798   int nbbs = loop->num_nodes;
5799   gimple_stmt_iterator si;
5800   int i;
5801   tree ratio = NULL;
5802   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5803   bool grouped_store;
5804   bool slp_scheduled = false;
5805   gimple stmt, pattern_stmt;
5806   gimple_seq pattern_def_seq = NULL;
5807   gimple_stmt_iterator pattern_def_si = gsi_none ();
5808   bool transform_pattern_stmt = false;
5809   bool check_profitability = false;
5810   int th;
5811   /* Record number of iterations before we started tampering with the profile. */
5812   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5813
5814   if (dump_enabled_p ())
5815     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5816
5817   /* If profile is inprecise, we have chance to fix it up.  */
5818   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5819     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5820
5821   /* Use the more conservative vectorization threshold.  If the number
5822      of iterations is constant assume the cost check has been performed
5823      by our caller.  If the threshold makes all loops profitable that
5824      run at least the vectorization factor number of times checking
5825      is pointless, too.  */
5826   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5827   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5828       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5829     {
5830       if (dump_enabled_p ())
5831         dump_printf_loc (MSG_NOTE, vect_location,
5832                          "Profitability threshold is %d loop iterations.\n",
5833                          th);
5834       check_profitability = true;
5835     }
5836
5837   /* Version the loop first, if required, so the profitability check
5838      comes first.  */
5839
5840   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5841       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5842     {
5843       vect_loop_versioning (loop_vinfo, th, check_profitability);
5844       check_profitability = false;
5845     }
5846
5847   tree ni_name = vect_build_loop_niters (loop_vinfo);
5848   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5849
5850   /* Peel the loop if there are data refs with unknown alignment.
5851      Only one data ref with unknown store is allowed.  */
5852
5853   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5854     {
5855       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5856                                      th, check_profitability);
5857       check_profitability = false;
5858       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5859          be re-computed.  */
5860       ni_name = NULL_TREE;
5861     }
5862
5863   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5864      compile time constant), or it is a constant that doesn't divide by the
5865      vectorization factor, then an epilog loop needs to be created.
5866      We therefore duplicate the loop: the original loop will be vectorized,
5867      and will compute the first (n/VF) iterations.  The second copy of the loop
5868      will remain scalar and will compute the remaining (n%VF) iterations.
5869      (VF is the vectorization factor).  */
5870
5871   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5872       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5873     {
5874       tree ratio_mult_vf;
5875       if (!ni_name)
5876         ni_name = vect_build_loop_niters (loop_vinfo);
5877       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5878                                        &ratio);
5879       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5880                                       th, check_profitability);
5881     }
5882   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5883     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5884                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5885   else
5886     {
5887       if (!ni_name)
5888         ni_name = vect_build_loop_niters (loop_vinfo);
5889       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5890     }
5891
5892   /* 1) Make sure the loop header has exactly two entries
5893      2) Make sure we have a preheader basic block.  */
5894
5895   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5896
5897   split_edge (loop_preheader_edge (loop));
5898
5899   /* FORNOW: the vectorizer supports only loops which body consist
5900      of one basic block (header + empty latch). When the vectorizer will
5901      support more involved loop forms, the order by which the BBs are
5902      traversed need to be reconsidered.  */
5903
5904   for (i = 0; i < nbbs; i++)
5905     {
5906       basic_block bb = bbs[i];
5907       stmt_vec_info stmt_info;
5908       gimple phi;
5909
5910       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
5911         {
5912           phi = gsi_stmt (si);
5913           if (dump_enabled_p ())
5914             {
5915               dump_printf_loc (MSG_NOTE, vect_location,
5916                                "------>vectorizing phi: ");
5917               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5918               dump_printf (MSG_NOTE, "\n");
5919             }
5920           stmt_info = vinfo_for_stmt (phi);
5921           if (!stmt_info)
5922             continue;
5923
5924           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5925             vect_loop_kill_debug_uses (loop, phi);
5926
5927           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5928               && !STMT_VINFO_LIVE_P (stmt_info))
5929             continue;
5930
5931           if (STMT_VINFO_VECTYPE (stmt_info)
5932               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5933                   != (unsigned HOST_WIDE_INT) vectorization_factor)
5934               && dump_enabled_p ())
5935             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5936
5937           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5938             {
5939               if (dump_enabled_p ())
5940                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5941               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5942             }
5943         }
5944
5945       pattern_stmt = NULL;
5946       for (si = gsi_start_bb (bb); !gsi_end_p (si) || transform_pattern_stmt;)
5947         {
5948           bool is_store;
5949
5950           if (transform_pattern_stmt)
5951             stmt = pattern_stmt;
5952           else
5953             {
5954               stmt = gsi_stmt (si);
5955               /* During vectorization remove existing clobber stmts.  */
5956               if (gimple_clobber_p (stmt))
5957                 {
5958                   unlink_stmt_vdef (stmt);
5959                   gsi_remove (&si, true);
5960                   release_defs (stmt);
5961                   continue;
5962                 }
5963             }
5964
5965           if (dump_enabled_p ())
5966             {
5967               dump_printf_loc (MSG_NOTE, vect_location,
5968                                "------>vectorizing statement: ");
5969               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5970               dump_printf (MSG_NOTE, "\n");
5971             }
5972
5973           stmt_info = vinfo_for_stmt (stmt);
5974
5975           /* vector stmts created in the outer-loop during vectorization of
5976              stmts in an inner-loop may not have a stmt_info, and do not
5977              need to be vectorized.  */
5978           if (!stmt_info)
5979             {
5980               gsi_next (&si);
5981               continue;
5982             }
5983
5984           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5985             vect_loop_kill_debug_uses (loop, stmt);
5986
5987           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5988               && !STMT_VINFO_LIVE_P (stmt_info))
5989             {
5990               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
5991                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
5992                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
5993                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
5994                 {
5995                   stmt = pattern_stmt;
5996                   stmt_info = vinfo_for_stmt (stmt);
5997                 }
5998               else
5999                 {
6000                   gsi_next (&si);
6001                   continue;
6002                 }
6003             }
6004           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6005                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6006                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6007                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6008             transform_pattern_stmt = true;
6009
6010           /* If pattern statement has def stmts, vectorize them too.  */
6011           if (is_pattern_stmt_p (stmt_info))
6012             {
6013               if (pattern_def_seq == NULL)
6014                 {
6015                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6016                   pattern_def_si = gsi_start (pattern_def_seq);
6017                 }
6018               else if (!gsi_end_p (pattern_def_si))
6019                 gsi_next (&pattern_def_si);
6020               if (pattern_def_seq != NULL)
6021                 {
6022                   gimple pattern_def_stmt = NULL;
6023                   stmt_vec_info pattern_def_stmt_info = NULL;
6024
6025                   while (!gsi_end_p (pattern_def_si))
6026                     {
6027                       pattern_def_stmt = gsi_stmt (pattern_def_si);
6028                       pattern_def_stmt_info
6029                         = vinfo_for_stmt (pattern_def_stmt);
6030                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6031                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6032                         break;
6033                       gsi_next (&pattern_def_si);
6034                     }
6035
6036                   if (!gsi_end_p (pattern_def_si))
6037                     {
6038                       if (dump_enabled_p ())
6039                         {
6040                           dump_printf_loc (MSG_NOTE, vect_location,
6041                                            "==> vectorizing pattern def "
6042                                            "stmt: ");
6043                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6044                                             pattern_def_stmt, 0);
6045                           dump_printf (MSG_NOTE, "\n");
6046                         }
6047
6048                       stmt = pattern_def_stmt;
6049                       stmt_info = pattern_def_stmt_info;
6050                     }
6051                   else
6052                     {
6053                       pattern_def_si = gsi_none ();
6054                       transform_pattern_stmt = false;
6055                     }
6056                 }
6057               else
6058                 transform_pattern_stmt = false;
6059             }
6060
6061           if (STMT_VINFO_VECTYPE (stmt_info))
6062             {
6063               unsigned int nunits
6064                 = (unsigned int)
6065                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6066               if (!STMT_SLP_TYPE (stmt_info)
6067                   && nunits != (unsigned int) vectorization_factor
6068                   && dump_enabled_p ())
6069                   /* For SLP VF is set according to unrolling factor, and not
6070                      to vector size, hence for SLP this print is not valid.  */
6071                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6072             }
6073
6074           /* SLP. Schedule all the SLP instances when the first SLP stmt is
6075              reached.  */
6076           if (STMT_SLP_TYPE (stmt_info))
6077             {
6078               if (!slp_scheduled)
6079                 {
6080                   slp_scheduled = true;
6081
6082                   if (dump_enabled_p ())
6083                     dump_printf_loc (MSG_NOTE, vect_location,
6084                                      "=== scheduling SLP instances ===\n");
6085
6086                   vect_schedule_slp (loop_vinfo, NULL);
6087                 }
6088
6089               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6090               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6091                 {
6092                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6093                     {
6094                       pattern_def_seq = NULL;
6095                       gsi_next (&si);
6096                     }
6097                   continue;
6098                 }
6099             }
6100
6101           /* -------- vectorize statement ------------ */
6102           if (dump_enabled_p ())
6103             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6104
6105           grouped_store = false;
6106           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6107           if (is_store)
6108             {
6109               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6110                 {
6111                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6112                      interleaving chain was completed - free all the stores in
6113                      the chain.  */
6114                   gsi_next (&si);
6115                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6116                 }
6117               else
6118                 {
6119                   /* Free the attached stmt_vec_info and remove the stmt.  */
6120                   gimple store = gsi_stmt (si);
6121                   free_stmt_vec_info (store);
6122                   unlink_stmt_vdef (store);
6123                   gsi_remove (&si, true);
6124                   release_defs (store);
6125                 }
6126
6127               /* Stores can only appear at the end of pattern statements.  */
6128               gcc_assert (!transform_pattern_stmt);
6129               pattern_def_seq = NULL;
6130             }
6131           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6132             {
6133               pattern_def_seq = NULL;
6134               gsi_next (&si);
6135             }
6136         }                       /* stmts in BB */
6137     }                           /* BBs in loop */
6138
6139   slpeel_make_loop_iterate_ntimes (loop, ratio);
6140
6141   /* Reduce loop iterations by the vectorization factor.  */
6142   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6143                       expected_iterations / vectorization_factor);
6144   loop->nb_iterations_upper_bound
6145     = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6146   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6147       && loop->nb_iterations_upper_bound != 0)
6148     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6149   if (loop->any_estimate)
6150     {
6151       loop->nb_iterations_estimate
6152         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6153        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6154            && loop->nb_iterations_estimate != 0)
6155          loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6156     }
6157
6158   if (dump_enabled_p ())
6159     {
6160       dump_printf_loc (MSG_NOTE, vect_location,
6161                        "LOOP VECTORIZED\n");
6162       if (loop->inner)
6163         dump_printf_loc (MSG_NOTE, vect_location,
6164                          "OUTER LOOP VECTORIZED\n");
6165       dump_printf (MSG_NOTE, "\n");
6166     }
6167 }