gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2014 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "tree.h"
  28 #include "stor-layout.h"
  29 #include "predict.h"
  30 #include "vec.h"
  31 #include "hashtab.h"
  32 #include "hash-set.h"
  33 #include "machmode.h"
  34 #include "hard-reg-set.h"
  35 #include "input.h"
  36 #include "function.h"
  37 #include "dominance.h"
  38 #include "cfg.h"
  39 #include "cfganal.h"
  40 #include "basic-block.h"
  41 #include "gimple-pretty-print.h"
  42 #include "tree-ssa-alias.h"
  43 #include "internal-fn.h"
  44 #include "gimple-expr.h"
  45 #include "is-a.h"
  46 #include "gimple.h"
  47 #include "gimplify.h"
  48 #include "gimple-iterator.h"
  49 #include "gimplify-me.h"
  50 #include "gimple-ssa.h"
  51 #include "tree-phinodes.h"
  52 #include "ssa-iterators.h"
  53 #include "stringpool.h"
  54 #include "tree-ssanames.h"
  55 #include "tree-ssa-loop-ivopts.h"
  56 #include "tree-ssa-loop-manip.h"
  57 #include "tree-ssa-loop-niter.h"
  58 #include "tree-pass.h"
  59 #include "cfgloop.h"
  60 #include "expr.h"
  61 #include "recog.h"
  62 #include "insn-codes.h"
  63 #include "optabs.h"
  64 #include "params.h"
  65 #include "diagnostic-core.h"
  66 #include "tree-chrec.h"
  67 #include "tree-scalar-evolution.h"
  68 #include "tree-vectorizer.h"
  69 #include "target.h"
  70
  71 /* Loop Vectorization Pass.
  72
  73    This pass tries to vectorize loops.
  74
  75    For example, the vectorizer transforms the following simple loop:
  76
  77         short a[N]; short b[N]; short c[N]; int i;
  78
  79         for (i=0; i<N; i++){
  80           a[i] = b[i] + c[i];
  81         }
  82
  83    as if it was manually vectorized by rewriting the source code into:
  84
  85         typedef int __attribute__((mode(V8HI))) v8hi;
  86         short a[N];  short b[N]; short c[N];   int i;
  87         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  88         v8hi va, vb, vc;
  89
  90         for (i=0; i<N/8; i++){
  91           vb = pb[i];
  92           vc = pc[i];
  93           va = vb + vc;
  94           pa[i] = va;
  95         }
  96
  97         The main entry to this pass is vectorize_loops(), in which
  98    the vectorizer applies a set of analyses on a given set of loops,
  99    followed by the actual vectorization transformation for the loops that
 100    had successfully passed the analysis phase.
 101         Throughout this pass we make a distinction between two types of
 102    data: scalars (which are represented by SSA_NAMES), and memory references
 103    ("data-refs").  These two types of data require different handling both
 104    during analysis and transformation. The types of data-refs that the
 105    vectorizer currently supports are ARRAY_REFS which base is an array DECL
 106    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
 107    accesses are required to have a simple (consecutive) access pattern.
 108
 109    Analysis phase:
 110    ===============
 111         The driver for the analysis phase is vect_analyze_loop().
 112    It applies a set of analyses, some of which rely on the scalar evolution
 113    analyzer (scev) developed by Sebastian Pop.
 114
 115         During the analysis phase the vectorizer records some information
 116    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 117    loop, as well as general information about the loop as a whole, which is
 118    recorded in a "loop_vec_info" struct attached to each loop.
 119
 120    Transformation phase:
 121    =====================
 122         The loop transformation phase scans all the stmts in the loop, and
 123    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 124    the loop that needs to be vectorized.  It inserts the vector code sequence
 125    just before the scalar stmt S, and records a pointer to the vector code
 126    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 127    attached to S).  This pointer will be used for the vectorization of following
 128    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 129    otherwise, we rely on dead code elimination for removing it.
 130
 131         For example, say stmt S1 was vectorized into stmt VS1:
 132
 133    VS1: vb = px[i];
 134    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 135    S2:  a = b;
 136
 137    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 138    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 139    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 140    resulting sequence would be:
 141
 142    VS1: vb = px[i];
 143    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 144    VS2: va = vb;
 145    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 146
 147         Operands that are not SSA_NAMEs, are data-refs that appear in
 148    load/store operations (like 'x[i]' in S1), and are handled differently.
 149
 150    Target modeling:
 151    =================
 152         Currently the only target specific information that is used is the
 153    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 154    Targets that can support different sizes of vectors, for now will need
 155    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 156    flexibility will be added in the future.
 157
 158         Since we only vectorize operations which vector form can be
 159    expressed using existing tree codes, to verify that an operation is
 160    supported, the vectorizer checks the relevant optab at the relevant
 161    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 162    the value found is CODE_FOR_nothing, then there's no target support, and
 163    we can't vectorize the stmt.
 164
 165    For additional information on this project see:
 166    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 167 */
 168
 169 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 170
 171 /* Function vect_determine_vectorization_factor
 172
 173    Determine the vectorization factor (VF).  VF is the number of data elements
 174    that are operated upon in parallel in a single iteration of the vectorized
 175    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 176    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 177    elements can fit in a single vector register.
 178
 179    We currently support vectorization of loops in which all types operated upon
 180    are of the same size.  Therefore this function currently sets VF according to
 181    the size of the types operated upon, and fails if there are multiple sizes
 182    in the loop.
 183
 184    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 185    original loop:
 186         for (i=0; i<N; i++){
 187           a[i] = b[i] + c[i];
 188         }
 189
 190    vectorized loop:
 191         for (i=0; i<N; i+=VF){
 192           a[i:VF] = b[i:VF] + c[i:VF];
 193         }
 194 */
 195
 196 static bool
 197 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 198 {
 199   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 200   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 201   int nbbs = loop->num_nodes;
 202   unsigned int vectorization_factor = 0;
 203   tree scalar_type;
 204   gphi *phi;
 205   tree vectype;
 206   unsigned int nunits;
 207   stmt_vec_info stmt_info;
 208   int i;
 209   HOST_WIDE_INT dummy;
 210   gimple stmt, pattern_stmt = NULL;
 211   gimple_seq pattern_def_seq = NULL;
 212   gimple_stmt_iterator pattern_def_si = gsi_none ();
 213   bool analyze_pattern_stmt = false;
 214
 215   if (dump_enabled_p ())
 216     dump_printf_loc (MSG_NOTE, vect_location,
 217                      "=== vect_determine_vectorization_factor ===\n");
 218
 219   for (i = 0; i < nbbs; i++)
 220     {
 221       basic_block bb = bbs[i];
 222
 223       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 224            gsi_next (&si))
 225         {
 226           phi = si.phi ();
 227           stmt_info = vinfo_for_stmt (phi);
 228           if (dump_enabled_p ())
 229             {
 230               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 231               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 232               dump_printf (MSG_NOTE, "\n");
 233             }
 234
 235           gcc_assert (stmt_info);
 236
 237           if (STMT_VINFO_RELEVANT_P (stmt_info))
 238             {
 239               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 240               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 241
 242               if (dump_enabled_p ())
 243                 {
 244                   dump_printf_loc (MSG_NOTE, vect_location,
 245                                    "get vectype for scalar type:  ");
 246                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 247                   dump_printf (MSG_NOTE, "\n");
 248                 }
 249
 250               vectype = get_vectype_for_scalar_type (scalar_type);
 251               if (!vectype)
 252                 {
 253                   if (dump_enabled_p ())
 254                     {
 255                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 256                                        "not vectorized: unsupported "
 257                                        "data-type ");
 258                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 259                                          scalar_type);
 260                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 261                     }
 262                   return false;
 263                 }
 264               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 265
 266               if (dump_enabled_p ())
 267                 {
 268                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 269                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 270                   dump_printf (MSG_NOTE, "\n");
 271                 }
 272
 273               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 274               if (dump_enabled_p ())
 275                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 276                                  nunits);
 277
 278               if (!vectorization_factor
 279                   || (nunits > vectorization_factor))
 280                 vectorization_factor = nunits;
 281             }
 282         }
 283
 284       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 285            !gsi_end_p (si) || analyze_pattern_stmt;)
 286         {
 287           tree vf_vectype;
 288
 289           if (analyze_pattern_stmt)
 290             stmt = pattern_stmt;
 291           else
 292             stmt = gsi_stmt (si);
 293
 294           stmt_info = vinfo_for_stmt (stmt);
 295
 296           if (dump_enabled_p ())
 297             {
 298               dump_printf_loc (MSG_NOTE, vect_location,
 299                                "==> examining statement: ");
 300               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 301               dump_printf (MSG_NOTE, "\n");
 302             }
 303
 304           gcc_assert (stmt_info);
 305
 306           /* Skip stmts which do not need to be vectorized.  */
 307           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 308                && !STMT_VINFO_LIVE_P (stmt_info))
 309               || gimple_clobber_p (stmt))
 310             {
 311               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 312                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 313                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 314                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 315                 {
 316                   stmt = pattern_stmt;
 317                   stmt_info = vinfo_for_stmt (pattern_stmt);
 318                   if (dump_enabled_p ())
 319                     {
 320                       dump_printf_loc (MSG_NOTE, vect_location,
 321                                        "==> examining pattern statement: ");
 322                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 323                       dump_printf (MSG_NOTE, "\n");
 324                     }
 325                 }
 326               else
 327                 {
 328                   if (dump_enabled_p ())
 329                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 330                   gsi_next (&si);
 331                   continue;
 332                 }
 333             }
 334           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 335                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 336                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 337                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 338             analyze_pattern_stmt = true;
 339
 340           /* If a pattern statement has def stmts, analyze them too.  */
 341           if (is_pattern_stmt_p (stmt_info))
 342             {
 343               if (pattern_def_seq == NULL)
 344                 {
 345                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 346                   pattern_def_si = gsi_start (pattern_def_seq);
 347                 }
 348               else if (!gsi_end_p (pattern_def_si))
 349                 gsi_next (&pattern_def_si);
 350               if (pattern_def_seq != NULL)
 351                 {
 352                   gimple pattern_def_stmt = NULL;
 353                   stmt_vec_info pattern_def_stmt_info = NULL;
 354
 355                   while (!gsi_end_p (pattern_def_si))
 356                     {
 357                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 358                       pattern_def_stmt_info
 359                         = vinfo_for_stmt (pattern_def_stmt);
 360                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 361                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 362                         break;
 363                       gsi_next (&pattern_def_si);
 364                     }
 365
 366                   if (!gsi_end_p (pattern_def_si))
 367                     {
 368                       if (dump_enabled_p ())
 369                         {
 370                           dump_printf_loc (MSG_NOTE, vect_location,
 371                                            "==> examining pattern def stmt: ");
 372                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 373                                             pattern_def_stmt, 0);
 374                           dump_printf (MSG_NOTE, "\n");
 375                         }
 376
 377                       stmt = pattern_def_stmt;
 378                       stmt_info = pattern_def_stmt_info;
 379                     }
 380                   else
 381                     {
 382                       pattern_def_si = gsi_none ();
 383                       analyze_pattern_stmt = false;
 384                     }
 385                 }
 386               else
 387                 analyze_pattern_stmt = false;
 388             }
 389
 390           if (gimple_get_lhs (stmt) == NULL_TREE
 391               /* MASK_STORE has no lhs, but is ok.  */
 392               && (!is_gimple_call (stmt)
 393                   || !gimple_call_internal_p (stmt)
 394                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 395             {
 396               if (is_gimple_call (stmt))
 397                 {
 398                   /* Ignore calls with no lhs.  These must be calls to
 399                      #pragma omp simd functions, and what vectorization factor
 400                      it really needs can't be determined until
 401                      vectorizable_simd_clone_call.  */
 402                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 403                     {
 404                       pattern_def_seq = NULL;
 405                       gsi_next (&si);
 406                     }
 407                   continue;
 408                 }
 409               if (dump_enabled_p ())
 410                 {
 411                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 412                                    "not vectorized: irregular stmt.");
 413                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 414                                     0);
 415                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 416                 }
 417               return false;
 418             }
 419
 420           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 421             {
 422               if (dump_enabled_p ())
 423                 {
 424                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 425                                    "not vectorized: vector stmt in loop:");
 426                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 427                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 428                 }
 429               return false;
 430             }
 431
 432           if (STMT_VINFO_VECTYPE (stmt_info))
 433             {
 434               /* The only case when a vectype had been already set is for stmts
 435                  that contain a dataref, or for "pattern-stmts" (stmts
 436                  generated by the vectorizer to represent/replace a certain
 437                  idiom).  */
 438               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 439                           || is_pattern_stmt_p (stmt_info)
 440                           || !gsi_end_p (pattern_def_si));
 441               vectype = STMT_VINFO_VECTYPE (stmt_info);
 442             }
 443           else
 444             {
 445               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 446               if (is_gimple_call (stmt)
 447                   && gimple_call_internal_p (stmt)
 448                   && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
 449                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 450               else
 451                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 452               if (dump_enabled_p ())
 453                 {
 454                   dump_printf_loc (MSG_NOTE, vect_location,
 455                                    "get vectype for scalar type:  ");
 456                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 457                   dump_printf (MSG_NOTE, "\n");
 458                 }
 459               vectype = get_vectype_for_scalar_type (scalar_type);
 460               if (!vectype)
 461                 {
 462                   if (dump_enabled_p ())
 463                     {
 464                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 465                                        "not vectorized: unsupported "
 466                                        "data-type ");
 467                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 468                                          scalar_type);
 469                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 470                     }
 471                   return false;
 472                 }
 473
 474               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 475
 476               if (dump_enabled_p ())
 477                 {
 478                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 479                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 480                   dump_printf (MSG_NOTE, "\n");
 481                 }
 482             }
 483
 484           /* The vectorization factor is according to the smallest
 485              scalar type (or the largest vector size, but we only
 486              support one vector size per loop).  */
 487           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 488                                                        &dummy);
 489           if (dump_enabled_p ())
 490             {
 491               dump_printf_loc (MSG_NOTE, vect_location,
 492                                "get vectype for scalar type:  ");
 493               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 494               dump_printf (MSG_NOTE, "\n");
 495             }
 496           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 497           if (!vf_vectype)
 498             {
 499               if (dump_enabled_p ())
 500                 {
 501                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 502                                    "not vectorized: unsupported data-type ");
 503                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 504                                      scalar_type);
 505                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 506                 }
 507               return false;
 508             }
 509
 510           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 511                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 512             {
 513               if (dump_enabled_p ())
 514                 {
 515                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 516                                    "not vectorized: different sized vector "
 517                                    "types in statement, ");
 518                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 519                                      vectype);
 520                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 521                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 522                                      vf_vectype);
 523                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 524                 }
 525               return false;
 526             }
 527
 528           if (dump_enabled_p ())
 529             {
 530               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 531               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 532               dump_printf (MSG_NOTE, "\n");
 533             }
 534
 535           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 536           if (dump_enabled_p ())
 537             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 538           if (!vectorization_factor
 539               || (nunits > vectorization_factor))
 540             vectorization_factor = nunits;
 541
 542           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 543             {
 544               pattern_def_seq = NULL;
 545               gsi_next (&si);
 546             }
 547         }
 548     }
 549
 550   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 551   if (dump_enabled_p ())
 552     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 553                      vectorization_factor);
 554   if (vectorization_factor <= 1)
 555     {
 556       if (dump_enabled_p ())
 557         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 558                          "not vectorized: unsupported data-type\n");
 559       return false;
 560     }
 561   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 562
 563   return true;
 564 }
 565
 566
 567 /* Function vect_is_simple_iv_evolution.
 568
 569    FORNOW: A simple evolution of an induction variables in the loop is
 570    considered a polynomial evolution.  */
 571
 572 static bool
 573 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 574                              tree * step)
 575 {
 576   tree init_expr;
 577   tree step_expr;
 578   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 579   basic_block bb;
 580
 581   /* When there is no evolution in this loop, the evolution function
 582      is not "simple".  */
 583   if (evolution_part == NULL_TREE)
 584     return false;
 585
 586   /* When the evolution is a polynomial of degree >= 2
 587      the evolution function is not "simple".  */
 588   if (tree_is_chrec (evolution_part))
 589     return false;
 590
 591   step_expr = evolution_part;
 592   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 593
 594   if (dump_enabled_p ())
 595     {
 596       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 597       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 598       dump_printf (MSG_NOTE, ",  init: ");
 599       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 600       dump_printf (MSG_NOTE, "\n");
 601     }
 602
 603   *init = init_expr;
 604   *step = step_expr;
 605
 606   if (TREE_CODE (step_expr) != INTEGER_CST
 607       && (TREE_CODE (step_expr) != SSA_NAME
 608           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 609               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 610           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 611               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 612                   || !flag_associative_math)))
 613       && (TREE_CODE (step_expr) != REAL_CST
 614           || !flag_associative_math))
 615     {
 616       if (dump_enabled_p ())
 617         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 618                          "step unknown.\n");
 619       return false;
 620     }
 621
 622   return true;
 623 }
 624
 625 /* Function vect_analyze_scalar_cycles_1.
 626
 627    Examine the cross iteration def-use cycles of scalar variables
 628    in LOOP.  LOOP_VINFO represents the loop that is now being
 629    considered for vectorization (can be LOOP, or an outer-loop
 630    enclosing LOOP).  */
 631
 632 static void
 633 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 634 {
 635   basic_block bb = loop->header;
 636   tree init, step;
 637   auto_vec<gimple, 64> worklist;
 638   gphi_iterator gsi;
 639   bool double_reduc;
 640
 641   if (dump_enabled_p ())
 642     dump_printf_loc (MSG_NOTE, vect_location,
 643                      "=== vect_analyze_scalar_cycles ===\n");
 644
 645   /* First - identify all inductions.  Reduction detection assumes that all the
 646      inductions have been identified, therefore, this order must not be
 647      changed.  */
 648   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 649     {
 650       gphi *phi = gsi.phi ();
 651       tree access_fn = NULL;
 652       tree def = PHI_RESULT (phi);
 653       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 654
 655       if (dump_enabled_p ())
 656         {
 657           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 658           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 659           dump_printf (MSG_NOTE, "\n");
 660         }
 661
 662       /* Skip virtual phi's.  The data dependences that are associated with
 663          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 664       if (virtual_operand_p (def))
 665         continue;
 666
 667       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 668
 669       /* Analyze the evolution function.  */
 670       access_fn = analyze_scalar_evolution (loop, def);
 671       if (access_fn)
 672         {
 673           STRIP_NOPS (access_fn);
 674           if (dump_enabled_p ())
 675             {
 676               dump_printf_loc (MSG_NOTE, vect_location,
 677                                "Access function of PHI: ");
 678               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 679               dump_printf (MSG_NOTE, "\n");
 680             }
 681           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 682             = evolution_part_in_loop_num (access_fn, loop->num);
 683         }
 684
 685       if (!access_fn
 686           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 687           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 688               && TREE_CODE (step) != INTEGER_CST))
 689         {
 690           worklist.safe_push (phi);
 691           continue;
 692         }
 693
 694       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 695
 696       if (dump_enabled_p ())
 697         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 698       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 699     }
 700
 701
 702   /* Second - identify all reductions and nested cycles.  */
 703   while (worklist.length () > 0)
 704     {
 705       gimple phi = worklist.pop ();
 706       tree def = PHI_RESULT (phi);
 707       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 708       gimple reduc_stmt;
 709       bool nested_cycle;
 710
 711       if (dump_enabled_p ())
 712         {
 713           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 714           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 715           dump_printf (MSG_NOTE, "\n");
 716         }
 717
 718       gcc_assert (!virtual_operand_p (def)
 719                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 720
 721       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 722       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 723                                                 &double_reduc);
 724       if (reduc_stmt)
 725         {
 726           if (double_reduc)
 727             {
 728               if (dump_enabled_p ())
 729                 dump_printf_loc (MSG_NOTE, vect_location,
 730                                  "Detected double reduction.\n");
 731
 732               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 733               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 734                                                     vect_double_reduction_def;
 735             }
 736           else
 737             {
 738               if (nested_cycle)
 739                 {
 740                   if (dump_enabled_p ())
 741                     dump_printf_loc (MSG_NOTE, vect_location,
 742                                      "Detected vectorizable nested cycle.\n");
 743
 744                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 745                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 746                                                              vect_nested_cycle;
 747                 }
 748               else
 749                 {
 750                   if (dump_enabled_p ())
 751                     dump_printf_loc (MSG_NOTE, vect_location,
 752                                      "Detected reduction.\n");
 753
 754                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 755                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 756                                                            vect_reduction_def;
 757                   /* Store the reduction cycles for possible vectorization in
 758                      loop-aware SLP.  */
 759                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 760                 }
 761             }
 762         }
 763       else
 764         if (dump_enabled_p ())
 765           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 766                            "Unknown def-use cycle pattern.\n");
 767     }
 768 }
 769
 770
 771 /* Function vect_analyze_scalar_cycles.
 772
 773    Examine the cross iteration def-use cycles of scalar variables, by
 774    analyzing the loop-header PHIs of scalar variables.  Classify each
 775    cycle as one of the following: invariant, induction, reduction, unknown.
 776    We do that for the loop represented by LOOP_VINFO, and also to its
 777    inner-loop, if exists.
 778    Examples for scalar cycles:
 779
 780    Example1: reduction:
 781
 782               loop1:
 783               for (i=0; i<N; i++)
 784                  sum += a[i];
 785
 786    Example2: induction:
 787
 788               loop2:
 789               for (i=0; i<N; i++)
 790                  a[i] = i;  */
 791
 792 static void
 793 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 794 {
 795   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 796
 797   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 798
 799   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 800      Reductions in such inner-loop therefore have different properties than
 801      the reductions in the nest that gets vectorized:
 802      1. When vectorized, they are executed in the same order as in the original
 803         scalar loop, so we can't change the order of computation when
 804         vectorizing them.
 805      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 806         current checks are too strict.  */
 807
 808   if (loop->inner)
 809     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 810 }
 811
 812
 813 /* Function vect_get_loop_niters.
 814
 815    Determine how many iterations the loop is executed and place it
 816    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 817    in NUMBER_OF_ITERATIONSM1.
 818
 819    Return the loop exit condition.  */
 820
 821
 822 static gcond *
 823 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
 824                       tree *number_of_iterationsm1)
 825 {
 826   tree niters;
 827
 828   if (dump_enabled_p ())
 829     dump_printf_loc (MSG_NOTE, vect_location,
 830                      "=== get_loop_niters ===\n");
 831
 832   niters = number_of_latch_executions (loop);
 833   *number_of_iterationsm1 = niters;
 834
 835   /* We want the number of loop header executions which is the number
 836      of latch executions plus one.
 837      ???  For UINT_MAX latch executions this number overflows to zero
 838      for loops like do { n++; } while (n != 0);  */
 839   if (niters && !chrec_contains_undetermined (niters))
 840     niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
 841                           build_int_cst (TREE_TYPE (niters), 1));
 842   *number_of_iterations = niters;
 843
 844   return get_loop_exit_condition (loop);
 845 }
 846
 847
 848 /* Function bb_in_loop_p
 849
 850    Used as predicate for dfs order traversal of the loop bbs.  */
 851
 852 static bool
 853 bb_in_loop_p (const_basic_block bb, const void *data)
 854 {
 855   const struct loop *const loop = (const struct loop *)data;
 856   if (flow_bb_inside_loop_p (loop, bb))
 857     return true;
 858   return false;
 859 }
 860
 861
 862 /* Function new_loop_vec_info.
 863
 864    Create and initialize a new loop_vec_info struct for LOOP, as well as
 865    stmt_vec_info structs for all the stmts in LOOP.  */
 866
 867 static loop_vec_info
 868 new_loop_vec_info (struct loop *loop)
 869 {
 870   loop_vec_info res;
 871   basic_block *bbs;
 872   gimple_stmt_iterator si;
 873   unsigned int i, nbbs;
 874
 875   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 876   LOOP_VINFO_LOOP (res) = loop;
 877
 878   bbs = get_loop_body (loop);
 879
 880   /* Create/Update stmt_info for all stmts in the loop.  */
 881   for (i = 0; i < loop->num_nodes; i++)
 882     {
 883       basic_block bb = bbs[i];
 884
 885       /* BBs in a nested inner-loop will have been already processed (because
 886          we will have called vect_analyze_loop_form for any nested inner-loop).
 887          Therefore, for stmts in an inner-loop we just want to update the
 888          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 889          loop_info of the outer-loop we are currently considering to vectorize
 890          (instead of the loop_info of the inner-loop).
 891          For stmts in other BBs we need to create a stmt_info from scratch.  */
 892       if (bb->loop_father != loop)
 893         {
 894           /* Inner-loop bb.  */
 895           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 896           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 897             {
 898               gimple phi = gsi_stmt (si);
 899               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 900               loop_vec_info inner_loop_vinfo =
 901                 STMT_VINFO_LOOP_VINFO (stmt_info);
 902               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 903               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 904             }
 905           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 906            {
 907               gimple stmt = gsi_stmt (si);
 908               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 909               loop_vec_info inner_loop_vinfo =
 910                  STMT_VINFO_LOOP_VINFO (stmt_info);
 911               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 912               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 913            }
 914         }
 915       else
 916         {
 917           /* bb in current nest.  */
 918           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 919             {
 920               gimple phi = gsi_stmt (si);
 921               gimple_set_uid (phi, 0);
 922               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 923             }
 924
 925           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 926             {
 927               gimple stmt = gsi_stmt (si);
 928               gimple_set_uid (stmt, 0);
 929               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 930             }
 931         }
 932     }
 933
 934   /* CHECKME: We want to visit all BBs before their successors (except for
 935      latch blocks, for which this assertion wouldn't hold).  In the simple
 936      case of the loop forms we allow, a dfs order of the BBs would the same
 937      as reversed postorder traversal, so we are safe.  */
 938
 939    free (bbs);
 940    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 941    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 942                               bbs, loop->num_nodes, loop);
 943    gcc_assert (nbbs == loop->num_nodes);
 944
 945   LOOP_VINFO_BBS (res) = bbs;
 946   LOOP_VINFO_NITERSM1 (res) = NULL;
 947   LOOP_VINFO_NITERS (res) = NULL;
 948   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 949   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 950   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
 951   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 952   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
 953   LOOP_VINFO_VECT_FACTOR (res) = 0;
 954   LOOP_VINFO_LOOP_NEST (res).create (3);
 955   LOOP_VINFO_DATAREFS (res).create (10);
 956   LOOP_VINFO_DDRS (res).create (10 * 10);
 957   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 958   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
 959              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 960   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
 961              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 962   LOOP_VINFO_GROUPED_STORES (res).create (10);
 963   LOOP_VINFO_REDUCTIONS (res).create (10);
 964   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
 965   LOOP_VINFO_SLP_INSTANCES (res).create (10);
 966   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 967   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 968   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 969   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
 970   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 971
 972   return res;
 973 }
 974
 975
 976 /* Function destroy_loop_vec_info.
 977
 978    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 979    stmts in the loop.  */
 980
 981 void
 982 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 983 {
 984   struct loop *loop;
 985   basic_block *bbs;
 986   int nbbs;
 987   gimple_stmt_iterator si;
 988   int j;
 989   vec<slp_instance> slp_instances;
 990   slp_instance instance;
 991   bool swapped;
 992
 993   if (!loop_vinfo)
 994     return;
 995
 996   loop = LOOP_VINFO_LOOP (loop_vinfo);
 997
 998   bbs = LOOP_VINFO_BBS (loop_vinfo);
 999   nbbs = clean_stmts ? loop->num_nodes : 0;
1000   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1001
1002   for (j = 0; j < nbbs; j++)
1003     {
1004       basic_block bb = bbs[j];
1005       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1006         free_stmt_vec_info (gsi_stmt (si));
1007
1008       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1009         {
1010           gimple stmt = gsi_stmt (si);
1011
1012           /* We may have broken canonical form by moving a constant
1013              into RHS1 of a commutative op.  Fix such occurrences.  */
1014           if (swapped && is_gimple_assign (stmt))
1015             {
1016               enum tree_code code = gimple_assign_rhs_code (stmt);
1017
1018               if ((code == PLUS_EXPR
1019                    || code == POINTER_PLUS_EXPR
1020                    || code == MULT_EXPR)
1021                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1022                 swap_ssa_operands (stmt,
1023                                    gimple_assign_rhs1_ptr (stmt),
1024                                    gimple_assign_rhs2_ptr (stmt));
1025             }
1026
1027           /* Free stmt_vec_info.  */
1028           free_stmt_vec_info (stmt);
1029           gsi_next (&si);
1030         }
1031     }
1032
1033   free (LOOP_VINFO_BBS (loop_vinfo));
1034   vect_destroy_datarefs (loop_vinfo, NULL);
1035   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1036   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1037   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1038   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1039   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1040   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1041     vect_free_slp_instance (instance);
1042
1043   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1044   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1045   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1046   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1047
1048   delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1049   LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1050
1051   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1052
1053   free (loop_vinfo);
1054   loop->aux = NULL;
1055 }
1056
1057
1058 /* Function vect_analyze_loop_1.
1059
1060    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1061    for it. The different analyses will record information in the
1062    loop_vec_info struct.  This is a subset of the analyses applied in
1063    vect_analyze_loop, to be applied on an inner-loop nested in the loop
1064    that is now considered for (outer-loop) vectorization.  */
1065
1066 static loop_vec_info
1067 vect_analyze_loop_1 (struct loop *loop)
1068 {
1069   loop_vec_info loop_vinfo;
1070
1071   if (dump_enabled_p ())
1072     dump_printf_loc (MSG_NOTE, vect_location,
1073                      "===== analyze_loop_nest_1 =====\n");
1074
1075   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1076
1077   loop_vinfo = vect_analyze_loop_form (loop);
1078   if (!loop_vinfo)
1079     {
1080       if (dump_enabled_p ())
1081         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1082                          "bad inner-loop form.\n");
1083       return NULL;
1084     }
1085
1086   return loop_vinfo;
1087 }
1088
1089
1090 /* Function vect_analyze_loop_form.
1091
1092    Verify that certain CFG restrictions hold, including:
1093    - the loop has a pre-header
1094    - the loop has a single entry and exit
1095    - the loop exit condition is simple enough, and the number of iterations
1096      can be analyzed (a countable loop).  */
1097
1098 loop_vec_info
1099 vect_analyze_loop_form (struct loop *loop)
1100 {
1101   loop_vec_info loop_vinfo;
1102   gcond *loop_cond;
1103   tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1104   loop_vec_info inner_loop_vinfo = NULL;
1105
1106   if (dump_enabled_p ())
1107     dump_printf_loc (MSG_NOTE, vect_location,
1108                      "=== vect_analyze_loop_form ===\n");
1109
1110   /* Different restrictions apply when we are considering an inner-most loop,
1111      vs. an outer (nested) loop.
1112      (FORNOW. May want to relax some of these restrictions in the future).  */
1113
1114   if (!loop->inner)
1115     {
1116       /* Inner-most loop.  We currently require that the number of BBs is
1117          exactly 2 (the header and latch).  Vectorizable inner-most loops
1118          look like this:
1119
1120                         (pre-header)
1121                            |
1122                           header <--------+
1123                            | |            |
1124                            | +--> latch --+
1125                            |
1126                         (exit-bb)  */
1127
1128       if (loop->num_nodes != 2)
1129         {
1130           if (dump_enabled_p ())
1131             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1132                              "not vectorized: control flow in loop.\n");
1133           return NULL;
1134         }
1135
1136       if (empty_block_p (loop->header))
1137         {
1138           if (dump_enabled_p ())
1139             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1140                              "not vectorized: empty loop.\n");
1141           return NULL;
1142         }
1143     }
1144   else
1145     {
1146       struct loop *innerloop = loop->inner;
1147       edge entryedge;
1148
1149       /* Nested loop. We currently require that the loop is doubly-nested,
1150          contains a single inner loop, and the number of BBs is exactly 5.
1151          Vectorizable outer-loops look like this:
1152
1153                         (pre-header)
1154                            |
1155                           header <---+
1156                            |         |
1157                           inner-loop |
1158                            |         |
1159                           tail ------+
1160                            |
1161                         (exit-bb)
1162
1163          The inner-loop has the properties expected of inner-most loops
1164          as described above.  */
1165
1166       if ((loop->inner)->inner || (loop->inner)->next)
1167         {
1168           if (dump_enabled_p ())
1169             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1170                              "not vectorized: multiple nested loops.\n");
1171           return NULL;
1172         }
1173
1174       /* Analyze the inner-loop.  */
1175       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1176       if (!inner_loop_vinfo)
1177         {
1178           if (dump_enabled_p ())
1179             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1180                              "not vectorized: Bad inner loop.\n");
1181           return NULL;
1182         }
1183
1184       if (!expr_invariant_in_loop_p (loop,
1185                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1186         {
1187           if (dump_enabled_p ())
1188             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189                              "not vectorized: inner-loop count not"
1190                              " invariant.\n");
1191           destroy_loop_vec_info (inner_loop_vinfo, true);
1192           return NULL;
1193         }
1194
1195       if (loop->num_nodes != 5)
1196         {
1197           if (dump_enabled_p ())
1198             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199                              "not vectorized: control flow in loop.\n");
1200           destroy_loop_vec_info (inner_loop_vinfo, true);
1201           return NULL;
1202         }
1203
1204       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1205       entryedge = EDGE_PRED (innerloop->header, 0);
1206       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1207         entryedge = EDGE_PRED (innerloop->header, 1);
1208
1209       if (entryedge->src != loop->header
1210           || !single_exit (innerloop)
1211           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1212         {
1213           if (dump_enabled_p ())
1214             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1215                              "not vectorized: unsupported outerloop form.\n");
1216           destroy_loop_vec_info (inner_loop_vinfo, true);
1217           return NULL;
1218         }
1219
1220       if (dump_enabled_p ())
1221         dump_printf_loc (MSG_NOTE, vect_location,
1222                          "Considering outer-loop vectorization.\n");
1223     }
1224
1225   if (!single_exit (loop)
1226       || EDGE_COUNT (loop->header->preds) != 2)
1227     {
1228       if (dump_enabled_p ())
1229         {
1230           if (!single_exit (loop))
1231             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1232                              "not vectorized: multiple exits.\n");
1233           else if (EDGE_COUNT (loop->header->preds) != 2)
1234             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235                              "not vectorized: too many incoming edges.\n");
1236         }
1237       if (inner_loop_vinfo)
1238         destroy_loop_vec_info (inner_loop_vinfo, true);
1239       return NULL;
1240     }
1241
1242   /* We assume that the loop exit condition is at the end of the loop. i.e,
1243      that the loop is represented as a do-while (with a proper if-guard
1244      before the loop if needed), where the loop header contains all the
1245      executable statements, and the latch is empty.  */
1246   if (!empty_block_p (loop->latch)
1247       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1248     {
1249       if (dump_enabled_p ())
1250         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251                          "not vectorized: latch block not empty.\n");
1252       if (inner_loop_vinfo)
1253         destroy_loop_vec_info (inner_loop_vinfo, true);
1254       return NULL;
1255     }
1256
1257   /* Make sure there exists a single-predecessor exit bb:  */
1258   if (!single_pred_p (single_exit (loop)->dest))
1259     {
1260       edge e = single_exit (loop);
1261       if (!(e->flags & EDGE_ABNORMAL))
1262         {
1263           split_loop_exit_edge (e);
1264           if (dump_enabled_p ())
1265             dump_printf (MSG_NOTE, "split exit edge.\n");
1266         }
1267       else
1268         {
1269           if (dump_enabled_p ())
1270             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271                              "not vectorized: abnormal loop exit edge.\n");
1272           if (inner_loop_vinfo)
1273             destroy_loop_vec_info (inner_loop_vinfo, true);
1274           return NULL;
1275         }
1276     }
1277
1278   loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1279                                     &number_of_iterationsm1);
1280   if (!loop_cond)
1281     {
1282       if (dump_enabled_p ())
1283         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1284                          "not vectorized: complicated exit condition.\n");
1285       if (inner_loop_vinfo)
1286         destroy_loop_vec_info (inner_loop_vinfo, true);
1287       return NULL;
1288     }
1289
1290   if (!number_of_iterations
1291       || chrec_contains_undetermined (number_of_iterations))
1292     {
1293       if (dump_enabled_p ())
1294         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1295                          "not vectorized: number of iterations cannot be "
1296                          "computed.\n");
1297       if (inner_loop_vinfo)
1298         destroy_loop_vec_info (inner_loop_vinfo, true);
1299       return NULL;
1300     }
1301
1302   if (integer_zerop (number_of_iterations))
1303     {
1304       if (dump_enabled_p ())
1305         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1306                          "not vectorized: number of iterations = 0.\n");
1307       if (inner_loop_vinfo)
1308         destroy_loop_vec_info (inner_loop_vinfo, true);
1309       return NULL;
1310     }
1311
1312   loop_vinfo = new_loop_vec_info (loop);
1313   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1314   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1315   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1316
1317   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1318     {
1319       if (dump_enabled_p ())
1320         {
1321           dump_printf_loc (MSG_NOTE, vect_location,
1322                            "Symbolic number of iterations is ");
1323           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1324           dump_printf (MSG_NOTE, "\n");
1325         }
1326     }
1327
1328   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1329
1330   /* CHECKME: May want to keep it around it in the future.  */
1331   if (inner_loop_vinfo)
1332     destroy_loop_vec_info (inner_loop_vinfo, false);
1333
1334   gcc_assert (!loop->aux);
1335   loop->aux = loop_vinfo;
1336   return loop_vinfo;
1337 }
1338
1339
1340 /* Function vect_analyze_loop_operations.
1341
1342    Scan the loop stmts and make sure they are all vectorizable.  */
1343
1344 static bool
1345 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1346 {
1347   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1348   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1349   int nbbs = loop->num_nodes;
1350   unsigned int vectorization_factor = 0;
1351   int i;
1352   stmt_vec_info stmt_info;
1353   bool need_to_vectorize = false;
1354   int min_profitable_iters;
1355   int min_scalar_loop_bound;
1356   unsigned int th;
1357   bool only_slp_in_loop = true, ok;
1358   HOST_WIDE_INT max_niter;
1359   HOST_WIDE_INT estimated_niter;
1360   int min_profitable_estimate;
1361
1362   if (dump_enabled_p ())
1363     dump_printf_loc (MSG_NOTE, vect_location,
1364                      "=== vect_analyze_loop_operations ===\n");
1365
1366   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1367   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1368   if (slp)
1369     {
1370       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1371          vectorization factor of the loop is the unrolling factor required by
1372          the SLP instances.  If that unrolling factor is 1, we say, that we
1373          perform pure SLP on loop - cross iteration parallelism is not
1374          exploited.  */
1375       for (i = 0; i < nbbs; i++)
1376         {
1377           basic_block bb = bbs[i];
1378           for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1379                gsi_next (&si))
1380             {
1381               gimple stmt = gsi_stmt (si);
1382               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1383               gcc_assert (stmt_info);
1384               if ((STMT_VINFO_RELEVANT_P (stmt_info)
1385                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1386                   && !PURE_SLP_STMT (stmt_info))
1387                 /* STMT needs both SLP and loop-based vectorization.  */
1388                 only_slp_in_loop = false;
1389             }
1390         }
1391
1392       if (only_slp_in_loop)
1393         vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1394       else
1395         vectorization_factor = least_common_multiple (vectorization_factor,
1396                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1397
1398       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1399       if (dump_enabled_p ())
1400         dump_printf_loc (MSG_NOTE, vect_location,
1401                          "Updating vectorization factor to %d\n",
1402                          vectorization_factor);
1403     }
1404
1405   for (i = 0; i < nbbs; i++)
1406     {
1407       basic_block bb = bbs[i];
1408
1409       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1410            gsi_next (&si))
1411         {
1412           gphi *phi = si.phi ();
1413           ok = true;
1414
1415           stmt_info = vinfo_for_stmt (phi);
1416           if (dump_enabled_p ())
1417             {
1418               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1419               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1420               dump_printf (MSG_NOTE, "\n");
1421             }
1422
1423           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1424              (i.e., a phi in the tail of the outer-loop).  */
1425           if (! is_loop_header_bb_p (bb))
1426             {
1427               /* FORNOW: we currently don't support the case that these phis
1428                  are not used in the outerloop (unless it is double reduction,
1429                  i.e., this phi is vect_reduction_def), cause this case
1430                  requires to actually do something here.  */
1431               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1432                    || STMT_VINFO_LIVE_P (stmt_info))
1433                   && STMT_VINFO_DEF_TYPE (stmt_info)
1434                      != vect_double_reduction_def)
1435                 {
1436                   if (dump_enabled_p ())
1437                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1438                                      "Unsupported loop-closed phi in "
1439                                      "outer-loop.\n");
1440                   return false;
1441                 }
1442
1443               /* If PHI is used in the outer loop, we check that its operand
1444                  is defined in the inner loop.  */
1445               if (STMT_VINFO_RELEVANT_P (stmt_info))
1446                 {
1447                   tree phi_op;
1448                   gimple op_def_stmt;
1449
1450                   if (gimple_phi_num_args (phi) != 1)
1451                     return false;
1452
1453                   phi_op = PHI_ARG_DEF (phi, 0);
1454                   if (TREE_CODE (phi_op) != SSA_NAME)
1455                     return false;
1456
1457                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1458                   if (gimple_nop_p (op_def_stmt)
1459                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1460                       || !vinfo_for_stmt (op_def_stmt))
1461                     return false;
1462
1463                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1464                         != vect_used_in_outer
1465                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1466                            != vect_used_in_outer_by_reduction)
1467                     return false;
1468                 }
1469
1470               continue;
1471             }
1472
1473           gcc_assert (stmt_info);
1474
1475           if (STMT_VINFO_LIVE_P (stmt_info))
1476             {
1477               /* FORNOW: not yet supported.  */
1478               if (dump_enabled_p ())
1479                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1480                                  "not vectorized: value used after loop.\n");
1481               return false;
1482             }
1483
1484           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1485               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1486             {
1487               /* A scalar-dependence cycle that we don't support.  */
1488               if (dump_enabled_p ())
1489                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1490                                  "not vectorized: scalar dependence cycle.\n");
1491               return false;
1492             }
1493
1494           if (STMT_VINFO_RELEVANT_P (stmt_info))
1495             {
1496               need_to_vectorize = true;
1497               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1498                 ok = vectorizable_induction (phi, NULL, NULL);
1499             }
1500
1501           if (!ok)
1502             {
1503               if (dump_enabled_p ())
1504                 {
1505                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1506                                    "not vectorized: relevant phi not "
1507                                    "supported: ");
1508                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1509                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1510                 }
1511               return false;
1512             }
1513         }
1514
1515       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1516            gsi_next (&si))
1517         {
1518           gimple stmt = gsi_stmt (si);
1519           if (!gimple_clobber_p (stmt)
1520               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1521             return false;
1522         }
1523     } /* bbs */
1524
1525   /* All operations in the loop are either irrelevant (deal with loop
1526      control, or dead), or only used outside the loop and can be moved
1527      out of the loop (e.g. invariants, inductions).  The loop can be
1528      optimized away by scalar optimizations.  We're better off not
1529      touching this loop.  */
1530   if (!need_to_vectorize)
1531     {
1532       if (dump_enabled_p ())
1533         dump_printf_loc (MSG_NOTE, vect_location,
1534                          "All the computation can be taken out of the loop.\n");
1535       if (dump_enabled_p ())
1536         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537                          "not vectorized: redundant loop. no profit to "
1538                          "vectorize.\n");
1539       return false;
1540     }
1541
1542   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1543     dump_printf_loc (MSG_NOTE, vect_location,
1544                      "vectorization_factor = %d, niters = "
1545                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1546                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1547
1548   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1549        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1550       || ((max_niter = max_stmt_executions_int (loop)) != -1
1551           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1552     {
1553       if (dump_enabled_p ())
1554         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1555                          "not vectorized: iteration count too small.\n");
1556       if (dump_enabled_p ())
1557         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1558                          "not vectorized: iteration count smaller than "
1559                          "vectorization factor.\n");
1560       return false;
1561     }
1562
1563   /* Analyze cost.  Decide if worth while to vectorize.  */
1564
1565   /* Once VF is set, SLP costs should be updated since the number of created
1566      vector stmts depends on VF.  */
1567   vect_update_slp_costs_according_to_vf (loop_vinfo);
1568
1569   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1570                                       &min_profitable_estimate);
1571   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1572
1573   if (min_profitable_iters < 0)
1574     {
1575       if (dump_enabled_p ())
1576         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1577                          "not vectorized: vectorization not profitable.\n");
1578       if (dump_enabled_p ())
1579         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1580                          "not vectorized: vector version will never be "
1581                          "profitable.\n");
1582       return false;
1583     }
1584
1585   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1586                             * vectorization_factor) - 1);
1587
1588
1589   /* Use the cost model only if it is more conservative than user specified
1590      threshold.  */
1591
1592   th = (unsigned) min_scalar_loop_bound;
1593   if (min_profitable_iters
1594       && (!min_scalar_loop_bound
1595           || min_profitable_iters > min_scalar_loop_bound))
1596     th = (unsigned) min_profitable_iters;
1597
1598   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1599
1600   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1601       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1602     {
1603       if (dump_enabled_p ())
1604         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605                          "not vectorized: vectorization not profitable.\n");
1606       if (dump_enabled_p ())
1607         dump_printf_loc (MSG_NOTE, vect_location,
1608                          "not vectorized: iteration count smaller than user "
1609                          "specified loop bound parameter or minimum profitable "
1610                          "iterations (whichever is more conservative).\n");
1611       return false;
1612     }
1613
1614   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1615       && ((unsigned HOST_WIDE_INT) estimated_niter
1616           <= MAX (th, (unsigned)min_profitable_estimate)))
1617     {
1618       if (dump_enabled_p ())
1619         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1620                          "not vectorized: estimated iteration count too "
1621                          "small.\n");
1622       if (dump_enabled_p ())
1623         dump_printf_loc (MSG_NOTE, vect_location,
1624                          "not vectorized: estimated iteration count smaller "
1625                          "than specified loop bound parameter or minimum "
1626                          "profitable iterations (whichever is more "
1627                          "conservative).\n");
1628       return false;
1629     }
1630
1631   return true;
1632 }
1633
1634
1635 /* Function vect_analyze_loop_2.
1636
1637    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1638    for it.  The different analyses will record information in the
1639    loop_vec_info struct.  */
1640 static bool
1641 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1642 {
1643   bool ok, slp = false;
1644   int max_vf = MAX_VECTORIZATION_FACTOR;
1645   int min_vf = 2;
1646   unsigned int th;
1647   unsigned int n_stmts = 0;
1648
1649   /* Find all data references in the loop (which correspond to vdefs/vuses)
1650      and analyze their evolution in the loop.  Also adjust the minimal
1651      vectorization factor according to the loads and stores.
1652
1653      FORNOW: Handle only simple, array references, which
1654      alignment can be forced, and aligned pointer-references.  */
1655
1656   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1657   if (!ok)
1658     {
1659       if (dump_enabled_p ())
1660         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1661                          "bad data references.\n");
1662       return false;
1663     }
1664
1665   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1666      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1667
1668   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1669   if (!ok)
1670     {
1671       if (dump_enabled_p ())
1672         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1673                          "bad data access.\n");
1674       return false;
1675     }
1676
1677   /* Classify all cross-iteration scalar data-flow cycles.
1678      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1679
1680   vect_analyze_scalar_cycles (loop_vinfo);
1681
1682   vect_pattern_recog (loop_vinfo, NULL);
1683
1684   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1685
1686   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1687   if (!ok)
1688     {
1689       if (dump_enabled_p ())
1690         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1691                          "unexpected pattern.\n");
1692       return false;
1693     }
1694
1695   /* Analyze data dependences between the data-refs in the loop
1696      and adjust the maximum vectorization factor according to
1697      the dependences.
1698      FORNOW: fail at the first data dependence that we encounter.  */
1699
1700   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1701   if (!ok
1702       || max_vf < min_vf)
1703     {
1704       if (dump_enabled_p ())
1705             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1706                              "bad data dependence.\n");
1707       return false;
1708     }
1709
1710   ok = vect_determine_vectorization_factor (loop_vinfo);
1711   if (!ok)
1712     {
1713       if (dump_enabled_p ())
1714         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1715                          "can't determine vectorization factor.\n");
1716       return false;
1717     }
1718   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1719     {
1720       if (dump_enabled_p ())
1721         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1722                          "bad data dependence.\n");
1723       return false;
1724     }
1725
1726   /* Analyze the alignment of the data-refs in the loop.
1727      Fail if a data reference is found that cannot be vectorized.  */
1728
1729   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1730   if (!ok)
1731     {
1732       if (dump_enabled_p ())
1733         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1734                          "bad data alignment.\n");
1735       return false;
1736     }
1737
1738   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1739      It is important to call pruning after vect_analyze_data_ref_accesses,
1740      since we use grouping information gathered by interleaving analysis.  */
1741   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1742   if (!ok)
1743     {
1744       if (dump_enabled_p ())
1745         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1746                          "number of versioning for alias "
1747                          "run-time tests exceeds %d "
1748                          "(--param vect-max-version-for-alias-checks)\n",
1749                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1750       return false;
1751     }
1752
1753   /* This pass will decide on using loop versioning and/or loop peeling in
1754      order to enhance the alignment of data references in the loop.  */
1755
1756   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1757   if (!ok)
1758     {
1759       if (dump_enabled_p ())
1760         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1761                          "bad data alignment.\n");
1762       return false;
1763     }
1764
1765   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1766   ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1767   if (ok)
1768     {
1769       /* Decide which possible SLP instances to SLP.  */
1770       slp = vect_make_slp_decision (loop_vinfo);
1771
1772       /* Find stmts that need to be both vectorized and SLPed.  */
1773       vect_detect_hybrid_slp (loop_vinfo);
1774     }
1775   else
1776     return false;
1777
1778   /* Scan all the operations in the loop and make sure they are
1779      vectorizable.  */
1780
1781   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1782   if (!ok)
1783     {
1784       if (dump_enabled_p ())
1785         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1786                          "bad operation or unsupported loop bound.\n");
1787       return false;
1788     }
1789
1790   /* Decide whether we need to create an epilogue loop to handle
1791      remaining scalar iterations.  */
1792   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1793         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1794        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1795
1796   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1797       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1798     {
1799       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1800                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1801           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1802         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1803     }
1804   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1805            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1806                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1807                /* In case of versioning, check if the maximum number of
1808                   iterations is greater than th.  If they are identical,
1809                   the epilogue is unnecessary.  */
1810                && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1811                     && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1812                    || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1813                         (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1814     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1815
1816   /* If an epilogue loop is required make sure we can create one.  */
1817   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1818       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1819     {
1820       if (dump_enabled_p ())
1821         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1822       if (!vect_can_advance_ivs_p (loop_vinfo)
1823           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1824                                            single_exit (LOOP_VINFO_LOOP
1825                                                          (loop_vinfo))))
1826         {
1827           if (dump_enabled_p ())
1828             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1829                              "not vectorized: can't create required "
1830                              "epilog loop\n");
1831           return false;
1832         }
1833     }
1834
1835   return true;
1836 }
1837
1838 /* Function vect_analyze_loop.
1839
1840    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1841    for it.  The different analyses will record information in the
1842    loop_vec_info struct.  */
1843 loop_vec_info
1844 vect_analyze_loop (struct loop *loop)
1845 {
1846   loop_vec_info loop_vinfo;
1847   unsigned int vector_sizes;
1848
1849   /* Autodetect first vector size we try.  */
1850   current_vector_size = 0;
1851   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1852
1853   if (dump_enabled_p ())
1854     dump_printf_loc (MSG_NOTE, vect_location,
1855                      "===== analyze_loop_nest =====\n");
1856
1857   if (loop_outer (loop)
1858       && loop_vec_info_for_loop (loop_outer (loop))
1859       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1860     {
1861       if (dump_enabled_p ())
1862         dump_printf_loc (MSG_NOTE, vect_location,
1863                          "outer-loop already vectorized.\n");
1864       return NULL;
1865     }
1866
1867   while (1)
1868     {
1869       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1870       loop_vinfo = vect_analyze_loop_form (loop);
1871       if (!loop_vinfo)
1872         {
1873           if (dump_enabled_p ())
1874             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1875                              "bad loop form.\n");
1876           return NULL;
1877         }
1878
1879       if (vect_analyze_loop_2 (loop_vinfo))
1880         {
1881           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1882
1883           return loop_vinfo;
1884         }
1885
1886       destroy_loop_vec_info (loop_vinfo, true);
1887
1888       vector_sizes &= ~current_vector_size;
1889       if (vector_sizes == 0
1890           || current_vector_size == 0)
1891         return NULL;
1892
1893       /* Try the next biggest vector size.  */
1894       current_vector_size = 1 << floor_log2 (vector_sizes);
1895       if (dump_enabled_p ())
1896         dump_printf_loc (MSG_NOTE, vect_location,
1897                          "***** Re-trying analysis with "
1898                          "vector size %d\n", current_vector_size);
1899     }
1900 }
1901
1902
1903 /* Function reduction_code_for_scalar_code
1904
1905    Input:
1906    CODE - tree_code of a reduction operations.
1907
1908    Output:
1909    REDUC_CODE - the corresponding tree-code to be used to reduce the
1910       vector of partial results into a single scalar result, or ERROR_MARK
1911       if the operation is a supported reduction operation, but does not have
1912       such a tree-code.
1913
1914    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1915
1916 static bool
1917 reduction_code_for_scalar_code (enum tree_code code,
1918                                 enum tree_code *reduc_code)
1919 {
1920   switch (code)
1921     {
1922       case MAX_EXPR:
1923         *reduc_code = REDUC_MAX_EXPR;
1924         return true;
1925
1926       case MIN_EXPR:
1927         *reduc_code = REDUC_MIN_EXPR;
1928         return true;
1929
1930       case PLUS_EXPR:
1931         *reduc_code = REDUC_PLUS_EXPR;
1932         return true;
1933
1934       case MULT_EXPR:
1935       case MINUS_EXPR:
1936       case BIT_IOR_EXPR:
1937       case BIT_XOR_EXPR:
1938       case BIT_AND_EXPR:
1939         *reduc_code = ERROR_MARK;
1940         return true;
1941
1942       default:
1943        return false;
1944     }
1945 }
1946
1947
1948 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1949    STMT is printed with a message MSG. */
1950
1951 static void
1952 report_vect_op (int msg_type, gimple stmt, const char *msg)
1953 {
1954   dump_printf_loc (msg_type, vect_location, "%s", msg);
1955   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1956   dump_printf (msg_type, "\n");
1957 }
1958
1959
1960 /* Detect SLP reduction of the form:
1961
1962    #a1 = phi <a5, a0>
1963    a2 = operation (a1)
1964    a3 = operation (a2)
1965    a4 = operation (a3)
1966    a5 = operation (a4)
1967
1968    #a = phi <a5>
1969
1970    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1971    FIRST_STMT is the first reduction stmt in the chain
1972    (a2 = operation (a1)).
1973
1974    Return TRUE if a reduction chain was detected.  */
1975
1976 static bool
1977 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1978 {
1979   struct loop *loop = (gimple_bb (phi))->loop_father;
1980   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1981   enum tree_code code;
1982   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1983   stmt_vec_info use_stmt_info, current_stmt_info;
1984   tree lhs;
1985   imm_use_iterator imm_iter;
1986   use_operand_p use_p;
1987   int nloop_uses, size = 0, n_out_of_loop_uses;
1988   bool found = false;
1989
1990   if (loop != vect_loop)
1991     return false;
1992
1993   lhs = PHI_RESULT (phi);
1994   code = gimple_assign_rhs_code (first_stmt);
1995   while (1)
1996     {
1997       nloop_uses = 0;
1998       n_out_of_loop_uses = 0;
1999       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2000         {
2001           gimple use_stmt = USE_STMT (use_p);
2002           if (is_gimple_debug (use_stmt))
2003             continue;
2004
2005           /* Check if we got back to the reduction phi.  */
2006           if (use_stmt == phi)
2007             {
2008               loop_use_stmt = use_stmt;
2009               found = true;
2010               break;
2011             }
2012
2013           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2014             {
2015               if (vinfo_for_stmt (use_stmt)
2016                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
2017                 {
2018                   loop_use_stmt = use_stmt;
2019                   nloop_uses++;
2020                 }
2021             }
2022            else
2023              n_out_of_loop_uses++;
2024
2025            /* There are can be either a single use in the loop or two uses in
2026               phi nodes.  */
2027            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2028              return false;
2029         }
2030
2031       if (found)
2032         break;
2033
2034       /* We reached a statement with no loop uses.  */
2035       if (nloop_uses == 0)
2036         return false;
2037
2038       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2039       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2040         return false;
2041
2042       if (!is_gimple_assign (loop_use_stmt)
2043           || code != gimple_assign_rhs_code (loop_use_stmt)
2044           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2045         return false;
2046
2047       /* Insert USE_STMT into reduction chain.  */
2048       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2049       if (current_stmt)
2050         {
2051           current_stmt_info = vinfo_for_stmt (current_stmt);
2052           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2053           GROUP_FIRST_ELEMENT (use_stmt_info)
2054             = GROUP_FIRST_ELEMENT (current_stmt_info);
2055         }
2056       else
2057         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2058
2059       lhs = gimple_assign_lhs (loop_use_stmt);
2060       current_stmt = loop_use_stmt;
2061       size++;
2062    }
2063
2064   if (!found || loop_use_stmt != phi || size < 2)
2065     return false;
2066
2067   /* Swap the operands, if needed, to make the reduction operand be the second
2068      operand.  */
2069   lhs = PHI_RESULT (phi);
2070   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2071   while (next_stmt)
2072     {
2073       if (gimple_assign_rhs2 (next_stmt) == lhs)
2074         {
2075           tree op = gimple_assign_rhs1 (next_stmt);
2076           gimple def_stmt = NULL;
2077
2078           if (TREE_CODE (op) == SSA_NAME)
2079             def_stmt = SSA_NAME_DEF_STMT (op);
2080
2081           /* Check that the other def is either defined in the loop
2082              ("vect_internal_def"), or it's an induction (defined by a
2083              loop-header phi-node).  */
2084           if (def_stmt
2085               && gimple_bb (def_stmt)
2086               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2087               && (is_gimple_assign (def_stmt)
2088                   || is_gimple_call (def_stmt)
2089                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2090                            == vect_induction_def
2091                   || (gimple_code (def_stmt) == GIMPLE_PHI
2092                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2093                                   == vect_internal_def
2094                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2095             {
2096               lhs = gimple_assign_lhs (next_stmt);
2097               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2098               continue;
2099             }
2100
2101           return false;
2102         }
2103       else
2104         {
2105           tree op = gimple_assign_rhs2 (next_stmt);
2106           gimple def_stmt = NULL;
2107
2108           if (TREE_CODE (op) == SSA_NAME)
2109             def_stmt = SSA_NAME_DEF_STMT (op);
2110
2111           /* Check that the other def is either defined in the loop
2112             ("vect_internal_def"), or it's an induction (defined by a
2113             loop-header phi-node).  */
2114           if (def_stmt
2115               && gimple_bb (def_stmt)
2116               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2117               && (is_gimple_assign (def_stmt)
2118                   || is_gimple_call (def_stmt)
2119                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2120                               == vect_induction_def
2121                   || (gimple_code (def_stmt) == GIMPLE_PHI
2122                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2123                                   == vect_internal_def
2124                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2125             {
2126               if (dump_enabled_p ())
2127                 {
2128                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2129                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2130                   dump_printf (MSG_NOTE, "\n");
2131                 }
2132
2133               swap_ssa_operands (next_stmt,
2134                                  gimple_assign_rhs1_ptr (next_stmt),
2135                                  gimple_assign_rhs2_ptr (next_stmt));
2136               update_stmt (next_stmt);
2137
2138               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2139                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2140             }
2141           else
2142             return false;
2143         }
2144
2145       lhs = gimple_assign_lhs (next_stmt);
2146       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2147     }
2148
2149   /* Save the chain for further analysis in SLP detection.  */
2150   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2151   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2152   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2153
2154   return true;
2155 }
2156
2157
2158 /* Function vect_is_simple_reduction_1
2159
2160    (1) Detect a cross-iteration def-use cycle that represents a simple
2161    reduction computation.  We look for the following pattern:
2162
2163    loop_header:
2164      a1 = phi < a0, a2 >
2165      a3 = ...
2166      a2 = operation (a3, a1)
2167
2168    or
2169
2170    a3 = ...
2171    loop_header:
2172      a1 = phi < a0, a2 >
2173      a2 = operation (a3, a1)
2174
2175    such that:
2176    1. operation is commutative and associative and it is safe to
2177       change the order of the computation (if CHECK_REDUCTION is true)
2178    2. no uses for a2 in the loop (a2 is used out of the loop)
2179    3. no uses of a1 in the loop besides the reduction operation
2180    4. no uses of a1 outside the loop.
2181
2182    Conditions 1,4 are tested here.
2183    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2184
2185    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2186    nested cycles, if CHECK_REDUCTION is false.
2187
2188    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2189    reductions:
2190
2191      a1 = phi < a0, a2 >
2192      inner loop (def of a3)
2193      a2 = phi < a3 >
2194
2195    If MODIFY is true it tries also to rework the code in-place to enable
2196    detection of more reduction patterns.  For the time being we rewrite
2197    "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2198 */
2199
2200 static gimple
2201 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2202                             bool check_reduction, bool *double_reduc,
2203                             bool modify)
2204 {
2205   struct loop *loop = (gimple_bb (phi))->loop_father;
2206   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2207   edge latch_e = loop_latch_edge (loop);
2208   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2209   gimple def_stmt, def1 = NULL, def2 = NULL;
2210   enum tree_code orig_code, code;
2211   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2212   tree type;
2213   int nloop_uses;
2214   tree name;
2215   imm_use_iterator imm_iter;
2216   use_operand_p use_p;
2217   bool phi_def;
2218
2219   *double_reduc = false;
2220
2221   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2222      otherwise, we assume outer loop vectorization.  */
2223   gcc_assert ((check_reduction && loop == vect_loop)
2224               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2225
2226   name = PHI_RESULT (phi);
2227   /* ???  If there are no uses of the PHI result the inner loop reduction
2228      won't be detected as possibly double-reduction by vectorizable_reduction
2229      because that tries to walk the PHI arg from the preheader edge which
2230      can be constant.  See PR60382.  */
2231   if (has_zero_uses (name))
2232     return NULL;
2233   nloop_uses = 0;
2234   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2235     {
2236       gimple use_stmt = USE_STMT (use_p);
2237       if (is_gimple_debug (use_stmt))
2238         continue;
2239
2240       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2241         {
2242           if (dump_enabled_p ())
2243             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2244                              "intermediate value used outside loop.\n");
2245
2246           return NULL;
2247         }
2248
2249       if (vinfo_for_stmt (use_stmt)
2250           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2251         nloop_uses++;
2252       if (nloop_uses > 1)
2253         {
2254           if (dump_enabled_p ())
2255             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2256                              "reduction used in loop.\n");
2257           return NULL;
2258         }
2259     }
2260
2261   if (TREE_CODE (loop_arg) != SSA_NAME)
2262     {
2263       if (dump_enabled_p ())
2264         {
2265           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2266                            "reduction: not ssa_name: ");
2267           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2268           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2269         }
2270       return NULL;
2271     }
2272
2273   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2274   if (!def_stmt)
2275     {
2276       if (dump_enabled_p ())
2277         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2278                          "reduction: no def_stmt.\n");
2279       return NULL;
2280     }
2281
2282   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2283     {
2284       if (dump_enabled_p ())
2285         {
2286           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2287           dump_printf (MSG_NOTE, "\n");
2288         }
2289       return NULL;
2290     }
2291
2292   if (is_gimple_assign (def_stmt))
2293     {
2294       name = gimple_assign_lhs (def_stmt);
2295       phi_def = false;
2296     }
2297   else
2298     {
2299       name = PHI_RESULT (def_stmt);
2300       phi_def = true;
2301     }
2302
2303   nloop_uses = 0;
2304   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2305     {
2306       gimple use_stmt = USE_STMT (use_p);
2307       if (is_gimple_debug (use_stmt))
2308         continue;
2309       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2310           && vinfo_for_stmt (use_stmt)
2311           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2312         nloop_uses++;
2313       if (nloop_uses > 1)
2314         {
2315           if (dump_enabled_p ())
2316             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317                              "reduction used in loop.\n");
2318           return NULL;
2319         }
2320     }
2321
2322   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2323      defined in the inner loop.  */
2324   if (phi_def)
2325     {
2326       op1 = PHI_ARG_DEF (def_stmt, 0);
2327
2328       if (gimple_phi_num_args (def_stmt) != 1
2329           || TREE_CODE (op1) != SSA_NAME)
2330         {
2331           if (dump_enabled_p ())
2332             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333                              "unsupported phi node definition.\n");
2334
2335           return NULL;
2336         }
2337
2338       def1 = SSA_NAME_DEF_STMT (op1);
2339       if (gimple_bb (def1)
2340           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2341           && loop->inner
2342           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2343           && is_gimple_assign (def1))
2344         {
2345           if (dump_enabled_p ())
2346             report_vect_op (MSG_NOTE, def_stmt,
2347                             "detected double reduction: ");
2348
2349           *double_reduc = true;
2350           return def_stmt;
2351         }
2352
2353       return NULL;
2354     }
2355
2356   code = orig_code = gimple_assign_rhs_code (def_stmt);
2357
2358   /* We can handle "res -= x[i]", which is non-associative by
2359      simply rewriting this into "res += -x[i]".  Avoid changing
2360      gimple instruction for the first simple tests and only do this
2361      if we're allowed to change code at all.  */
2362   if (code == MINUS_EXPR
2363       && modify
2364       && (op1 = gimple_assign_rhs1 (def_stmt))
2365       && TREE_CODE (op1) == SSA_NAME
2366       && SSA_NAME_DEF_STMT (op1) == phi)
2367     code = PLUS_EXPR;
2368
2369   if (check_reduction
2370       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2371     {
2372       if (dump_enabled_p ())
2373         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2374                         "reduction: not commutative/associative: ");
2375       return NULL;
2376     }
2377
2378   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2379     {
2380       if (code != COND_EXPR)
2381         {
2382           if (dump_enabled_p ())
2383             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2384                             "reduction: not binary operation: ");
2385
2386           return NULL;
2387         }
2388
2389       op3 = gimple_assign_rhs1 (def_stmt);
2390       if (COMPARISON_CLASS_P (op3))
2391         {
2392           op4 = TREE_OPERAND (op3, 1);
2393           op3 = TREE_OPERAND (op3, 0);
2394         }
2395
2396       op1 = gimple_assign_rhs2 (def_stmt);
2397       op2 = gimple_assign_rhs3 (def_stmt);
2398
2399       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2400         {
2401           if (dump_enabled_p ())
2402             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2403                             "reduction: uses not ssa_names: ");
2404
2405           return NULL;
2406         }
2407     }
2408   else
2409     {
2410       op1 = gimple_assign_rhs1 (def_stmt);
2411       op2 = gimple_assign_rhs2 (def_stmt);
2412
2413       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2414         {
2415           if (dump_enabled_p ())
2416             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2417                             "reduction: uses not ssa_names: ");
2418
2419           return NULL;
2420         }
2421    }
2422
2423   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2424   if ((TREE_CODE (op1) == SSA_NAME
2425        && !types_compatible_p (type,TREE_TYPE (op1)))
2426       || (TREE_CODE (op2) == SSA_NAME
2427           && !types_compatible_p (type, TREE_TYPE (op2)))
2428       || (op3 && TREE_CODE (op3) == SSA_NAME
2429           && !types_compatible_p (type, TREE_TYPE (op3)))
2430       || (op4 && TREE_CODE (op4) == SSA_NAME
2431           && !types_compatible_p (type, TREE_TYPE (op4))))
2432     {
2433       if (dump_enabled_p ())
2434         {
2435           dump_printf_loc (MSG_NOTE, vect_location,
2436                            "reduction: multiple types: operation type: ");
2437           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2438           dump_printf (MSG_NOTE, ", operands types: ");
2439           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2440                              TREE_TYPE (op1));
2441           dump_printf (MSG_NOTE, ",");
2442           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2443                              TREE_TYPE (op2));
2444           if (op3)
2445             {
2446               dump_printf (MSG_NOTE, ",");
2447               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2448                                  TREE_TYPE (op3));
2449             }
2450
2451           if (op4)
2452             {
2453               dump_printf (MSG_NOTE, ",");
2454               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2455                                  TREE_TYPE (op4));
2456             }
2457           dump_printf (MSG_NOTE, "\n");
2458         }
2459
2460       return NULL;
2461     }
2462
2463   /* Check that it's ok to change the order of the computation.
2464      Generally, when vectorizing a reduction we change the order of the
2465      computation.  This may change the behavior of the program in some
2466      cases, so we need to check that this is ok.  One exception is when
2467      vectorizing an outer-loop: the inner-loop is executed sequentially,
2468      and therefore vectorizing reductions in the inner-loop during
2469      outer-loop vectorization is safe.  */
2470
2471   /* CHECKME: check for !flag_finite_math_only too?  */
2472   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2473       && check_reduction)
2474     {
2475       /* Changing the order of operations changes the semantics.  */
2476       if (dump_enabled_p ())
2477         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2478                         "reduction: unsafe fp math optimization: ");
2479       return NULL;
2480     }
2481   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2482            && check_reduction)
2483     {
2484       /* Changing the order of operations changes the semantics.  */
2485       if (dump_enabled_p ())
2486         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2487                         "reduction: unsafe int math optimization: ");
2488       return NULL;
2489     }
2490   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2491     {
2492       /* Changing the order of operations changes the semantics.  */
2493       if (dump_enabled_p ())
2494         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2495                         "reduction: unsafe fixed-point math optimization: ");
2496       return NULL;
2497     }
2498
2499   /* If we detected "res -= x[i]" earlier, rewrite it into
2500      "res += -x[i]" now.  If this turns out to be useless reassoc
2501      will clean it up again.  */
2502   if (orig_code == MINUS_EXPR)
2503     {
2504       tree rhs = gimple_assign_rhs2 (def_stmt);
2505       tree negrhs = make_ssa_name (TREE_TYPE (rhs), NULL);
2506       gimple negate_stmt = gimple_build_assign_with_ops (NEGATE_EXPR, negrhs,
2507                                                          rhs);
2508       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2509       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2510                                                           loop_info, NULL));
2511       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2512       gimple_assign_set_rhs2 (def_stmt, negrhs);
2513       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2514       update_stmt (def_stmt);
2515     }
2516
2517   /* Reduction is safe. We're dealing with one of the following:
2518      1) integer arithmetic and no trapv
2519      2) floating point arithmetic, and special flags permit this optimization
2520      3) nested cycle (i.e., outer loop vectorization).  */
2521   if (TREE_CODE (op1) == SSA_NAME)
2522     def1 = SSA_NAME_DEF_STMT (op1);
2523
2524   if (TREE_CODE (op2) == SSA_NAME)
2525     def2 = SSA_NAME_DEF_STMT (op2);
2526
2527   if (code != COND_EXPR
2528       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2529     {
2530       if (dump_enabled_p ())
2531         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2532       return NULL;
2533     }
2534
2535   /* Check that one def is the reduction def, defined by PHI,
2536      the other def is either defined in the loop ("vect_internal_def"),
2537      or it's an induction (defined by a loop-header phi-node).  */
2538
2539   if (def2 && def2 == phi
2540       && (code == COND_EXPR
2541           || !def1 || gimple_nop_p (def1)
2542           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2543           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2544               && (is_gimple_assign (def1)
2545                   || is_gimple_call (def1)
2546                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2547                       == vect_induction_def
2548                   || (gimple_code (def1) == GIMPLE_PHI
2549                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2550                           == vect_internal_def
2551                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2552     {
2553       if (dump_enabled_p ())
2554         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2555       return def_stmt;
2556     }
2557
2558   if (def1 && def1 == phi
2559       && (code == COND_EXPR
2560           || !def2 || gimple_nop_p (def2)
2561           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2562           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2563               && (is_gimple_assign (def2)
2564                   || is_gimple_call (def2)
2565                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2566                       == vect_induction_def
2567                   || (gimple_code (def2) == GIMPLE_PHI
2568                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2569                           == vect_internal_def
2570                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2571     {
2572       if (check_reduction)
2573         {
2574           /* Swap operands (just for simplicity - so that the rest of the code
2575              can assume that the reduction variable is always the last (second)
2576              argument).  */
2577           if (dump_enabled_p ())
2578             report_vect_op (MSG_NOTE, def_stmt,
2579                             "detected reduction: need to swap operands: ");
2580
2581           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2582                              gimple_assign_rhs2_ptr (def_stmt));
2583
2584           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2585             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2586         }
2587       else
2588         {
2589           if (dump_enabled_p ())
2590             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2591         }
2592
2593       return def_stmt;
2594     }
2595
2596   /* Try to find SLP reduction chain.  */
2597   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2598     {
2599       if (dump_enabled_p ())
2600         report_vect_op (MSG_NOTE, def_stmt,
2601                         "reduction: detected reduction chain: ");
2602
2603       return def_stmt;
2604     }
2605
2606   if (dump_enabled_p ())
2607     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2608                     "reduction: unknown pattern: ");
2609
2610   return NULL;
2611 }
2612
2613 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2614    in-place.  Arguments as there.  */
2615
2616 static gimple
2617 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2618                           bool check_reduction, bool *double_reduc)
2619 {
2620   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2621                                      double_reduc, false);
2622 }
2623
2624 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2625    in-place if it enables detection of more reductions.  Arguments
2626    as there.  */
2627
2628 gimple
2629 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2630                           bool check_reduction, bool *double_reduc)
2631 {
2632   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2633                                      double_reduc, true);
2634 }
2635
2636 /* Calculate the cost of one scalar iteration of the loop.  */
2637 int
2638 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2639 {
2640   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2641   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2642   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2643   int innerloop_iters, i, stmt_cost;
2644
2645   /* Count statements in scalar loop.  Using this as scalar cost for a single
2646      iteration for now.
2647
2648      TODO: Add outer loop support.
2649
2650      TODO: Consider assigning different costs to different scalar
2651      statements.  */
2652
2653   /* FORNOW.  */
2654   innerloop_iters = 1;
2655   if (loop->inner)
2656     innerloop_iters = 50; /* FIXME */
2657
2658   for (i = 0; i < nbbs; i++)
2659     {
2660       gimple_stmt_iterator si;
2661       basic_block bb = bbs[i];
2662
2663       if (bb->loop_father == loop->inner)
2664         factor = innerloop_iters;
2665       else
2666         factor = 1;
2667
2668       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2669         {
2670           gimple stmt = gsi_stmt (si);
2671           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2672
2673           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2674             continue;
2675
2676           /* Skip stmts that are not vectorized inside the loop.  */
2677           if (stmt_info
2678               && !STMT_VINFO_RELEVANT_P (stmt_info)
2679               && (!STMT_VINFO_LIVE_P (stmt_info)
2680                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2681               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2682             continue;
2683
2684           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2685             {
2686               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2687                stmt_cost = vect_get_stmt_cost (scalar_load);
2688              else
2689                stmt_cost = vect_get_stmt_cost (scalar_store);
2690             }
2691           else
2692             stmt_cost = vect_get_stmt_cost (scalar_stmt);
2693
2694           scalar_single_iter_cost += stmt_cost * factor;
2695         }
2696     }
2697   return scalar_single_iter_cost;
2698 }
2699
2700 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2701 int
2702 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2703                              int *peel_iters_epilogue,
2704                              int scalar_single_iter_cost,
2705                              stmt_vector_for_cost *prologue_cost_vec,
2706                              stmt_vector_for_cost *epilogue_cost_vec)
2707 {
2708   int retval = 0;
2709   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2710
2711   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2712     {
2713       *peel_iters_epilogue = vf/2;
2714       if (dump_enabled_p ())
2715         dump_printf_loc (MSG_NOTE, vect_location,
2716                          "cost model: epilogue peel iters set to vf/2 "
2717                          "because loop iterations are unknown .\n");
2718
2719       /* If peeled iterations are known but number of scalar loop
2720          iterations are unknown, count a taken branch per peeled loop.  */
2721       retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2722                                  NULL, 0, vect_prologue);
2723     }
2724   else
2725     {
2726       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2727       peel_iters_prologue = niters < peel_iters_prologue ?
2728                             niters : peel_iters_prologue;
2729       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2730       /* If we need to peel for gaps, but no peeling is required, we have to
2731          peel VF iterations.  */
2732       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2733         *peel_iters_epilogue = vf;
2734     }
2735
2736   if (peel_iters_prologue)
2737     retval += record_stmt_cost (prologue_cost_vec,
2738                                 peel_iters_prologue * scalar_single_iter_cost,
2739                                 scalar_stmt, NULL, 0, vect_prologue);
2740   if (*peel_iters_epilogue)
2741     retval += record_stmt_cost (epilogue_cost_vec,
2742                                 *peel_iters_epilogue * scalar_single_iter_cost,
2743                                 scalar_stmt, NULL, 0, vect_epilogue);
2744   return retval;
2745 }
2746
2747 /* Function vect_estimate_min_profitable_iters
2748
2749    Return the number of iterations required for the vector version of the
2750    loop to be profitable relative to the cost of the scalar version of the
2751    loop.  */
2752
2753 static void
2754 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2755                                     int *ret_min_profitable_niters,
2756                                     int *ret_min_profitable_estimate)
2757 {
2758   int min_profitable_iters;
2759   int min_profitable_estimate;
2760   int peel_iters_prologue;
2761   int peel_iters_epilogue;
2762   unsigned vec_inside_cost = 0;
2763   int vec_outside_cost = 0;
2764   unsigned vec_prologue_cost = 0;
2765   unsigned vec_epilogue_cost = 0;
2766   int scalar_single_iter_cost = 0;
2767   int scalar_outside_cost = 0;
2768   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2769   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2770   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2771
2772   /* Cost model disabled.  */
2773   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2774     {
2775       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2776       *ret_min_profitable_niters = 0;
2777       *ret_min_profitable_estimate = 0;
2778       return;
2779     }
2780
2781   /* Requires loop versioning tests to handle misalignment.  */
2782   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2783     {
2784       /*  FIXME: Make cost depend on complexity of individual check.  */
2785       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2786       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2787                             vect_prologue);
2788       dump_printf (MSG_NOTE,
2789                    "cost model: Adding cost of checks for loop "
2790                    "versioning to treat misalignment.\n");
2791     }
2792
2793   /* Requires loop versioning with alias checks.  */
2794   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2795     {
2796       /*  FIXME: Make cost depend on complexity of individual check.  */
2797       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2798       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2799                             vect_prologue);
2800       dump_printf (MSG_NOTE,
2801                    "cost model: Adding cost of checks for loop "
2802                    "versioning aliasing.\n");
2803     }
2804
2805   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2806       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2807     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2808                           vect_prologue);
2809
2810   /* Count statements in scalar loop.  Using this as scalar cost for a single
2811      iteration for now.
2812
2813      TODO: Add outer loop support.
2814
2815      TODO: Consider assigning different costs to different scalar
2816      statements.  */
2817
2818   scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2819
2820   /* Add additional cost for the peeled instructions in prologue and epilogue
2821      loop.
2822
2823      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2824      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2825
2826      TODO: Build an expression that represents peel_iters for prologue and
2827      epilogue to be used in a run-time test.  */
2828
2829   if (npeel  < 0)
2830     {
2831       peel_iters_prologue = vf/2;
2832       dump_printf (MSG_NOTE, "cost model: "
2833                    "prologue peel iters set to vf/2.\n");
2834
2835       /* If peeling for alignment is unknown, loop bound of main loop becomes
2836          unknown.  */
2837       peel_iters_epilogue = vf/2;
2838       dump_printf (MSG_NOTE, "cost model: "
2839                    "epilogue peel iters set to vf/2 because "
2840                    "peeling for alignment is unknown.\n");
2841
2842       /* If peeled iterations are unknown, count a taken branch and a not taken
2843          branch per peeled loop. Even if scalar loop iterations are known,
2844          vector iterations are not known since peeled prologue iterations are
2845          not known. Hence guards remain the same.  */
2846       (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2847                             NULL, 0, vect_prologue);
2848       (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2849                             NULL, 0, vect_prologue);
2850       /* FORNOW: Don't attempt to pass individual scalar instructions to
2851          the model; just assume linear cost for scalar iterations.  */
2852       (void) add_stmt_cost (target_cost_data,
2853                             peel_iters_prologue * scalar_single_iter_cost,
2854                             scalar_stmt, NULL, 0, vect_prologue);
2855       (void) add_stmt_cost (target_cost_data,
2856                             peel_iters_epilogue * scalar_single_iter_cost,
2857                             scalar_stmt, NULL, 0, vect_epilogue);
2858     }
2859   else
2860     {
2861       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2862       stmt_info_for_cost *si;
2863       int j;
2864       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2865
2866       prologue_cost_vec.create (2);
2867       epilogue_cost_vec.create (2);
2868       peel_iters_prologue = npeel;
2869
2870       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2871                                           &peel_iters_epilogue,
2872                                           scalar_single_iter_cost,
2873                                           &prologue_cost_vec,
2874                                           &epilogue_cost_vec);
2875
2876       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2877         {
2878           struct _stmt_vec_info *stmt_info
2879             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2880           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2881                                 si->misalign, vect_prologue);
2882         }
2883
2884       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2885         {
2886           struct _stmt_vec_info *stmt_info
2887             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2888           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2889                                 si->misalign, vect_epilogue);
2890         }
2891
2892       prologue_cost_vec.release ();
2893       epilogue_cost_vec.release ();
2894     }
2895
2896   /* FORNOW: The scalar outside cost is incremented in one of the
2897      following ways:
2898
2899      1. The vectorizer checks for alignment and aliasing and generates
2900      a condition that allows dynamic vectorization.  A cost model
2901      check is ANDED with the versioning condition.  Hence scalar code
2902      path now has the added cost of the versioning check.
2903
2904        if (cost > th & versioning_check)
2905          jmp to vector code
2906
2907      Hence run-time scalar is incremented by not-taken branch cost.
2908
2909      2. The vectorizer then checks if a prologue is required.  If the
2910      cost model check was not done before during versioning, it has to
2911      be done before the prologue check.
2912
2913        if (cost <= th)
2914          prologue = scalar_iters
2915        if (prologue == 0)
2916          jmp to vector code
2917        else
2918          execute prologue
2919        if (prologue == num_iters)
2920          go to exit
2921
2922      Hence the run-time scalar cost is incremented by a taken branch,
2923      plus a not-taken branch, plus a taken branch cost.
2924
2925      3. The vectorizer then checks if an epilogue is required.  If the
2926      cost model check was not done before during prologue check, it
2927      has to be done with the epilogue check.
2928
2929        if (prologue == 0)
2930          jmp to vector code
2931        else
2932          execute prologue
2933        if (prologue == num_iters)
2934          go to exit
2935        vector code:
2936          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2937            jmp to epilogue
2938
2939      Hence the run-time scalar cost should be incremented by 2 taken
2940      branches.
2941
2942      TODO: The back end may reorder the BBS's differently and reverse
2943      conditions/branch directions.  Change the estimates below to
2944      something more reasonable.  */
2945
2946   /* If the number of iterations is known and we do not do versioning, we can
2947      decide whether to vectorize at compile time.  Hence the scalar version
2948      do not carry cost model guard costs.  */
2949   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2950       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2951       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2952     {
2953       /* Cost model check occurs at versioning.  */
2954       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2955           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2956         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2957       else
2958         {
2959           /* Cost model check occurs at prologue generation.  */
2960           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2961             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2962               + vect_get_stmt_cost (cond_branch_not_taken);
2963           /* Cost model check occurs at epilogue generation.  */
2964           else
2965             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2966         }
2967     }
2968
2969   /* Complete the target-specific cost calculations.  */
2970   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2971                &vec_inside_cost, &vec_epilogue_cost);
2972
2973   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2974
2975   /* Calculate number of iterations required to make the vector version
2976      profitable, relative to the loop bodies only.  The following condition
2977      must hold true:
2978      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2979      where
2980      SIC = scalar iteration cost, VIC = vector iteration cost,
2981      VOC = vector outside cost, VF = vectorization factor,
2982      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2983      SOC = scalar outside cost for run time cost model check.  */
2984
2985   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2986     {
2987       if (vec_outside_cost <= 0)
2988         min_profitable_iters = 1;
2989       else
2990         {
2991           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2992                                   - vec_inside_cost * peel_iters_prologue
2993                                   - vec_inside_cost * peel_iters_epilogue)
2994                                  / ((scalar_single_iter_cost * vf)
2995                                     - vec_inside_cost);
2996
2997           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2998               <= (((int) vec_inside_cost * min_profitable_iters)
2999                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3000             min_profitable_iters++;
3001         }
3002     }
3003   /* vector version will never be profitable.  */
3004   else
3005     {
3006       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3007         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3008                     "did not happen for a simd loop");
3009
3010       if (dump_enabled_p ())
3011         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3012                          "cost model: the vector iteration cost = %d "
3013                          "divided by the scalar iteration cost = %d "
3014                          "is greater or equal to the vectorization factor = %d"
3015                          ".\n",
3016                          vec_inside_cost, scalar_single_iter_cost, vf);
3017       *ret_min_profitable_niters = -1;
3018       *ret_min_profitable_estimate = -1;
3019       return;
3020     }
3021
3022   if (dump_enabled_p ())
3023     {
3024       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3025       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3026                    vec_inside_cost);
3027       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3028                    vec_prologue_cost);
3029       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3030                    vec_epilogue_cost);
3031       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3032                    scalar_single_iter_cost);
3033       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3034                    scalar_outside_cost);
3035       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3036                    vec_outside_cost);
3037       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3038                    peel_iters_prologue);
3039       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3040                    peel_iters_epilogue);
3041       dump_printf (MSG_NOTE,
3042                    "  Calculated minimum iters for profitability: %d\n",
3043                    min_profitable_iters);
3044       dump_printf (MSG_NOTE, "\n");
3045     }
3046
3047   min_profitable_iters =
3048         min_profitable_iters < vf ? vf : min_profitable_iters;
3049
3050   /* Because the condition we create is:
3051      if (niters <= min_profitable_iters)
3052        then skip the vectorized loop.  */
3053   min_profitable_iters--;
3054
3055   if (dump_enabled_p ())
3056     dump_printf_loc (MSG_NOTE, vect_location,
3057                      "  Runtime profitability threshold = %d\n",
3058                      min_profitable_iters);
3059
3060   *ret_min_profitable_niters = min_profitable_iters;
3061
3062   /* Calculate number of iterations required to make the vector version
3063      profitable, relative to the loop bodies only.
3064
3065      Non-vectorized variant is SIC * niters and it must win over vector
3066      variant on the expected loop trip count.  The following condition must hold true:
3067      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3068
3069   if (vec_outside_cost <= 0)
3070     min_profitable_estimate = 1;
3071   else
3072     {
3073       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3074                                  - vec_inside_cost * peel_iters_prologue
3075                                  - vec_inside_cost * peel_iters_epilogue)
3076                                  / ((scalar_single_iter_cost * vf)
3077                                    - vec_inside_cost);
3078     }
3079   min_profitable_estimate --;
3080   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3081   if (dump_enabled_p ())
3082     dump_printf_loc (MSG_NOTE, vect_location,
3083                      "  Static estimate profitability threshold = %d\n",
3084                       min_profitable_iters);
3085
3086   *ret_min_profitable_estimate = min_profitable_estimate;
3087 }
3088
3089 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3090    vector elements (not bits) for a vector of mode MODE.  */
3091 static void
3092 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3093                               unsigned char *sel)
3094 {
3095   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3096
3097   for (i = 0; i < nelt; i++)
3098     sel[i] = (i + offset) & (2*nelt - 1);
3099 }
3100
3101 /* Checks whether the target supports whole-vector shifts for vectors of mode
3102    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3103    it supports vec_perm_const with masks for all necessary shift amounts.  */
3104 static bool
3105 have_whole_vector_shift (enum machine_mode mode)
3106 {
3107   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3108     return true;
3109
3110   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3111     return false;
3112
3113   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3114   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3115
3116   for (i = nelt/2; i >= 1; i/=2)
3117     {
3118       calc_vec_perm_mask_for_shift (mode, i, sel);
3119       if (!can_vec_perm_p (mode, false, sel))
3120         return false;
3121     }
3122   return true;
3123 }
3124
3125 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3126    functions. Design better to avoid maintenance issues.  */
3127
3128 /* Function vect_model_reduction_cost.
3129
3130    Models cost for a reduction operation, including the vector ops
3131    generated within the strip-mine loop, the initial definition before
3132    the loop, and the epilogue code that must be generated.  */
3133
3134 static bool
3135 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3136                            int ncopies)
3137 {
3138   int prologue_cost = 0, epilogue_cost = 0;
3139   enum tree_code code;
3140   optab optab;
3141   tree vectype;
3142   gimple stmt, orig_stmt;
3143   tree reduction_op;
3144   machine_mode mode;
3145   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3146   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3147   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3148
3149   /* Cost of reduction op inside loop.  */
3150   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3151                                         stmt_info, 0, vect_body);
3152   stmt = STMT_VINFO_STMT (stmt_info);
3153
3154   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3155     {
3156     case GIMPLE_SINGLE_RHS:
3157       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3158       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3159       break;
3160     case GIMPLE_UNARY_RHS:
3161       reduction_op = gimple_assign_rhs1 (stmt);
3162       break;
3163     case GIMPLE_BINARY_RHS:
3164       reduction_op = gimple_assign_rhs2 (stmt);
3165       break;
3166     case GIMPLE_TERNARY_RHS:
3167       reduction_op = gimple_assign_rhs3 (stmt);
3168       break;
3169     default:
3170       gcc_unreachable ();
3171     }
3172
3173   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3174   if (!vectype)
3175     {
3176       if (dump_enabled_p ())
3177         {
3178           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3179                            "unsupported data-type ");
3180           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3181                              TREE_TYPE (reduction_op));
3182           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3183         }
3184       return false;
3185    }
3186
3187   mode = TYPE_MODE (vectype);
3188   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3189
3190   if (!orig_stmt)
3191     orig_stmt = STMT_VINFO_STMT (stmt_info);
3192
3193   code = gimple_assign_rhs_code (orig_stmt);
3194
3195   /* Add in cost for initial definition.  */
3196   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3197                                   stmt_info, 0, vect_prologue);
3198
3199   /* Determine cost of epilogue code.
3200
3201      We have a reduction operator that will reduce the vector in one statement.
3202      Also requires scalar extract.  */
3203
3204   if (!nested_in_vect_loop_p (loop, orig_stmt))
3205     {
3206       if (reduc_code != ERROR_MARK)
3207         {
3208           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3209                                           stmt_info, 0, vect_epilogue);
3210           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3211                                           stmt_info, 0, vect_epilogue);
3212         }
3213       else
3214         {
3215           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3216           tree bitsize =
3217             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3218           int element_bitsize = tree_to_uhwi (bitsize);
3219           int nelements = vec_size_in_bits / element_bitsize;
3220
3221           optab = optab_for_tree_code (code, vectype, optab_default);
3222
3223           /* We have a whole vector shift available.  */
3224           if (VECTOR_MODE_P (mode)
3225               && optab_handler (optab, mode) != CODE_FOR_nothing
3226               && have_whole_vector_shift (mode))
3227             {
3228               /* Final reduction via vector shifts and the reduction operator.
3229                  Also requires scalar extract.  */
3230               epilogue_cost += add_stmt_cost (target_cost_data,
3231                                               exact_log2 (nelements) * 2,
3232                                               vector_stmt, stmt_info, 0,
3233                                               vect_epilogue);
3234               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3235                                               vec_to_scalar, stmt_info, 0,
3236                                               vect_epilogue);
3237             }
3238           else
3239             /* Use extracts and reduction op for final reduction.  For N
3240                elements, we have N extracts and N-1 reduction ops.  */
3241             epilogue_cost += add_stmt_cost (target_cost_data,
3242                                             nelements + nelements - 1,
3243                                             vector_stmt, stmt_info, 0,
3244                                             vect_epilogue);
3245         }
3246     }
3247
3248   if (dump_enabled_p ())
3249     dump_printf (MSG_NOTE,
3250                  "vect_model_reduction_cost: inside_cost = %d, "
3251                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3252                  prologue_cost, epilogue_cost);
3253
3254   return true;
3255 }
3256
3257
3258 /* Function vect_model_induction_cost.
3259
3260    Models cost for induction operations.  */
3261
3262 static void
3263 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3264 {
3265   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3266   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3267   unsigned inside_cost, prologue_cost;
3268
3269   /* loop cost for vec_loop.  */
3270   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3271                                stmt_info, 0, vect_body);
3272
3273   /* prologue cost for vec_init and vec_step.  */
3274   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3275                                  stmt_info, 0, vect_prologue);
3276
3277   if (dump_enabled_p ())
3278     dump_printf_loc (MSG_NOTE, vect_location,
3279                      "vect_model_induction_cost: inside_cost = %d, "
3280                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3281 }
3282
3283
3284 /* Function get_initial_def_for_induction
3285
3286    Input:
3287    STMT - a stmt that performs an induction operation in the loop.
3288    IV_PHI - the initial value of the induction variable
3289
3290    Output:
3291    Return a vector variable, initialized with the first VF values of
3292    the induction variable.  E.g., for an iv with IV_PHI='X' and
3293    evolution S, for a vector of 4 units, we want to return:
3294    [X, X + S, X + 2*S, X + 3*S].  */
3295
3296 static tree
3297 get_initial_def_for_induction (gimple iv_phi)
3298 {
3299   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3300   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3301   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3302   tree vectype;
3303   int nunits;
3304   edge pe = loop_preheader_edge (loop);
3305   struct loop *iv_loop;
3306   basic_block new_bb;
3307   tree new_vec, vec_init, vec_step, t;
3308   tree new_var;
3309   tree new_name;
3310   gimple init_stmt, new_stmt;
3311   gphi *induction_phi;
3312   tree induc_def, vec_def, vec_dest;
3313   tree init_expr, step_expr;
3314   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3315   int i;
3316   int ncopies;
3317   tree expr;
3318   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3319   bool nested_in_vect_loop = false;
3320   gimple_seq stmts = NULL;
3321   imm_use_iterator imm_iter;
3322   use_operand_p use_p;
3323   gimple exit_phi;
3324   edge latch_e;
3325   tree loop_arg;
3326   gimple_stmt_iterator si;
3327   basic_block bb = gimple_bb (iv_phi);
3328   tree stepvectype;
3329   tree resvectype;
3330
3331   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3332   if (nested_in_vect_loop_p (loop, iv_phi))
3333     {
3334       nested_in_vect_loop = true;
3335       iv_loop = loop->inner;
3336     }
3337   else
3338     iv_loop = loop;
3339   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3340
3341   latch_e = loop_latch_edge (iv_loop);
3342   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3343
3344   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3345   gcc_assert (step_expr != NULL_TREE);
3346
3347   pe = loop_preheader_edge (iv_loop);
3348   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3349                                      loop_preheader_edge (iv_loop));
3350
3351   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3352   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3353   gcc_assert (vectype);
3354   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3355   ncopies = vf / nunits;
3356
3357   gcc_assert (phi_info);
3358   gcc_assert (ncopies >= 1);
3359
3360   /* Convert the step to the desired type.  */
3361   step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3362                                                   step_expr),
3363                                     &stmts, true, NULL_TREE);
3364   if (stmts)
3365     {
3366       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3367       gcc_assert (!new_bb);
3368     }
3369
3370   /* Find the first insertion point in the BB.  */
3371   si = gsi_after_labels (bb);
3372
3373   /* Create the vector that holds the initial_value of the induction.  */
3374   if (nested_in_vect_loop)
3375     {
3376       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3377          been created during vectorization of previous stmts.  We obtain it
3378          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3379       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3380       /* If the initial value is not of proper type, convert it.  */
3381       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3382         {
3383           new_stmt = gimple_build_assign_with_ops
3384               (VIEW_CONVERT_EXPR,
3385                vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_"),
3386                build1 (VIEW_CONVERT_EXPR, vectype, vec_init));
3387           vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3388           gimple_assign_set_lhs (new_stmt, vec_init);
3389           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3390                                                  new_stmt);
3391           gcc_assert (!new_bb);
3392           set_vinfo_for_stmt (new_stmt,
3393                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3394         }
3395     }
3396   else
3397     {
3398       vec<constructor_elt, va_gc> *v;
3399
3400       /* iv_loop is the loop to be vectorized. Create:
3401          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3402       new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3403                                        vect_scalar_var, "var_");
3404       new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3405                                                      init_expr),
3406                                        &stmts, false, new_var);
3407       if (stmts)
3408         {
3409           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3410           gcc_assert (!new_bb);
3411         }
3412
3413       vec_alloc (v, nunits);
3414       bool constant_p = is_gimple_min_invariant (new_name);
3415       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3416       for (i = 1; i < nunits; i++)
3417         {
3418           /* Create: new_name_i = new_name + step_expr  */
3419           new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3420                                   new_name, step_expr);
3421           if (!is_gimple_min_invariant (new_name))
3422             {
3423               init_stmt = gimple_build_assign (new_var, new_name);
3424               new_name = make_ssa_name (new_var, init_stmt);
3425               gimple_assign_set_lhs (init_stmt, new_name);
3426               new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3427               gcc_assert (!new_bb);
3428               if (dump_enabled_p ())
3429                 {
3430                   dump_printf_loc (MSG_NOTE, vect_location,
3431                                    "created new init_stmt: ");
3432                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3433                   dump_printf (MSG_NOTE, "\n");
3434                 }
3435               constant_p = false;
3436             }
3437           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3438         }
3439       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3440       if (constant_p)
3441         new_vec = build_vector_from_ctor (vectype, v);
3442       else
3443         new_vec = build_constructor (vectype, v);
3444       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3445     }
3446
3447
3448   /* Create the vector that holds the step of the induction.  */
3449   if (nested_in_vect_loop)
3450     /* iv_loop is nested in the loop to be vectorized. Generate:
3451        vec_step = [S, S, S, S]  */
3452     new_name = step_expr;
3453   else
3454     {
3455       /* iv_loop is the loop to be vectorized. Generate:
3456           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3457       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3458         {
3459           expr = build_int_cst (integer_type_node, vf);
3460           expr = fold_convert (TREE_TYPE (step_expr), expr);
3461         }
3462       else
3463         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3464       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3465                               expr, step_expr);
3466       if (TREE_CODE (step_expr) == SSA_NAME)
3467         new_name = vect_init_vector (iv_phi, new_name,
3468                                      TREE_TYPE (step_expr), NULL);
3469     }
3470
3471   t = unshare_expr (new_name);
3472   gcc_assert (CONSTANT_CLASS_P (new_name)
3473               || TREE_CODE (new_name) == SSA_NAME);
3474   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3475   gcc_assert (stepvectype);
3476   new_vec = build_vector_from_val (stepvectype, t);
3477   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3478
3479
3480   /* Create the following def-use cycle:
3481      loop prolog:
3482          vec_init = ...
3483          vec_step = ...
3484      loop:
3485          vec_iv = PHI <vec_init, vec_loop>
3486          ...
3487          STMT
3488          ...
3489          vec_loop = vec_iv + vec_step;  */
3490
3491   /* Create the induction-phi that defines the induction-operand.  */
3492   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3493   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3494   set_vinfo_for_stmt (induction_phi,
3495                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3496   induc_def = PHI_RESULT (induction_phi);
3497
3498   /* Create the iv update inside the loop  */
3499   new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3500                                            induc_def, vec_step);
3501   vec_def = make_ssa_name (vec_dest, new_stmt);
3502   gimple_assign_set_lhs (new_stmt, vec_def);
3503   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3504   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3505                                                    NULL));
3506
3507   /* Set the arguments of the phi node:  */
3508   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3509   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3510                UNKNOWN_LOCATION);
3511
3512
3513   /* In case that vectorization factor (VF) is bigger than the number
3514      of elements that we can fit in a vectype (nunits), we have to generate
3515      more than one vector stmt - i.e - we need to "unroll" the
3516      vector stmt by a factor VF/nunits.  For more details see documentation
3517      in vectorizable_operation.  */
3518
3519   if (ncopies > 1)
3520     {
3521       stmt_vec_info prev_stmt_vinfo;
3522       /* FORNOW. This restriction should be relaxed.  */
3523       gcc_assert (!nested_in_vect_loop);
3524
3525       /* Create the vector that holds the step of the induction.  */
3526       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3527         {
3528           expr = build_int_cst (integer_type_node, nunits);
3529           expr = fold_convert (TREE_TYPE (step_expr), expr);
3530         }
3531       else
3532         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3533       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3534                               expr, step_expr);
3535       if (TREE_CODE (step_expr) == SSA_NAME)
3536         new_name = vect_init_vector (iv_phi, new_name,
3537                                      TREE_TYPE (step_expr), NULL);
3538       t = unshare_expr (new_name);
3539       gcc_assert (CONSTANT_CLASS_P (new_name)
3540                   || TREE_CODE (new_name) == SSA_NAME);
3541       new_vec = build_vector_from_val (stepvectype, t);
3542       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3543
3544       vec_def = induc_def;
3545       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3546       for (i = 1; i < ncopies; i++)
3547         {
3548           /* vec_i = vec_prev + vec_step  */
3549           new_stmt = gimple_build_assign_with_ops (PLUS_EXPR, vec_dest,
3550                                                    vec_def, vec_step);
3551           vec_def = make_ssa_name (vec_dest, new_stmt);
3552           gimple_assign_set_lhs (new_stmt, vec_def);
3553
3554           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3555           if (!useless_type_conversion_p (resvectype, vectype))
3556             {
3557               new_stmt = gimple_build_assign_with_ops
3558                   (VIEW_CONVERT_EXPR,
3559                    vect_get_new_vect_var (resvectype, vect_simple_var,
3560                                           "vec_iv_"),
3561                    build1 (VIEW_CONVERT_EXPR, resvectype,
3562                            gimple_assign_lhs (new_stmt)));
3563               gimple_assign_set_lhs (new_stmt,
3564                                      make_ssa_name
3565                                        (gimple_assign_lhs (new_stmt), new_stmt));
3566               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3567             }
3568           set_vinfo_for_stmt (new_stmt,
3569                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3570           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3571           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3572         }
3573     }
3574
3575   if (nested_in_vect_loop)
3576     {
3577       /* Find the loop-closed exit-phi of the induction, and record
3578          the final vector of induction results:  */
3579       exit_phi = NULL;
3580       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3581         {
3582           gimple use_stmt = USE_STMT (use_p);
3583           if (is_gimple_debug (use_stmt))
3584             continue;
3585
3586           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3587             {
3588               exit_phi = use_stmt;
3589               break;
3590             }
3591         }
3592       if (exit_phi)
3593         {
3594           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3595           /* FORNOW. Currently not supporting the case that an inner-loop induction
3596              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3597           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3598                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3599
3600           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3601           if (dump_enabled_p ())
3602             {
3603               dump_printf_loc (MSG_NOTE, vect_location,
3604                                "vector of inductions after inner-loop:");
3605               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3606               dump_printf (MSG_NOTE, "\n");
3607             }
3608         }
3609     }
3610
3611
3612   if (dump_enabled_p ())
3613     {
3614       dump_printf_loc (MSG_NOTE, vect_location,
3615                        "transform induction: created def-use cycle: ");
3616       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3617       dump_printf (MSG_NOTE, "\n");
3618       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3619                         SSA_NAME_DEF_STMT (vec_def), 0);
3620       dump_printf (MSG_NOTE, "\n");
3621     }
3622
3623   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3624   if (!useless_type_conversion_p (resvectype, vectype))
3625     {
3626       new_stmt = gimple_build_assign_with_ops
3627          (VIEW_CONVERT_EXPR,
3628           vect_get_new_vect_var (resvectype, vect_simple_var, "vec_iv_"),
3629           build1 (VIEW_CONVERT_EXPR, resvectype, induc_def));
3630       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3631       gimple_assign_set_lhs (new_stmt, induc_def);
3632       si = gsi_after_labels (bb);
3633       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3634       set_vinfo_for_stmt (new_stmt,
3635                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3636       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3637         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3638     }
3639
3640   return induc_def;
3641 }
3642
3643
3644 /* Function get_initial_def_for_reduction
3645
3646    Input:
3647    STMT - a stmt that performs a reduction operation in the loop.
3648    INIT_VAL - the initial value of the reduction variable
3649
3650    Output:
3651    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3652         of the reduction (used for adjusting the epilog - see below).
3653    Return a vector variable, initialized according to the operation that STMT
3654         performs. This vector will be used as the initial value of the
3655         vector of partial results.
3656
3657    Option1 (adjust in epilog): Initialize the vector as follows:
3658      add/bit or/xor:    [0,0,...,0,0]
3659      mult/bit and:      [1,1,...,1,1]
3660      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3661    and when necessary (e.g. add/mult case) let the caller know
3662    that it needs to adjust the result by init_val.
3663
3664    Option2: Initialize the vector as follows:
3665      add/bit or/xor:    [init_val,0,0,...,0]
3666      mult/bit and:      [init_val,1,1,...,1]
3667      min/max/cond_expr: [init_val,init_val,...,init_val]
3668    and no adjustments are needed.
3669
3670    For example, for the following code:
3671
3672    s = init_val;
3673    for (i=0;i<n;i++)
3674      s = s + a[i];
3675
3676    STMT is 's = s + a[i]', and the reduction variable is 's'.
3677    For a vector of 4 units, we want to return either [0,0,0,init_val],
3678    or [0,0,0,0] and let the caller know that it needs to adjust
3679    the result at the end by 'init_val'.
3680
3681    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3682    initialization vector is simpler (same element in all entries), if
3683    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3684
3685    A cost model should help decide between these two schemes.  */
3686
3687 tree
3688 get_initial_def_for_reduction (gimple stmt, tree init_val,
3689                                tree *adjustment_def)
3690 {
3691   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3692   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3693   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3694   tree scalar_type = TREE_TYPE (init_val);
3695   tree vectype = get_vectype_for_scalar_type (scalar_type);
3696   int nunits;
3697   enum tree_code code = gimple_assign_rhs_code (stmt);
3698   tree def_for_init;
3699   tree init_def;
3700   tree *elts;
3701   int i;
3702   bool nested_in_vect_loop = false;
3703   tree init_value;
3704   REAL_VALUE_TYPE real_init_val = dconst0;
3705   int int_init_val = 0;
3706   gimple def_stmt = NULL;
3707
3708   gcc_assert (vectype);
3709   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3710
3711   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3712               || SCALAR_FLOAT_TYPE_P (scalar_type));
3713
3714   if (nested_in_vect_loop_p (loop, stmt))
3715     nested_in_vect_loop = true;
3716   else
3717     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3718
3719   /* In case of double reduction we only create a vector variable to be put
3720      in the reduction phi node.  The actual statement creation is done in
3721      vect_create_epilog_for_reduction.  */
3722   if (adjustment_def && nested_in_vect_loop
3723       && TREE_CODE (init_val) == SSA_NAME
3724       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3725       && gimple_code (def_stmt) == GIMPLE_PHI
3726       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3727       && vinfo_for_stmt (def_stmt)
3728       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3729           == vect_double_reduction_def)
3730     {
3731       *adjustment_def = NULL;
3732       return vect_create_destination_var (init_val, vectype);
3733     }
3734
3735   if (TREE_CONSTANT (init_val))
3736     {
3737       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3738         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3739       else
3740         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3741     }
3742   else
3743     init_value = init_val;
3744
3745   switch (code)
3746     {
3747       case WIDEN_SUM_EXPR:
3748       case DOT_PROD_EXPR:
3749       case SAD_EXPR:
3750       case PLUS_EXPR:
3751       case MINUS_EXPR:
3752       case BIT_IOR_EXPR:
3753       case BIT_XOR_EXPR:
3754       case MULT_EXPR:
3755       case BIT_AND_EXPR:
3756         /* ADJUSMENT_DEF is NULL when called from
3757            vect_create_epilog_for_reduction to vectorize double reduction.  */
3758         if (adjustment_def)
3759           {
3760             if (nested_in_vect_loop)
3761               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3762                                                               NULL);
3763             else
3764               *adjustment_def = init_val;
3765           }
3766
3767         if (code == MULT_EXPR)
3768           {
3769             real_init_val = dconst1;
3770             int_init_val = 1;
3771           }
3772
3773         if (code == BIT_AND_EXPR)
3774           int_init_val = -1;
3775
3776         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3777           def_for_init = build_real (scalar_type, real_init_val);
3778         else
3779           def_for_init = build_int_cst (scalar_type, int_init_val);
3780
3781         /* Create a vector of '0' or '1' except the first element.  */
3782         elts = XALLOCAVEC (tree, nunits);
3783         for (i = nunits - 2; i >= 0; --i)
3784           elts[i + 1] = def_for_init;
3785
3786         /* Option1: the first element is '0' or '1' as well.  */
3787         if (adjustment_def)
3788           {
3789             elts[0] = def_for_init;
3790             init_def = build_vector (vectype, elts);
3791             break;
3792           }
3793
3794         /* Option2: the first element is INIT_VAL.  */
3795         elts[0] = init_val;
3796         if (TREE_CONSTANT (init_val))
3797           init_def = build_vector (vectype, elts);
3798         else
3799           {
3800             vec<constructor_elt, va_gc> *v;
3801             vec_alloc (v, nunits);
3802             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3803             for (i = 1; i < nunits; ++i)
3804               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3805             init_def = build_constructor (vectype, v);
3806           }
3807
3808         break;
3809
3810       case MIN_EXPR:
3811       case MAX_EXPR:
3812       case COND_EXPR:
3813         if (adjustment_def)
3814           {
3815             *adjustment_def = NULL_TREE;
3816             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3817             break;
3818           }
3819
3820         init_def = build_vector_from_val (vectype, init_value);
3821         break;
3822
3823       default:
3824         gcc_unreachable ();
3825     }
3826
3827   return init_def;
3828 }
3829
3830 /* Function vect_create_epilog_for_reduction
3831
3832    Create code at the loop-epilog to finalize the result of a reduction
3833    computation.
3834
3835    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3836      reduction statements.
3837    STMT is the scalar reduction stmt that is being vectorized.
3838    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3839      number of elements that we can fit in a vectype (nunits).  In this case
3840      we have to generate more than one vector stmt - i.e - we need to "unroll"
3841      the vector stmt by a factor VF/nunits.  For more details see documentation
3842      in vectorizable_operation.
3843    REDUC_CODE is the tree-code for the epilog reduction.
3844    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3845      computation.
3846    REDUC_INDEX is the index of the operand in the right hand side of the
3847      statement that is defined by REDUCTION_PHI.
3848    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3849    SLP_NODE is an SLP node containing a group of reduction statements. The
3850      first one in this group is STMT.
3851
3852    This function:
3853    1. Creates the reduction def-use cycles: sets the arguments for
3854       REDUCTION_PHIS:
3855       The loop-entry argument is the vectorized initial-value of the reduction.
3856       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3857       sums.
3858    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3859       by applying the operation specified by REDUC_CODE if available, or by
3860       other means (whole-vector shifts or a scalar loop).
3861       The function also creates a new phi node at the loop exit to preserve
3862       loop-closed form, as illustrated below.
3863
3864      The flow at the entry to this function:
3865
3866         loop:
3867           vec_def = phi <null, null>            # REDUCTION_PHI
3868           VECT_DEF = vector_stmt                # vectorized form of STMT
3869           s_loop = scalar_stmt                  # (scalar) STMT
3870         loop_exit:
3871           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3872           use <s_out0>
3873           use <s_out0>
3874
3875      The above is transformed by this function into:
3876
3877         loop:
3878           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3879           VECT_DEF = vector_stmt                # vectorized form of STMT
3880           s_loop = scalar_stmt                  # (scalar) STMT
3881         loop_exit:
3882           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3883           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3884           v_out2 = reduce <v_out1>
3885           s_out3 = extract_field <v_out2, 0>
3886           s_out4 = adjust_result <s_out3>
3887           use <s_out4>
3888           use <s_out4>
3889 */
3890
3891 static void
3892 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3893                                   int ncopies, enum tree_code reduc_code,
3894                                   vec<gimple> reduction_phis,
3895                                   int reduc_index, bool double_reduc,
3896                                   slp_tree slp_node)
3897 {
3898   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3899   stmt_vec_info prev_phi_info;
3900   tree vectype;
3901   machine_mode mode;
3902   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3903   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3904   basic_block exit_bb;
3905   tree scalar_dest;
3906   tree scalar_type;
3907   gimple new_phi = NULL, phi;
3908   gimple_stmt_iterator exit_gsi;
3909   tree vec_dest;
3910   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3911   gimple epilog_stmt = NULL;
3912   enum tree_code code = gimple_assign_rhs_code (stmt);
3913   gimple exit_phi;
3914   tree bitsize;
3915   tree adjustment_def = NULL;
3916   tree vec_initial_def = NULL;
3917   tree reduction_op, expr, def;
3918   tree orig_name, scalar_result;
3919   imm_use_iterator imm_iter, phi_imm_iter;
3920   use_operand_p use_p, phi_use_p;
3921   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3922   bool nested_in_vect_loop = false;
3923   auto_vec<gimple> new_phis;
3924   auto_vec<gimple> inner_phis;
3925   enum vect_def_type dt = vect_unknown_def_type;
3926   int j, i;
3927   auto_vec<tree> scalar_results;
3928   unsigned int group_size = 1, k, ratio;
3929   auto_vec<tree> vec_initial_defs;
3930   auto_vec<gimple> phis;
3931   bool slp_reduc = false;
3932   tree new_phi_result;
3933   gimple inner_phi = NULL;
3934
3935   if (slp_node)
3936     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3937
3938   if (nested_in_vect_loop_p (loop, stmt))
3939     {
3940       outer_loop = loop;
3941       loop = loop->inner;
3942       nested_in_vect_loop = true;
3943       gcc_assert (!slp_node);
3944     }
3945
3946   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3947     {
3948     case GIMPLE_SINGLE_RHS:
3949       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3950                   == ternary_op);
3951       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3952       break;
3953     case GIMPLE_UNARY_RHS:
3954       reduction_op = gimple_assign_rhs1 (stmt);
3955       break;
3956     case GIMPLE_BINARY_RHS:
3957       reduction_op = reduc_index ?
3958                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3959       break;
3960     case GIMPLE_TERNARY_RHS:
3961       reduction_op = gimple_op (stmt, reduc_index + 1);
3962       break;
3963     default:
3964       gcc_unreachable ();
3965     }
3966
3967   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3968   gcc_assert (vectype);
3969   mode = TYPE_MODE (vectype);
3970
3971   /* 1. Create the reduction def-use cycle:
3972      Set the arguments of REDUCTION_PHIS, i.e., transform
3973
3974         loop:
3975           vec_def = phi <null, null>            # REDUCTION_PHI
3976           VECT_DEF = vector_stmt                # vectorized form of STMT
3977           ...
3978
3979      into:
3980
3981         loop:
3982           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3983           VECT_DEF = vector_stmt                # vectorized form of STMT
3984           ...
3985
3986      (in case of SLP, do it for all the phis). */
3987
3988   /* Get the loop-entry arguments.  */
3989   if (slp_node)
3990     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3991                        NULL, slp_node, reduc_index);
3992   else
3993     {
3994       vec_initial_defs.create (1);
3995      /* For the case of reduction, vect_get_vec_def_for_operand returns
3996         the scalar def before the loop, that defines the initial value
3997         of the reduction variable.  */
3998       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
3999                                                       &adjustment_def);
4000       vec_initial_defs.quick_push (vec_initial_def);
4001     }
4002
4003   /* Set phi nodes arguments.  */
4004   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4005     {
4006       tree vec_init_def, def;
4007       gimple_seq stmts;
4008       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4009                                            true, NULL_TREE);
4010       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4011       def = vect_defs[i];
4012       for (j = 0; j < ncopies; j++)
4013         {
4014           /* Set the loop-entry arg of the reduction-phi.  */
4015           add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4016                        loop_preheader_edge (loop), UNKNOWN_LOCATION);
4017
4018           /* Set the loop-latch arg for the reduction-phi.  */
4019           if (j > 0)
4020             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4021
4022           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4023                        UNKNOWN_LOCATION);
4024
4025           if (dump_enabled_p ())
4026             {
4027               dump_printf_loc (MSG_NOTE, vect_location,
4028                                "transform reduction: created def-use cycle: ");
4029               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4030               dump_printf (MSG_NOTE, "\n");
4031               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4032               dump_printf (MSG_NOTE, "\n");
4033             }
4034
4035           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4036         }
4037     }
4038
4039   /* 2. Create epilog code.
4040         The reduction epilog code operates across the elements of the vector
4041         of partial results computed by the vectorized loop.
4042         The reduction epilog code consists of:
4043
4044         step 1: compute the scalar result in a vector (v_out2)
4045         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4046         step 3: adjust the scalar result (s_out3) if needed.
4047
4048         Step 1 can be accomplished using one the following three schemes:
4049           (scheme 1) using reduc_code, if available.
4050           (scheme 2) using whole-vector shifts, if available.
4051           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4052                      combined.
4053
4054           The overall epilog code looks like this:
4055
4056           s_out0 = phi <s_loop>         # original EXIT_PHI
4057           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4058           v_out2 = reduce <v_out1>              # step 1
4059           s_out3 = extract_field <v_out2, 0>    # step 2
4060           s_out4 = adjust_result <s_out3>       # step 3
4061
4062           (step 3 is optional, and steps 1 and 2 may be combined).
4063           Lastly, the uses of s_out0 are replaced by s_out4.  */
4064
4065
4066   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4067          v_out1 = phi <VECT_DEF>
4068          Store them in NEW_PHIS.  */
4069
4070   exit_bb = single_exit (loop)->dest;
4071   prev_phi_info = NULL;
4072   new_phis.create (vect_defs.length ());
4073   FOR_EACH_VEC_ELT (vect_defs, i, def)
4074     {
4075       for (j = 0; j < ncopies; j++)
4076         {
4077           tree new_def = copy_ssa_name (def, NULL);
4078           phi = create_phi_node (new_def, exit_bb);
4079           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4080           if (j == 0)
4081             new_phis.quick_push (phi);
4082           else
4083             {
4084               def = vect_get_vec_def_for_stmt_copy (dt, def);
4085               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4086             }
4087
4088           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4089           prev_phi_info = vinfo_for_stmt (phi);
4090         }
4091     }
4092
4093   /* The epilogue is created for the outer-loop, i.e., for the loop being
4094      vectorized.  Create exit phis for the outer loop.  */
4095   if (double_reduc)
4096     {
4097       loop = outer_loop;
4098       exit_bb = single_exit (loop)->dest;
4099       inner_phis.create (vect_defs.length ());
4100       FOR_EACH_VEC_ELT (new_phis, i, phi)
4101         {
4102           tree new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4103           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4104           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4105                            PHI_RESULT (phi));
4106           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4107                                                             loop_vinfo, NULL));
4108           inner_phis.quick_push (phi);
4109           new_phis[i] = outer_phi;
4110           prev_phi_info = vinfo_for_stmt (outer_phi);
4111           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4112             {
4113               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4114               new_result = copy_ssa_name (PHI_RESULT (phi), NULL);
4115               outer_phi = create_phi_node (new_result, exit_bb);
4116               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4117                                PHI_RESULT (phi));
4118               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4119                                                         loop_vinfo, NULL));
4120               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4121               prev_phi_info = vinfo_for_stmt (outer_phi);
4122             }
4123         }
4124     }
4125
4126   exit_gsi = gsi_after_labels (exit_bb);
4127
4128   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4129          (i.e. when reduc_code is not available) and in the final adjustment
4130          code (if needed).  Also get the original scalar reduction variable as
4131          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4132          represents a reduction pattern), the tree-code and scalar-def are
4133          taken from the original stmt that the pattern-stmt (STMT) replaces.
4134          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4135          are taken from STMT.  */
4136
4137   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4138   if (!orig_stmt)
4139     {
4140       /* Regular reduction  */
4141       orig_stmt = stmt;
4142     }
4143   else
4144     {
4145       /* Reduction pattern  */
4146       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4147       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4148       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4149     }
4150
4151   code = gimple_assign_rhs_code (orig_stmt);
4152   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4153      partial results are added and not subtracted.  */
4154   if (code == MINUS_EXPR)
4155     code = PLUS_EXPR;
4156
4157   scalar_dest = gimple_assign_lhs (orig_stmt);
4158   scalar_type = TREE_TYPE (scalar_dest);
4159   scalar_results.create (group_size);
4160   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4161   bitsize = TYPE_SIZE (scalar_type);
4162
4163   /* In case this is a reduction in an inner-loop while vectorizing an outer
4164      loop - we don't need to extract a single scalar result at the end of the
4165      inner-loop (unless it is double reduction, i.e., the use of reduction is
4166      outside the outer-loop).  The final vector of partial results will be used
4167      in the vectorized outer-loop, or reduced to a scalar result at the end of
4168      the outer-loop.  */
4169   if (nested_in_vect_loop && !double_reduc)
4170     goto vect_finalize_reduction;
4171
4172   /* SLP reduction without reduction chain, e.g.,
4173      # a1 = phi <a2, a0>
4174      # b1 = phi <b2, b0>
4175      a2 = operation (a1)
4176      b2 = operation (b1)  */
4177   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4178
4179   /* In case of reduction chain, e.g.,
4180      # a1 = phi <a3, a0>
4181      a2 = operation (a1)
4182      a3 = operation (a2),
4183
4184      we may end up with more than one vector result.  Here we reduce them to
4185      one vector.  */
4186   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4187     {
4188       tree first_vect = PHI_RESULT (new_phis[0]);
4189       tree tmp;
4190       gassign *new_vec_stmt = NULL;
4191
4192       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4193       for (k = 1; k < new_phis.length (); k++)
4194         {
4195           gimple next_phi = new_phis[k];
4196           tree second_vect = PHI_RESULT (next_phi);
4197
4198           tmp = build2 (code, vectype,  first_vect, second_vect);
4199           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4200           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4201           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4202           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4203         }
4204
4205       new_phi_result = first_vect;
4206       if (new_vec_stmt)
4207         {
4208           new_phis.truncate (0);
4209           new_phis.safe_push (new_vec_stmt);
4210         }
4211     }
4212   else
4213     new_phi_result = PHI_RESULT (new_phis[0]);
4214
4215   /* 2.3 Create the reduction code, using one of the three schemes described
4216          above. In SLP we simply need to extract all the elements from the
4217          vector (without reducing them), so we use scalar shifts.  */
4218   if (reduc_code != ERROR_MARK && !slp_reduc)
4219     {
4220       tree tmp;
4221       tree vec_elem_type;
4222
4223       /*** Case 1:  Create:
4224            v_out2 = reduc_expr <v_out1>  */
4225
4226       if (dump_enabled_p ())
4227         dump_printf_loc (MSG_NOTE, vect_location,
4228                          "Reduce using direct vector reduction.\n");
4229
4230       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4231       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4232         {
4233           tree tmp_dest =
4234               vect_create_destination_var (scalar_dest, vec_elem_type);
4235           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4236           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4237           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4238           gimple_assign_set_lhs (epilog_stmt, new_temp);
4239           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4240
4241           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4242         }
4243       else
4244         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4245       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4246       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4247       gimple_assign_set_lhs (epilog_stmt, new_temp);
4248       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4249       scalar_results.safe_push (new_temp);
4250     }
4251   else
4252     {
4253       bool reduce_with_shift = have_whole_vector_shift (mode);
4254       int element_bitsize = tree_to_uhwi (bitsize);
4255       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4256       tree vec_temp;
4257
4258       /* Regardless of whether we have a whole vector shift, if we're
4259          emulating the operation via tree-vect-generic, we don't want
4260          to use it.  Only the first round of the reduction is likely
4261          to still be profitable via emulation.  */
4262       /* ??? It might be better to emit a reduction tree code here, so that
4263          tree-vect-generic can expand the first round via bit tricks.  */
4264       if (!VECTOR_MODE_P (mode))
4265         reduce_with_shift = false;
4266       else
4267         {
4268           optab optab = optab_for_tree_code (code, vectype, optab_default);
4269           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4270             reduce_with_shift = false;
4271         }
4272
4273       if (reduce_with_shift && !slp_reduc)
4274         {
4275           int nelements = vec_size_in_bits / element_bitsize;
4276           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4277
4278           int elt_offset;
4279
4280           tree zero_vec = build_zero_cst (vectype);
4281           /*** Case 2: Create:
4282              for (offset = nelements/2; offset >= 1; offset/=2)
4283                 {
4284                   Create:  va' = vec_shift <va, offset>
4285                   Create:  va = vop <va, va'>
4286                 }  */
4287
4288           tree rhs;
4289
4290           if (dump_enabled_p ())
4291             dump_printf_loc (MSG_NOTE, vect_location,
4292                              "Reduce using vector shifts\n");
4293
4294           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4295           new_temp = new_phi_result;
4296           for (elt_offset = nelements / 2;
4297                elt_offset >= 1;
4298                elt_offset /= 2)
4299             {
4300               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4301               tree mask = vect_gen_perm_mask_any (vectype, sel);
4302               epilog_stmt = gimple_build_assign_with_ops (VEC_PERM_EXPR,
4303                                                           vec_dest, new_temp,
4304                                                           zero_vec, mask);
4305               new_name = make_ssa_name (vec_dest, epilog_stmt);
4306               gimple_assign_set_lhs (epilog_stmt, new_name);
4307               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4308
4309               epilog_stmt = gimple_build_assign_with_ops (code, vec_dest,
4310                                                           new_name, new_temp);
4311               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4312               gimple_assign_set_lhs (epilog_stmt, new_temp);
4313               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4314             }
4315
4316           /* 2.4  Extract the final scalar result.  Create:
4317              s_out3 = extract_field <v_out2, bitpos>  */
4318
4319           if (dump_enabled_p ())
4320             dump_printf_loc (MSG_NOTE, vect_location,
4321                              "extract scalar result\n");
4322
4323           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4324                         bitsize, bitsize_zero_node);
4325           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4326           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4327           gimple_assign_set_lhs (epilog_stmt, new_temp);
4328           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4329           scalar_results.safe_push (new_temp);
4330         }
4331       else
4332         {
4333           /*** Case 3: Create:
4334              s = extract_field <v_out2, 0>
4335              for (offset = element_size;
4336                   offset < vector_size;
4337                   offset += element_size;)
4338                {
4339                  Create:  s' = extract_field <v_out2, offset>
4340                  Create:  s = op <s, s'>  // For non SLP cases
4341                }  */
4342
4343           if (dump_enabled_p ())
4344             dump_printf_loc (MSG_NOTE, vect_location,
4345                              "Reduce using scalar code.\n");
4346
4347           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4348           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4349             {
4350               int bit_offset;
4351               if (gimple_code (new_phi) == GIMPLE_PHI)
4352                 vec_temp = PHI_RESULT (new_phi);
4353               else
4354                 vec_temp = gimple_assign_lhs (new_phi);
4355               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4356                             bitsize_zero_node);
4357               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4358               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4359               gimple_assign_set_lhs (epilog_stmt, new_temp);
4360               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4361
4362               /* In SLP we don't need to apply reduction operation, so we just
4363                  collect s' values in SCALAR_RESULTS.  */
4364               if (slp_reduc)
4365                 scalar_results.safe_push (new_temp);
4366
4367               for (bit_offset = element_bitsize;
4368                    bit_offset < vec_size_in_bits;
4369                    bit_offset += element_bitsize)
4370                 {
4371                   tree bitpos = bitsize_int (bit_offset);
4372                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4373                                      bitsize, bitpos);
4374
4375                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4376                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4377                   gimple_assign_set_lhs (epilog_stmt, new_name);
4378                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4379
4380                   if (slp_reduc)
4381                     {
4382                       /* In SLP we don't need to apply reduction operation, so
4383                          we just collect s' values in SCALAR_RESULTS.  */
4384                       new_temp = new_name;
4385                       scalar_results.safe_push (new_name);
4386                     }
4387                   else
4388                     {
4389                       epilog_stmt = gimple_build_assign_with_ops (code,
4390                                           new_scalar_dest, new_name, new_temp);
4391                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4392                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4393                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4394                     }
4395                 }
4396             }
4397
4398           /* The only case where we need to reduce scalar results in SLP, is
4399              unrolling.  If the size of SCALAR_RESULTS is greater than
4400              GROUP_SIZE, we reduce them combining elements modulo
4401              GROUP_SIZE.  */
4402           if (slp_reduc)
4403             {
4404               tree res, first_res, new_res;
4405               gimple new_stmt;
4406
4407               /* Reduce multiple scalar results in case of SLP unrolling.  */
4408               for (j = group_size; scalar_results.iterate (j, &res);
4409                    j++)
4410                 {
4411                   first_res = scalar_results[j % group_size];
4412                   new_stmt = gimple_build_assign_with_ops (code,
4413                                               new_scalar_dest, first_res, res);
4414                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4415                   gimple_assign_set_lhs (new_stmt, new_res);
4416                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4417                   scalar_results[j % group_size] = new_res;
4418                 }
4419             }
4420           else
4421             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4422             scalar_results.safe_push (new_temp);
4423         }
4424     }
4425
4426 vect_finalize_reduction:
4427
4428   if (double_reduc)
4429     loop = loop->inner;
4430
4431   /* 2.5 Adjust the final result by the initial value of the reduction
4432          variable. (When such adjustment is not needed, then
4433          'adjustment_def' is zero).  For example, if code is PLUS we create:
4434          new_temp = loop_exit_def + adjustment_def  */
4435
4436   if (adjustment_def)
4437     {
4438       gcc_assert (!slp_reduc);
4439       if (nested_in_vect_loop)
4440         {
4441           new_phi = new_phis[0];
4442           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4443           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4444           new_dest = vect_create_destination_var (scalar_dest, vectype);
4445         }
4446       else
4447         {
4448           new_temp = scalar_results[0];
4449           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4450           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4451           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4452         }
4453
4454       epilog_stmt = gimple_build_assign (new_dest, expr);
4455       new_temp = make_ssa_name (new_dest, epilog_stmt);
4456       gimple_assign_set_lhs (epilog_stmt, new_temp);
4457       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4458       if (nested_in_vect_loop)
4459         {
4460           set_vinfo_for_stmt (epilog_stmt,
4461                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4462                                                  NULL));
4463           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4464                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4465
4466           if (!double_reduc)
4467             scalar_results.quick_push (new_temp);
4468           else
4469             scalar_results[0] = new_temp;
4470         }
4471       else
4472         scalar_results[0] = new_temp;
4473
4474       new_phis[0] = epilog_stmt;
4475     }
4476
4477   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4478           phis with new adjusted scalar results, i.e., replace use <s_out0>
4479           with use <s_out4>.
4480
4481      Transform:
4482         loop_exit:
4483           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4484           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4485           v_out2 = reduce <v_out1>
4486           s_out3 = extract_field <v_out2, 0>
4487           s_out4 = adjust_result <s_out3>
4488           use <s_out0>
4489           use <s_out0>
4490
4491      into:
4492
4493         loop_exit:
4494           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4495           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4496           v_out2 = reduce <v_out1>
4497           s_out3 = extract_field <v_out2, 0>
4498           s_out4 = adjust_result <s_out3>
4499           use <s_out4>
4500           use <s_out4> */
4501
4502
4503   /* In SLP reduction chain we reduce vector results into one vector if
4504      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4505      the last stmt in the reduction chain, since we are looking for the loop
4506      exit phi node.  */
4507   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4508     {
4509       scalar_dest = gimple_assign_lhs (
4510                         SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4511       group_size = 1;
4512     }
4513
4514   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4515      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4516      need to match SCALAR_RESULTS with corresponding statements.  The first
4517      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4518      the first vector stmt, etc.
4519      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4520   if (group_size > new_phis.length ())
4521     {
4522       ratio = group_size / new_phis.length ();
4523       gcc_assert (!(group_size % new_phis.length ()));
4524     }
4525   else
4526     ratio = 1;
4527
4528   for (k = 0; k < group_size; k++)
4529     {
4530       if (k % ratio == 0)
4531         {
4532           epilog_stmt = new_phis[k / ratio];
4533           reduction_phi = reduction_phis[k / ratio];
4534           if (double_reduc)
4535             inner_phi = inner_phis[k / ratio];
4536         }
4537
4538       if (slp_reduc)
4539         {
4540           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4541
4542           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4543           /* SLP statements can't participate in patterns.  */
4544           gcc_assert (!orig_stmt);
4545           scalar_dest = gimple_assign_lhs (current_stmt);
4546         }
4547
4548       phis.create (3);
4549       /* Find the loop-closed-use at the loop exit of the original scalar
4550          result.  (The reduction result is expected to have two immediate uses -
4551          one at the latch block, and one at the loop exit).  */
4552       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4553         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4554             && !is_gimple_debug (USE_STMT (use_p)))
4555           phis.safe_push (USE_STMT (use_p));
4556
4557       /* While we expect to have found an exit_phi because of loop-closed-ssa
4558          form we can end up without one if the scalar cycle is dead.  */
4559
4560       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4561         {
4562           if (outer_loop)
4563             {
4564               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4565               gphi *vect_phi;
4566
4567               /* FORNOW. Currently not supporting the case that an inner-loop
4568                  reduction is not used in the outer-loop (but only outside the
4569                  outer-loop), unless it is double reduction.  */
4570               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4571                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4572                           || double_reduc);
4573
4574               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4575               if (!double_reduc
4576                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4577                       != vect_double_reduction_def)
4578                 continue;
4579
4580               /* Handle double reduction:
4581
4582                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4583                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4584                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4585                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4586
4587                  At that point the regular reduction (stmt2 and stmt3) is
4588                  already vectorized, as well as the exit phi node, stmt4.
4589                  Here we vectorize the phi node of double reduction, stmt1, and
4590                  update all relevant statements.  */
4591
4592               /* Go through all the uses of s2 to find double reduction phi
4593                  node, i.e., stmt1 above.  */
4594               orig_name = PHI_RESULT (exit_phi);
4595               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4596                 {
4597                   stmt_vec_info use_stmt_vinfo;
4598                   stmt_vec_info new_phi_vinfo;
4599                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4600                   basic_block bb = gimple_bb (use_stmt);
4601                   gimple use;
4602
4603                   /* Check that USE_STMT is really double reduction phi
4604                      node.  */
4605                   if (gimple_code (use_stmt) != GIMPLE_PHI
4606                       || gimple_phi_num_args (use_stmt) != 2
4607                       || bb->loop_father != outer_loop)
4608                     continue;
4609                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4610                   if (!use_stmt_vinfo
4611                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4612                           != vect_double_reduction_def)
4613                     continue;
4614
4615                   /* Create vector phi node for double reduction:
4616                      vs1 = phi <vs0, vs2>
4617                      vs1 was created previously in this function by a call to
4618                        vect_get_vec_def_for_operand and is stored in
4619                        vec_initial_def;
4620                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4621                      vs0 is created here.  */
4622
4623                   /* Create vector phi node.  */
4624                   vect_phi = create_phi_node (vec_initial_def, bb);
4625                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4626                                     loop_vec_info_for_loop (outer_loop), NULL);
4627                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4628
4629                   /* Create vs0 - initial def of the double reduction phi.  */
4630                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4631                                              loop_preheader_edge (outer_loop));
4632                   init_def = get_initial_def_for_reduction (stmt,
4633                                                           preheader_arg, NULL);
4634                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4635                                                     vectype, NULL);
4636
4637                   /* Update phi node arguments with vs0 and vs2.  */
4638                   add_phi_arg (vect_phi, vect_phi_init,
4639                                loop_preheader_edge (outer_loop),
4640                                UNKNOWN_LOCATION);
4641                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4642                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4643                   if (dump_enabled_p ())
4644                     {
4645                       dump_printf_loc (MSG_NOTE, vect_location,
4646                                        "created double reduction phi node: ");
4647                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4648                       dump_printf (MSG_NOTE, "\n");
4649                     }
4650
4651                   vect_phi_res = PHI_RESULT (vect_phi);
4652
4653                   /* Replace the use, i.e., set the correct vs1 in the regular
4654                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4655                      loop is redundant.  */
4656                   use = reduction_phi;
4657                   for (j = 0; j < ncopies; j++)
4658                     {
4659                       edge pr_edge = loop_preheader_edge (loop);
4660                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4661                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4662                     }
4663                 }
4664             }
4665         }
4666
4667       phis.release ();
4668       if (nested_in_vect_loop)
4669         {
4670           if (double_reduc)
4671             loop = outer_loop;
4672           else
4673             continue;
4674         }
4675
4676       phis.create (3);
4677       /* Find the loop-closed-use at the loop exit of the original scalar
4678          result.  (The reduction result is expected to have two immediate uses,
4679          one at the latch block, and one at the loop exit).  For double
4680          reductions we are looking for exit phis of the outer loop.  */
4681       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4682         {
4683           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4684             {
4685               if (!is_gimple_debug (USE_STMT (use_p)))
4686                 phis.safe_push (USE_STMT (use_p));
4687             }
4688           else
4689             {
4690               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4691                 {
4692                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4693
4694                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4695                     {
4696                       if (!flow_bb_inside_loop_p (loop,
4697                                              gimple_bb (USE_STMT (phi_use_p)))
4698                           && !is_gimple_debug (USE_STMT (phi_use_p)))
4699                         phis.safe_push (USE_STMT (phi_use_p));
4700                     }
4701                 }
4702             }
4703         }
4704
4705       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4706         {
4707           /* Replace the uses:  */
4708           orig_name = PHI_RESULT (exit_phi);
4709           scalar_result = scalar_results[k];
4710           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4711             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4712               SET_USE (use_p, scalar_result);
4713         }
4714
4715       phis.release ();
4716     }
4717 }
4718
4719
4720 /* Function vectorizable_reduction.
4721
4722    Check if STMT performs a reduction operation that can be vectorized.
4723    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4724    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4725    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4726
4727    This function also handles reduction idioms (patterns) that have been
4728    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4729    of this form:
4730      X = pattern_expr (arg0, arg1, ..., X)
4731    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4732    sequence that had been detected and replaced by the pattern-stmt (STMT).
4733
4734    In some cases of reduction patterns, the type of the reduction variable X is
4735    different than the type of the other arguments of STMT.
4736    In such cases, the vectype that is used when transforming STMT into a vector
4737    stmt is different than the vectype that is used to determine the
4738    vectorization factor, because it consists of a different number of elements
4739    than the actual number of elements that are being operated upon in parallel.
4740
4741    For example, consider an accumulation of shorts into an int accumulator.
4742    On some targets it's possible to vectorize this pattern operating on 8
4743    shorts at a time (hence, the vectype for purposes of determining the
4744    vectorization factor should be V8HI); on the other hand, the vectype that
4745    is used to create the vector form is actually V4SI (the type of the result).
4746
4747    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4748    indicates what is the actual level of parallelism (V8HI in the example), so
4749    that the right vectorization factor would be derived.  This vectype
4750    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4751    be used to create the vectorized stmt.  The right vectype for the vectorized
4752    stmt is obtained from the type of the result X:
4753         get_vectype_for_scalar_type (TREE_TYPE (X))
4754
4755    This means that, contrary to "regular" reductions (or "regular" stmts in
4756    general), the following equation:
4757       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4758    does *NOT* necessarily hold for reduction patterns.  */
4759
4760 bool
4761 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4762                         gimple *vec_stmt, slp_tree slp_node)
4763 {
4764   tree vec_dest;
4765   tree scalar_dest;
4766   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4767   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4768   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4769   tree vectype_in = NULL_TREE;
4770   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4771   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4772   enum tree_code code, orig_code, epilog_reduc_code;
4773   machine_mode vec_mode;
4774   int op_type;
4775   optab optab, reduc_optab;
4776   tree new_temp = NULL_TREE;
4777   tree def;
4778   gimple def_stmt;
4779   enum vect_def_type dt;
4780   gphi *new_phi = NULL;
4781   tree scalar_type;
4782   bool is_simple_use;
4783   gimple orig_stmt;
4784   stmt_vec_info orig_stmt_info;
4785   tree expr = NULL_TREE;
4786   int i;
4787   int ncopies;
4788   int epilog_copies;
4789   stmt_vec_info prev_stmt_info, prev_phi_info;
4790   bool single_defuse_cycle = false;
4791   tree reduc_def = NULL_TREE;
4792   gimple new_stmt = NULL;
4793   int j;
4794   tree ops[3];
4795   bool nested_cycle = false, found_nested_cycle_def = false;
4796   gimple reduc_def_stmt = NULL;
4797   /* The default is that the reduction variable is the last in statement.  */
4798   int reduc_index = 2;
4799   bool double_reduc = false, dummy;
4800   basic_block def_bb;
4801   struct loop * def_stmt_loop, *outer_loop = NULL;
4802   tree def_arg;
4803   gimple def_arg_stmt;
4804   auto_vec<tree> vec_oprnds0;
4805   auto_vec<tree> vec_oprnds1;
4806   auto_vec<tree> vect_defs;
4807   auto_vec<gimple> phis;
4808   int vec_num;
4809   tree def0, def1, tem, op0, op1 = NULL_TREE;
4810
4811   /* In case of reduction chain we switch to the first stmt in the chain, but
4812      we don't update STMT_INFO, since only the last stmt is marked as reduction
4813      and has reduction properties.  */
4814   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4815     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4816
4817   if (nested_in_vect_loop_p (loop, stmt))
4818     {
4819       outer_loop = loop;
4820       loop = loop->inner;
4821       nested_cycle = true;
4822     }
4823
4824   /* 1. Is vectorizable reduction?  */
4825   /* Not supportable if the reduction variable is used in the loop, unless
4826      it's a reduction chain.  */
4827   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4828       && !GROUP_FIRST_ELEMENT (stmt_info))
4829     return false;
4830
4831   /* Reductions that are not used even in an enclosing outer-loop,
4832      are expected to be "live" (used out of the loop).  */
4833   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4834       && !STMT_VINFO_LIVE_P (stmt_info))
4835     return false;
4836
4837   /* Make sure it was already recognized as a reduction computation.  */
4838   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4839       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4840     return false;
4841
4842   /* 2. Has this been recognized as a reduction pattern?
4843
4844      Check if STMT represents a pattern that has been recognized
4845      in earlier analysis stages.  For stmts that represent a pattern,
4846      the STMT_VINFO_RELATED_STMT field records the last stmt in
4847      the original sequence that constitutes the pattern.  */
4848
4849   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4850   if (orig_stmt)
4851     {
4852       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4853       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4854       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4855     }
4856
4857   /* 3. Check the operands of the operation.  The first operands are defined
4858         inside the loop body. The last operand is the reduction variable,
4859         which is defined by the loop-header-phi.  */
4860
4861   gcc_assert (is_gimple_assign (stmt));
4862
4863   /* Flatten RHS.  */
4864   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4865     {
4866     case GIMPLE_SINGLE_RHS:
4867       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4868       if (op_type == ternary_op)
4869         {
4870           tree rhs = gimple_assign_rhs1 (stmt);
4871           ops[0] = TREE_OPERAND (rhs, 0);
4872           ops[1] = TREE_OPERAND (rhs, 1);
4873           ops[2] = TREE_OPERAND (rhs, 2);
4874           code = TREE_CODE (rhs);
4875         }
4876       else
4877         return false;
4878       break;
4879
4880     case GIMPLE_BINARY_RHS:
4881       code = gimple_assign_rhs_code (stmt);
4882       op_type = TREE_CODE_LENGTH (code);
4883       gcc_assert (op_type == binary_op);
4884       ops[0] = gimple_assign_rhs1 (stmt);
4885       ops[1] = gimple_assign_rhs2 (stmt);
4886       break;
4887
4888     case GIMPLE_TERNARY_RHS:
4889       code = gimple_assign_rhs_code (stmt);
4890       op_type = TREE_CODE_LENGTH (code);
4891       gcc_assert (op_type == ternary_op);
4892       ops[0] = gimple_assign_rhs1 (stmt);
4893       ops[1] = gimple_assign_rhs2 (stmt);
4894       ops[2] = gimple_assign_rhs3 (stmt);
4895       break;
4896
4897     case GIMPLE_UNARY_RHS:
4898       return false;
4899
4900     default:
4901       gcc_unreachable ();
4902     }
4903
4904   if (code == COND_EXPR && slp_node)
4905     return false;
4906
4907   scalar_dest = gimple_assign_lhs (stmt);
4908   scalar_type = TREE_TYPE (scalar_dest);
4909   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4910       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4911     return false;
4912
4913   /* Do not try to vectorize bit-precision reductions.  */
4914   if ((TYPE_PRECISION (scalar_type)
4915        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4916     return false;
4917
4918   /* All uses but the last are expected to be defined in the loop.
4919      The last use is the reduction variable.  In case of nested cycle this
4920      assumption is not true: we use reduc_index to record the index of the
4921      reduction variable.  */
4922   for (i = 0; i < op_type - 1; i++)
4923     {
4924       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4925       if (i == 0 && code == COND_EXPR)
4926         continue;
4927
4928       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4929                                             &def_stmt, &def, &dt, &tem);
4930       if (!vectype_in)
4931         vectype_in = tem;
4932       gcc_assert (is_simple_use);
4933
4934       if (dt != vect_internal_def
4935           && dt != vect_external_def
4936           && dt != vect_constant_def
4937           && dt != vect_induction_def
4938           && !(dt == vect_nested_cycle && nested_cycle))
4939         return false;
4940
4941       if (dt == vect_nested_cycle)
4942         {
4943           found_nested_cycle_def = true;
4944           reduc_def_stmt = def_stmt;
4945           reduc_index = i;
4946         }
4947     }
4948
4949   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4950                                         &def_stmt, &def, &dt, &tem);
4951   if (!vectype_in)
4952     vectype_in = tem;
4953   gcc_assert (is_simple_use);
4954   if (!(dt == vect_reduction_def
4955         || dt == vect_nested_cycle
4956         || ((dt == vect_internal_def || dt == vect_external_def
4957              || dt == vect_constant_def || dt == vect_induction_def)
4958             && nested_cycle && found_nested_cycle_def)))
4959     {
4960       /* For pattern recognized stmts, orig_stmt might be a reduction,
4961          but some helper statements for the pattern might not, or
4962          might be COND_EXPRs with reduction uses in the condition.  */
4963       gcc_assert (orig_stmt);
4964       return false;
4965     }
4966   if (!found_nested_cycle_def)
4967     reduc_def_stmt = def_stmt;
4968
4969   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4970   if (orig_stmt)
4971     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4972                                                        reduc_def_stmt,
4973                                                        !nested_cycle,
4974                                                        &dummy));
4975   else
4976     {
4977       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4978                                              !nested_cycle, &dummy);
4979       /* We changed STMT to be the first stmt in reduction chain, hence we
4980          check that in this case the first element in the chain is STMT.  */
4981       gcc_assert (stmt == tmp
4982                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4983     }
4984
4985   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4986     return false;
4987
4988   if (slp_node || PURE_SLP_STMT (stmt_info))
4989     ncopies = 1;
4990   else
4991     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4992                / TYPE_VECTOR_SUBPARTS (vectype_in));
4993
4994   gcc_assert (ncopies >= 1);
4995
4996   vec_mode = TYPE_MODE (vectype_in);
4997
4998   if (code == COND_EXPR)
4999     {
5000       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5001         {
5002           if (dump_enabled_p ())
5003             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5004                              "unsupported condition in reduction\n");
5005
5006             return false;
5007         }
5008     }
5009   else
5010     {
5011       /* 4. Supportable by target?  */
5012
5013       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5014           || code == LROTATE_EXPR || code == RROTATE_EXPR)
5015         {
5016           /* Shifts and rotates are only supported by vectorizable_shifts,
5017              not vectorizable_reduction.  */
5018           if (dump_enabled_p ())
5019             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5020                              "unsupported shift or rotation.\n");
5021           return false;
5022         }
5023
5024       /* 4.1. check support for the operation in the loop  */
5025       optab = optab_for_tree_code (code, vectype_in, optab_default);
5026       if (!optab)
5027         {
5028           if (dump_enabled_p ())
5029             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5030                              "no optab.\n");
5031
5032           return false;
5033         }
5034
5035       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5036         {
5037           if (dump_enabled_p ())
5038             dump_printf (MSG_NOTE, "op not supported by target.\n");
5039
5040           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5041               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5042                   < vect_min_worthwhile_factor (code))
5043             return false;
5044
5045           if (dump_enabled_p ())
5046             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5047         }
5048
5049       /* Worthwhile without SIMD support?  */
5050       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5051           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5052              < vect_min_worthwhile_factor (code))
5053         {
5054           if (dump_enabled_p ())
5055             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5056                              "not worthwhile without SIMD support.\n");
5057
5058           return false;
5059         }
5060     }
5061
5062   /* 4.2. Check support for the epilog operation.
5063
5064           If STMT represents a reduction pattern, then the type of the
5065           reduction variable may be different than the type of the rest
5066           of the arguments.  For example, consider the case of accumulation
5067           of shorts into an int accumulator; The original code:
5068                         S1: int_a = (int) short_a;
5069           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5070
5071           was replaced with:
5072                         STMT: int_acc = widen_sum <short_a, int_acc>
5073
5074           This means that:
5075           1. The tree-code that is used to create the vector operation in the
5076              epilog code (that reduces the partial results) is not the
5077              tree-code of STMT, but is rather the tree-code of the original
5078              stmt from the pattern that STMT is replacing.  I.e, in the example
5079              above we want to use 'widen_sum' in the loop, but 'plus' in the
5080              epilog.
5081           2. The type (mode) we use to check available target support
5082              for the vector operation to be created in the *epilog*, is
5083              determined by the type of the reduction variable (in the example
5084              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5085              However the type (mode) we use to check available target support
5086              for the vector operation to be created *inside the loop*, is
5087              determined by the type of the other arguments to STMT (in the
5088              example we'd check this: optab_handler (widen_sum_optab,
5089              vect_short_mode)).
5090
5091           This is contrary to "regular" reductions, in which the types of all
5092           the arguments are the same as the type of the reduction variable.
5093           For "regular" reductions we can therefore use the same vector type
5094           (and also the same tree-code) when generating the epilog code and
5095           when generating the code inside the loop.  */
5096
5097   if (orig_stmt)
5098     {
5099       /* This is a reduction pattern: get the vectype from the type of the
5100          reduction variable, and get the tree-code from orig_stmt.  */
5101       orig_code = gimple_assign_rhs_code (orig_stmt);
5102       gcc_assert (vectype_out);
5103       vec_mode = TYPE_MODE (vectype_out);
5104     }
5105   else
5106     {
5107       /* Regular reduction: use the same vectype and tree-code as used for
5108          the vector code inside the loop can be used for the epilog code. */
5109       orig_code = code;
5110     }
5111
5112   if (nested_cycle)
5113     {
5114       def_bb = gimple_bb (reduc_def_stmt);
5115       def_stmt_loop = def_bb->loop_father;
5116       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5117                                        loop_preheader_edge (def_stmt_loop));
5118       if (TREE_CODE (def_arg) == SSA_NAME
5119           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5120           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5121           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5122           && vinfo_for_stmt (def_arg_stmt)
5123           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5124               == vect_double_reduction_def)
5125         double_reduc = true;
5126     }
5127
5128   epilog_reduc_code = ERROR_MARK;
5129   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5130     {
5131       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5132                                          optab_default);
5133       if (!reduc_optab)
5134         {
5135           if (dump_enabled_p ())
5136             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5137                              "no optab for reduction.\n");
5138
5139           epilog_reduc_code = ERROR_MARK;
5140         }
5141       else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5142         {
5143           optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5144           if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5145             {
5146               if (dump_enabled_p ())
5147                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5148                                  "reduc op not supported by target.\n");
5149
5150               epilog_reduc_code = ERROR_MARK;
5151             }
5152         }
5153     }
5154   else
5155     {
5156       if (!nested_cycle || double_reduc)
5157         {
5158           if (dump_enabled_p ())
5159             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5160                              "no reduc code for scalar code.\n");
5161
5162           return false;
5163         }
5164     }
5165
5166   if (double_reduc && ncopies > 1)
5167     {
5168       if (dump_enabled_p ())
5169         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5170                          "multiple types in double reduction\n");
5171
5172       return false;
5173     }
5174
5175   /* In case of widenning multiplication by a constant, we update the type
5176      of the constant to be the type of the other operand.  We check that the
5177      constant fits the type in the pattern recognition pass.  */
5178   if (code == DOT_PROD_EXPR
5179       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5180     {
5181       if (TREE_CODE (ops[0]) == INTEGER_CST)
5182         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5183       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5184         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5185       else
5186         {
5187           if (dump_enabled_p ())
5188             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5189                              "invalid types in dot-prod\n");
5190
5191           return false;
5192         }
5193     }
5194
5195   if (!vec_stmt) /* transformation not required.  */
5196     {
5197       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5198         return false;
5199       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5200       return true;
5201     }
5202
5203   /** Transform.  **/
5204
5205   if (dump_enabled_p ())
5206     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5207
5208   /* FORNOW: Multiple types are not supported for condition.  */
5209   if (code == COND_EXPR)
5210     gcc_assert (ncopies == 1);
5211
5212   /* Create the destination vector  */
5213   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5214
5215   /* In case the vectorization factor (VF) is bigger than the number
5216      of elements that we can fit in a vectype (nunits), we have to generate
5217      more than one vector stmt - i.e - we need to "unroll" the
5218      vector stmt by a factor VF/nunits.  For more details see documentation
5219      in vectorizable_operation.  */
5220
5221   /* If the reduction is used in an outer loop we need to generate
5222      VF intermediate results, like so (e.g. for ncopies=2):
5223         r0 = phi (init, r0)
5224         r1 = phi (init, r1)
5225         r0 = x0 + r0;
5226         r1 = x1 + r1;
5227     (i.e. we generate VF results in 2 registers).
5228     In this case we have a separate def-use cycle for each copy, and therefore
5229     for each copy we get the vector def for the reduction variable from the
5230     respective phi node created for this copy.
5231
5232     Otherwise (the reduction is unused in the loop nest), we can combine
5233     together intermediate results, like so (e.g. for ncopies=2):
5234         r = phi (init, r)
5235         r = x0 + r;
5236         r = x1 + r;
5237    (i.e. we generate VF/2 results in a single register).
5238    In this case for each copy we get the vector def for the reduction variable
5239    from the vectorized reduction operation generated in the previous iteration.
5240   */
5241
5242   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5243     {
5244       single_defuse_cycle = true;
5245       epilog_copies = 1;
5246     }
5247   else
5248     epilog_copies = ncopies;
5249
5250   prev_stmt_info = NULL;
5251   prev_phi_info = NULL;
5252   if (slp_node)
5253     {
5254       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5255       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5256                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5257     }
5258   else
5259     {
5260       vec_num = 1;
5261       vec_oprnds0.create (1);
5262       if (op_type == ternary_op)
5263         vec_oprnds1.create (1);
5264     }
5265
5266   phis.create (vec_num);
5267   vect_defs.create (vec_num);
5268   if (!slp_node)
5269     vect_defs.quick_push (NULL_TREE);
5270
5271   for (j = 0; j < ncopies; j++)
5272     {
5273       if (j == 0 || !single_defuse_cycle)
5274         {
5275           for (i = 0; i < vec_num; i++)
5276             {
5277               /* Create the reduction-phi that defines the reduction
5278                  operand.  */
5279               new_phi = create_phi_node (vec_dest, loop->header);
5280               set_vinfo_for_stmt (new_phi,
5281                                   new_stmt_vec_info (new_phi, loop_vinfo,
5282                                                      NULL));
5283                if (j == 0 || slp_node)
5284                  phis.quick_push (new_phi);
5285             }
5286         }
5287
5288       if (code == COND_EXPR)
5289         {
5290           gcc_assert (!slp_node);
5291           vectorizable_condition (stmt, gsi, vec_stmt,
5292                                   PHI_RESULT (phis[0]),
5293                                   reduc_index, NULL);
5294           /* Multiple types are not supported for condition.  */
5295           break;
5296         }
5297
5298       /* Handle uses.  */
5299       if (j == 0)
5300         {
5301           op0 = ops[!reduc_index];
5302           if (op_type == ternary_op)
5303             {
5304               if (reduc_index == 0)
5305                 op1 = ops[2];
5306               else
5307                 op1 = ops[1];
5308             }
5309
5310           if (slp_node)
5311             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5312                                slp_node, -1);
5313           else
5314             {
5315               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5316                                                             stmt, NULL);
5317               vec_oprnds0.quick_push (loop_vec_def0);
5318               if (op_type == ternary_op)
5319                {
5320                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5321                                                                NULL);
5322                  vec_oprnds1.quick_push (loop_vec_def1);
5323                }
5324             }
5325         }
5326       else
5327         {
5328           if (!slp_node)
5329             {
5330               enum vect_def_type dt;
5331               gimple dummy_stmt;
5332               tree dummy;
5333
5334               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5335                                   &dummy_stmt, &dummy, &dt);
5336               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5337                                                               loop_vec_def0);
5338               vec_oprnds0[0] = loop_vec_def0;
5339               if (op_type == ternary_op)
5340                 {
5341                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5342                                       &dummy, &dt);
5343                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5344                                                                 loop_vec_def1);
5345                   vec_oprnds1[0] = loop_vec_def1;
5346                 }
5347             }
5348
5349           if (single_defuse_cycle)
5350             reduc_def = gimple_assign_lhs (new_stmt);
5351
5352           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5353         }
5354
5355       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5356         {
5357           if (slp_node)
5358             reduc_def = PHI_RESULT (phis[i]);
5359           else
5360             {
5361               if (!single_defuse_cycle || j == 0)
5362                 reduc_def = PHI_RESULT (new_phi);
5363             }
5364
5365           def1 = ((op_type == ternary_op)
5366                   ? vec_oprnds1[i] : NULL);
5367           if (op_type == binary_op)
5368             {
5369               if (reduc_index == 0)
5370                 expr = build2 (code, vectype_out, reduc_def, def0);
5371               else
5372                 expr = build2 (code, vectype_out, def0, reduc_def);
5373             }
5374           else
5375             {
5376               if (reduc_index == 0)
5377                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5378               else
5379                 {
5380                   if (reduc_index == 1)
5381                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5382                   else
5383                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5384                 }
5385             }
5386
5387           new_stmt = gimple_build_assign (vec_dest, expr);
5388           new_temp = make_ssa_name (vec_dest, new_stmt);
5389           gimple_assign_set_lhs (new_stmt, new_temp);
5390           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5391
5392           if (slp_node)
5393             {
5394               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5395               vect_defs.quick_push (new_temp);
5396             }
5397           else
5398             vect_defs[0] = new_temp;
5399         }
5400
5401       if (slp_node)
5402         continue;
5403
5404       if (j == 0)
5405         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5406       else
5407         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5408
5409       prev_stmt_info = vinfo_for_stmt (new_stmt);
5410       prev_phi_info = vinfo_for_stmt (new_phi);
5411     }
5412
5413   /* Finalize the reduction-phi (set its arguments) and create the
5414      epilog reduction code.  */
5415   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5416     {
5417       new_temp = gimple_assign_lhs (*vec_stmt);
5418       vect_defs[0] = new_temp;
5419     }
5420
5421   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5422                                     epilog_reduc_code, phis, reduc_index,
5423                                     double_reduc, slp_node);
5424
5425   return true;
5426 }
5427
5428 /* Function vect_min_worthwhile_factor.
5429
5430    For a loop where we could vectorize the operation indicated by CODE,
5431    return the minimum vectorization factor that makes it worthwhile
5432    to use generic vectors.  */
5433 int
5434 vect_min_worthwhile_factor (enum tree_code code)
5435 {
5436   switch (code)
5437     {
5438     case PLUS_EXPR:
5439     case MINUS_EXPR:
5440     case NEGATE_EXPR:
5441       return 4;
5442
5443     case BIT_AND_EXPR:
5444     case BIT_IOR_EXPR:
5445     case BIT_XOR_EXPR:
5446     case BIT_NOT_EXPR:
5447       return 2;
5448
5449     default:
5450       return INT_MAX;
5451     }
5452 }
5453
5454
5455 /* Function vectorizable_induction
5456
5457    Check if PHI performs an induction computation that can be vectorized.
5458    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5459    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5460    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5461
5462 bool
5463 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5464                         gimple *vec_stmt)
5465 {
5466   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5467   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5468   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5469   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5470   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5471   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5472   tree vec_def;
5473
5474   gcc_assert (ncopies >= 1);
5475   /* FORNOW. These restrictions should be relaxed.  */
5476   if (nested_in_vect_loop_p (loop, phi))
5477     {
5478       imm_use_iterator imm_iter;
5479       use_operand_p use_p;
5480       gimple exit_phi;
5481       edge latch_e;
5482       tree loop_arg;
5483
5484       if (ncopies > 1)
5485         {
5486           if (dump_enabled_p ())
5487             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5488                              "multiple types in nested loop.\n");
5489           return false;
5490         }
5491
5492       exit_phi = NULL;
5493       latch_e = loop_latch_edge (loop->inner);
5494       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5495       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5496         {
5497           gimple use_stmt = USE_STMT (use_p);
5498           if (is_gimple_debug (use_stmt))
5499             continue;
5500
5501           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5502             {
5503               exit_phi = use_stmt;
5504               break;
5505             }
5506         }
5507       if (exit_phi)
5508         {
5509           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5510           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5511                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5512             {
5513               if (dump_enabled_p ())
5514                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5515                                  "inner-loop induction only used outside "
5516                                  "of the outer vectorized loop.\n");
5517               return false;
5518             }
5519         }
5520     }
5521
5522   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5523     return false;
5524
5525   /* FORNOW: SLP not supported.  */
5526   if (STMT_SLP_TYPE (stmt_info))
5527     return false;
5528
5529   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5530
5531   if (gimple_code (phi) != GIMPLE_PHI)
5532     return false;
5533
5534   if (!vec_stmt) /* transformation not required.  */
5535     {
5536       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5537       if (dump_enabled_p ())
5538         dump_printf_loc (MSG_NOTE, vect_location,
5539                          "=== vectorizable_induction ===\n");
5540       vect_model_induction_cost (stmt_info, ncopies);
5541       return true;
5542     }
5543
5544   /** Transform.  **/
5545
5546   if (dump_enabled_p ())
5547     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5548
5549   vec_def = get_initial_def_for_induction (phi);
5550   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5551   return true;
5552 }
5553
5554 /* Function vectorizable_live_operation.
5555
5556    STMT computes a value that is used outside the loop.  Check if
5557    it can be supported.  */
5558
5559 bool
5560 vectorizable_live_operation (gimple stmt,
5561                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5562                              gimple *vec_stmt)
5563 {
5564   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5565   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5566   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5567   int i;
5568   int op_type;
5569   tree op;
5570   tree def;
5571   gimple def_stmt;
5572   enum vect_def_type dt;
5573   enum tree_code code;
5574   enum gimple_rhs_class rhs_class;
5575
5576   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5577
5578   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5579     return false;
5580
5581   if (!is_gimple_assign (stmt))
5582     {
5583       if (gimple_call_internal_p (stmt)
5584           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5585           && gimple_call_lhs (stmt)
5586           && loop->simduid
5587           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5588           && loop->simduid
5589              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5590         {
5591           edge e = single_exit (loop);
5592           basic_block merge_bb = e->dest;
5593           imm_use_iterator imm_iter;
5594           use_operand_p use_p;
5595           tree lhs = gimple_call_lhs (stmt);
5596
5597           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5598             {
5599               gimple use_stmt = USE_STMT (use_p);
5600               if (gimple_code (use_stmt) == GIMPLE_PHI
5601                   && gimple_bb (use_stmt) == merge_bb)
5602                 {
5603                   if (vec_stmt)
5604                     {
5605                       tree vfm1
5606                         = build_int_cst (unsigned_type_node,
5607                                          loop_vinfo->vectorization_factor - 1);
5608                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5609                     }
5610                   return true;
5611                 }
5612             }
5613         }
5614
5615       return false;
5616     }
5617
5618   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5619     return false;
5620
5621   /* FORNOW. CHECKME. */
5622   if (nested_in_vect_loop_p (loop, stmt))
5623     return false;
5624
5625   code = gimple_assign_rhs_code (stmt);
5626   op_type = TREE_CODE_LENGTH (code);
5627   rhs_class = get_gimple_rhs_class (code);
5628   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5629   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5630
5631   /* FORNOW: support only if all uses are invariant.  This means
5632      that the scalar operations can remain in place, unvectorized.
5633      The original last scalar value that they compute will be used.  */
5634
5635   for (i = 0; i < op_type; i++)
5636     {
5637       if (rhs_class == GIMPLE_SINGLE_RHS)
5638         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5639       else
5640         op = gimple_op (stmt, i + 1);
5641       if (op
5642           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5643                                   &dt))
5644         {
5645           if (dump_enabled_p ())
5646             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5647                              "use not simple.\n");
5648           return false;
5649         }
5650
5651       if (dt != vect_external_def && dt != vect_constant_def)
5652         return false;
5653     }
5654
5655   /* No transformation is required for the cases we currently support.  */
5656   return true;
5657 }
5658
5659 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5660
5661 static void
5662 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5663 {
5664   ssa_op_iter op_iter;
5665   imm_use_iterator imm_iter;
5666   def_operand_p def_p;
5667   gimple ustmt;
5668
5669   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5670     {
5671       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5672         {
5673           basic_block bb;
5674
5675           if (!is_gimple_debug (ustmt))
5676             continue;
5677
5678           bb = gimple_bb (ustmt);
5679
5680           if (!flow_bb_inside_loop_p (loop, bb))
5681             {
5682               if (gimple_debug_bind_p (ustmt))
5683                 {
5684                   if (dump_enabled_p ())
5685                     dump_printf_loc (MSG_NOTE, vect_location,
5686                                      "killing debug use\n");
5687
5688                   gimple_debug_bind_reset_value (ustmt);
5689                   update_stmt (ustmt);
5690                 }
5691               else
5692                 gcc_unreachable ();
5693             }
5694         }
5695     }
5696 }
5697
5698
5699 /* This function builds ni_name = number of iterations.  Statements
5700    are emitted on the loop preheader edge.  */
5701
5702 static tree
5703 vect_build_loop_niters (loop_vec_info loop_vinfo)
5704 {
5705   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5706   if (TREE_CODE (ni) == INTEGER_CST)
5707     return ni;
5708   else
5709     {
5710       tree ni_name, var;
5711       gimple_seq stmts = NULL;
5712       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5713
5714       var = create_tmp_var (TREE_TYPE (ni), "niters");
5715       ni_name = force_gimple_operand (ni, &stmts, false, var);
5716       if (stmts)
5717         gsi_insert_seq_on_edge_immediate (pe, stmts);
5718
5719       return ni_name;
5720     }
5721 }
5722
5723
5724 /* This function generates the following statements:
5725
5726    ni_name = number of iterations loop executes
5727    ratio = ni_name / vf
5728    ratio_mult_vf_name = ratio * vf
5729
5730    and places them on the loop preheader edge.  */
5731
5732 static void
5733 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5734                                  tree ni_name,
5735                                  tree *ratio_mult_vf_name_ptr,
5736                                  tree *ratio_name_ptr)
5737 {
5738   tree ni_minus_gap_name;
5739   tree var;
5740   tree ratio_name;
5741   tree ratio_mult_vf_name;
5742   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5743   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5744   tree log_vf;
5745
5746   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5747
5748   /* If epilogue loop is required because of data accesses with gaps, we
5749      subtract one iteration from the total number of iterations here for
5750      correct calculation of RATIO.  */
5751   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5752     {
5753       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5754                                        ni_name,
5755                                        build_one_cst (TREE_TYPE (ni_name)));
5756       if (!is_gimple_val (ni_minus_gap_name))
5757         {
5758           var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5759           gimple stmts = NULL;
5760           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5761                                                     true, var);
5762           gsi_insert_seq_on_edge_immediate (pe, stmts);
5763         }
5764     }
5765   else
5766     ni_minus_gap_name = ni_name;
5767
5768   /* Create: ratio = ni >> log2(vf) */
5769   /* ???  As we have ni == number of latch executions + 1, ni could
5770      have overflown to zero.  So avoid computing ratio based on ni
5771      but compute it using the fact that we know ratio will be at least
5772      one, thus via (ni - vf) >> log2(vf) + 1.  */
5773   ratio_name
5774     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5775                    fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5776                                 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5777                                              ni_minus_gap_name,
5778                                              build_int_cst
5779                                                (TREE_TYPE (ni_name), vf)),
5780                                 log_vf),
5781                    build_int_cst (TREE_TYPE (ni_name), 1));
5782   if (!is_gimple_val (ratio_name))
5783     {
5784       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5785       gimple stmts = NULL;
5786       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5787       gsi_insert_seq_on_edge_immediate (pe, stmts);
5788     }
5789   *ratio_name_ptr = ratio_name;
5790
5791   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
5792
5793   if (ratio_mult_vf_name_ptr)
5794     {
5795       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5796                                         ratio_name, log_vf);
5797       if (!is_gimple_val (ratio_mult_vf_name))
5798         {
5799           var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5800           gimple stmts = NULL;
5801           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5802                                                      true, var);
5803           gsi_insert_seq_on_edge_immediate (pe, stmts);
5804         }
5805       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5806     }
5807
5808   return;
5809 }
5810
5811
5812 /* Function vect_transform_loop.
5813
5814    The analysis phase has determined that the loop is vectorizable.
5815    Vectorize the loop - created vectorized stmts to replace the scalar
5816    stmts in the loop, and update the loop exit condition.  */
5817
5818 void
5819 vect_transform_loop (loop_vec_info loop_vinfo)
5820 {
5821   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5822   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5823   int nbbs = loop->num_nodes;
5824   int i;
5825   tree ratio = NULL;
5826   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5827   bool grouped_store;
5828   bool slp_scheduled = false;
5829   gimple stmt, pattern_stmt;
5830   gimple_seq pattern_def_seq = NULL;
5831   gimple_stmt_iterator pattern_def_si = gsi_none ();
5832   bool transform_pattern_stmt = false;
5833   bool check_profitability = false;
5834   int th;
5835   /* Record number of iterations before we started tampering with the profile. */
5836   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5837
5838   if (dump_enabled_p ())
5839     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5840
5841   /* If profile is inprecise, we have chance to fix it up.  */
5842   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5843     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5844
5845   /* Use the more conservative vectorization threshold.  If the number
5846      of iterations is constant assume the cost check has been performed
5847      by our caller.  If the threshold makes all loops profitable that
5848      run at least the vectorization factor number of times checking
5849      is pointless, too.  */
5850   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5851   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5852       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5853     {
5854       if (dump_enabled_p ())
5855         dump_printf_loc (MSG_NOTE, vect_location,
5856                          "Profitability threshold is %d loop iterations.\n",
5857                          th);
5858       check_profitability = true;
5859     }
5860
5861   /* Version the loop first, if required, so the profitability check
5862      comes first.  */
5863
5864   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5865       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5866     {
5867       vect_loop_versioning (loop_vinfo, th, check_profitability);
5868       check_profitability = false;
5869     }
5870
5871   tree ni_name = vect_build_loop_niters (loop_vinfo);
5872   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5873
5874   /* Peel the loop if there are data refs with unknown alignment.
5875      Only one data ref with unknown store is allowed.  */
5876
5877   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5878     {
5879       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5880                                      th, check_profitability);
5881       check_profitability = false;
5882       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5883          be re-computed.  */
5884       ni_name = NULL_TREE;
5885     }
5886
5887   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5888      compile time constant), or it is a constant that doesn't divide by the
5889      vectorization factor, then an epilog loop needs to be created.
5890      We therefore duplicate the loop: the original loop will be vectorized,
5891      and will compute the first (n/VF) iterations.  The second copy of the loop
5892      will remain scalar and will compute the remaining (n%VF) iterations.
5893      (VF is the vectorization factor).  */
5894
5895   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5896       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5897     {
5898       tree ratio_mult_vf;
5899       if (!ni_name)
5900         ni_name = vect_build_loop_niters (loop_vinfo);
5901       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5902                                        &ratio);
5903       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5904                                       th, check_profitability);
5905     }
5906   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5907     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5908                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5909   else
5910     {
5911       if (!ni_name)
5912         ni_name = vect_build_loop_niters (loop_vinfo);
5913       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5914     }
5915
5916   /* 1) Make sure the loop header has exactly two entries
5917      2) Make sure we have a preheader basic block.  */
5918
5919   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5920
5921   split_edge (loop_preheader_edge (loop));
5922
5923   /* FORNOW: the vectorizer supports only loops which body consist
5924      of one basic block (header + empty latch). When the vectorizer will
5925      support more involved loop forms, the order by which the BBs are
5926      traversed need to be reconsidered.  */
5927
5928   for (i = 0; i < nbbs; i++)
5929     {
5930       basic_block bb = bbs[i];
5931       stmt_vec_info stmt_info;
5932
5933       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
5934            gsi_next (&si))
5935         {
5936           gphi *phi = si.phi ();
5937           if (dump_enabled_p ())
5938             {
5939               dump_printf_loc (MSG_NOTE, vect_location,
5940                                "------>vectorizing phi: ");
5941               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5942               dump_printf (MSG_NOTE, "\n");
5943             }
5944           stmt_info = vinfo_for_stmt (phi);
5945           if (!stmt_info)
5946             continue;
5947
5948           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5949             vect_loop_kill_debug_uses (loop, phi);
5950
5951           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5952               && !STMT_VINFO_LIVE_P (stmt_info))
5953             continue;
5954
5955           if (STMT_VINFO_VECTYPE (stmt_info)
5956               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5957                   != (unsigned HOST_WIDE_INT) vectorization_factor)
5958               && dump_enabled_p ())
5959             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5960
5961           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5962             {
5963               if (dump_enabled_p ())
5964                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5965               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5966             }
5967         }
5968
5969       pattern_stmt = NULL;
5970       for (gimple_stmt_iterator si = gsi_start_bb (bb);
5971            !gsi_end_p (si) || transform_pattern_stmt;)
5972         {
5973           bool is_store;
5974
5975           if (transform_pattern_stmt)
5976             stmt = pattern_stmt;
5977           else
5978             {
5979               stmt = gsi_stmt (si);
5980               /* During vectorization remove existing clobber stmts.  */
5981               if (gimple_clobber_p (stmt))
5982                 {
5983                   unlink_stmt_vdef (stmt);
5984                   gsi_remove (&si, true);
5985                   release_defs (stmt);
5986                   continue;
5987                 }
5988             }
5989
5990           if (dump_enabled_p ())
5991             {
5992               dump_printf_loc (MSG_NOTE, vect_location,
5993                                "------>vectorizing statement: ");
5994               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5995               dump_printf (MSG_NOTE, "\n");
5996             }
5997
5998           stmt_info = vinfo_for_stmt (stmt);
5999
6000           /* vector stmts created in the outer-loop during vectorization of
6001              stmts in an inner-loop may not have a stmt_info, and do not
6002              need to be vectorized.  */
6003           if (!stmt_info)
6004             {
6005               gsi_next (&si);
6006               continue;
6007             }
6008
6009           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6010             vect_loop_kill_debug_uses (loop, stmt);
6011
6012           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6013               && !STMT_VINFO_LIVE_P (stmt_info))
6014             {
6015               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6016                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6017                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6018                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6019                 {
6020                   stmt = pattern_stmt;
6021                   stmt_info = vinfo_for_stmt (stmt);
6022                 }
6023               else
6024                 {
6025                   gsi_next (&si);
6026                   continue;
6027                 }
6028             }
6029           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6030                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6031                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6032                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6033             transform_pattern_stmt = true;
6034
6035           /* If pattern statement has def stmts, vectorize them too.  */
6036           if (is_pattern_stmt_p (stmt_info))
6037             {
6038               if (pattern_def_seq == NULL)
6039                 {
6040                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6041                   pattern_def_si = gsi_start (pattern_def_seq);
6042                 }
6043               else if (!gsi_end_p (pattern_def_si))
6044                 gsi_next (&pattern_def_si);
6045               if (pattern_def_seq != NULL)
6046                 {
6047                   gimple pattern_def_stmt = NULL;
6048                   stmt_vec_info pattern_def_stmt_info = NULL;
6049
6050                   while (!gsi_end_p (pattern_def_si))
6051                     {
6052                       pattern_def_stmt = gsi_stmt (pattern_def_si);
6053                       pattern_def_stmt_info
6054                         = vinfo_for_stmt (pattern_def_stmt);
6055                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6056                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6057                         break;
6058                       gsi_next (&pattern_def_si);
6059                     }
6060
6061                   if (!gsi_end_p (pattern_def_si))
6062                     {
6063                       if (dump_enabled_p ())
6064                         {
6065                           dump_printf_loc (MSG_NOTE, vect_location,
6066                                            "==> vectorizing pattern def "
6067                                            "stmt: ");
6068                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6069                                             pattern_def_stmt, 0);
6070                           dump_printf (MSG_NOTE, "\n");
6071                         }
6072
6073                       stmt = pattern_def_stmt;
6074                       stmt_info = pattern_def_stmt_info;
6075                     }
6076                   else
6077                     {
6078                       pattern_def_si = gsi_none ();
6079                       transform_pattern_stmt = false;
6080                     }
6081                 }
6082               else
6083                 transform_pattern_stmt = false;
6084             }
6085
6086           if (STMT_VINFO_VECTYPE (stmt_info))
6087             {
6088               unsigned int nunits
6089                 = (unsigned int)
6090                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6091               if (!STMT_SLP_TYPE (stmt_info)
6092                   && nunits != (unsigned int) vectorization_factor
6093                   && dump_enabled_p ())
6094                   /* For SLP VF is set according to unrolling factor, and not
6095                      to vector size, hence for SLP this print is not valid.  */
6096                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6097             }
6098
6099           /* SLP. Schedule all the SLP instances when the first SLP stmt is
6100              reached.  */
6101           if (STMT_SLP_TYPE (stmt_info))
6102             {
6103               if (!slp_scheduled)
6104                 {
6105                   slp_scheduled = true;
6106
6107                   if (dump_enabled_p ())
6108                     dump_printf_loc (MSG_NOTE, vect_location,
6109                                      "=== scheduling SLP instances ===\n");
6110
6111                   vect_schedule_slp (loop_vinfo, NULL);
6112                 }
6113
6114               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6115               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6116                 {
6117                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6118                     {
6119                       pattern_def_seq = NULL;
6120                       gsi_next (&si);
6121                     }
6122                   continue;
6123                 }
6124             }
6125
6126           /* -------- vectorize statement ------------ */
6127           if (dump_enabled_p ())
6128             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6129
6130           grouped_store = false;
6131           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6132           if (is_store)
6133             {
6134               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6135                 {
6136                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6137                      interleaving chain was completed - free all the stores in
6138                      the chain.  */
6139                   gsi_next (&si);
6140                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6141                 }
6142               else
6143                 {
6144                   /* Free the attached stmt_vec_info and remove the stmt.  */
6145                   gimple store = gsi_stmt (si);
6146                   free_stmt_vec_info (store);
6147                   unlink_stmt_vdef (store);
6148                   gsi_remove (&si, true);
6149                   release_defs (store);
6150                 }
6151
6152               /* Stores can only appear at the end of pattern statements.  */
6153               gcc_assert (!transform_pattern_stmt);
6154               pattern_def_seq = NULL;
6155             }
6156           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6157             {
6158               pattern_def_seq = NULL;
6159               gsi_next (&si);
6160             }
6161         }                       /* stmts in BB */
6162     }                           /* BBs in loop */
6163
6164   slpeel_make_loop_iterate_ntimes (loop, ratio);
6165
6166   /* Reduce loop iterations by the vectorization factor.  */
6167   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6168                       expected_iterations / vectorization_factor);
6169   loop->nb_iterations_upper_bound
6170     = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6171   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6172       && loop->nb_iterations_upper_bound != 0)
6173     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6174   if (loop->any_estimate)
6175     {
6176       loop->nb_iterations_estimate
6177         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6178        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6179            && loop->nb_iterations_estimate != 0)
6180          loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6181     }
6182
6183   if (dump_enabled_p ())
6184     {
6185       dump_printf_loc (MSG_NOTE, vect_location,
6186                        "LOOP VECTORIZED\n");
6187       if (loop->inner)
6188         dump_printf_loc (MSG_NOTE, vect_location,
6189                          "OUTER LOOP VECTORIZED\n");
6190       dump_printf (MSG_NOTE, "\n");
6191     }
6192 }