gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2014 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "tm.h"
  27 #include "tree.h"
  28 #include "stor-layout.h"
  29 #include "predict.h"
  30 #include "vec.h"
  31 #include "hashtab.h"
  32 #include "hash-set.h"
  33 #include "machmode.h"
  34 #include "hard-reg-set.h"
  35 #include "input.h"
  36 #include "function.h"
  37 #include "dominance.h"
  38 #include "cfg.h"
  39 #include "cfganal.h"
  40 #include "basic-block.h"
  41 #include "gimple-pretty-print.h"
  42 #include "tree-ssa-alias.h"
  43 #include "internal-fn.h"
  44 #include "gimple-expr.h"
  45 #include "is-a.h"
  46 #include "gimple.h"
  47 #include "gimplify.h"
  48 #include "gimple-iterator.h"
  49 #include "gimplify-me.h"
  50 #include "gimple-ssa.h"
  51 #include "tree-phinodes.h"
  52 #include "ssa-iterators.h"
  53 #include "stringpool.h"
  54 #include "tree-ssanames.h"
  55 #include "tree-ssa-loop-ivopts.h"
  56 #include "tree-ssa-loop-manip.h"
  57 #include "tree-ssa-loop-niter.h"
  58 #include "tree-pass.h"
  59 #include "cfgloop.h"
  60 #include "expr.h"
  61 #include "recog.h"
  62 #include "insn-codes.h"
  63 #include "optabs.h"
  64 #include "params.h"
  65 #include "diagnostic-core.h"
  66 #include "tree-chrec.h"
  67 #include "tree-scalar-evolution.h"
  68 #include "tree-vectorizer.h"
  69 #include "target.h"
  70
  71 /* Loop Vectorization Pass.
  72
  73    This pass tries to vectorize loops.
  74
  75    For example, the vectorizer transforms the following simple loop:
  76
  77         short a[N]; short b[N]; short c[N]; int i;
  78
  79         for (i=0; i<N; i++){
  80           a[i] = b[i] + c[i];
  81         }
  82
  83    as if it was manually vectorized by rewriting the source code into:
  84
  85         typedef int __attribute__((mode(V8HI))) v8hi;
  86         short a[N];  short b[N]; short c[N];   int i;
  87         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  88         v8hi va, vb, vc;
  89
  90         for (i=0; i<N/8; i++){
  91           vb = pb[i];
  92           vc = pc[i];
  93           va = vb + vc;
  94           pa[i] = va;
  95         }
  96
  97         The main entry to this pass is vectorize_loops(), in which
  98    the vectorizer applies a set of analyses on a given set of loops,
  99    followed by the actual vectorization transformation for the loops that
 100    had successfully passed the analysis phase.
 101         Throughout this pass we make a distinction between two types of
 102    data: scalars (which are represented by SSA_NAMES), and memory references
 103    ("data-refs").  These two types of data require different handling both
 104    during analysis and transformation. The types of data-refs that the
 105    vectorizer currently supports are ARRAY_REFS which base is an array DECL
 106    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
 107    accesses are required to have a simple (consecutive) access pattern.
 108
 109    Analysis phase:
 110    ===============
 111         The driver for the analysis phase is vect_analyze_loop().
 112    It applies a set of analyses, some of which rely on the scalar evolution
 113    analyzer (scev) developed by Sebastian Pop.
 114
 115         During the analysis phase the vectorizer records some information
 116    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 117    loop, as well as general information about the loop as a whole, which is
 118    recorded in a "loop_vec_info" struct attached to each loop.
 119
 120    Transformation phase:
 121    =====================
 122         The loop transformation phase scans all the stmts in the loop, and
 123    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 124    the loop that needs to be vectorized.  It inserts the vector code sequence
 125    just before the scalar stmt S, and records a pointer to the vector code
 126    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 127    attached to S).  This pointer will be used for the vectorization of following
 128    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 129    otherwise, we rely on dead code elimination for removing it.
 130
 131         For example, say stmt S1 was vectorized into stmt VS1:
 132
 133    VS1: vb = px[i];
 134    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 135    S2:  a = b;
 136
 137    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 138    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 139    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 140    resulting sequence would be:
 141
 142    VS1: vb = px[i];
 143    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 144    VS2: va = vb;
 145    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 146
 147         Operands that are not SSA_NAMEs, are data-refs that appear in
 148    load/store operations (like 'x[i]' in S1), and are handled differently.
 149
 150    Target modeling:
 151    =================
 152         Currently the only target specific information that is used is the
 153    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 154    Targets that can support different sizes of vectors, for now will need
 155    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 156    flexibility will be added in the future.
 157
 158         Since we only vectorize operations which vector form can be
 159    expressed using existing tree codes, to verify that an operation is
 160    supported, the vectorizer checks the relevant optab at the relevant
 161    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 162    the value found is CODE_FOR_nothing, then there's no target support, and
 163    we can't vectorize the stmt.
 164
 165    For additional information on this project see:
 166    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 167 */
 168
 169 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 170
 171 /* Function vect_determine_vectorization_factor
 172
 173    Determine the vectorization factor (VF).  VF is the number of data elements
 174    that are operated upon in parallel in a single iteration of the vectorized
 175    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 176    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 177    elements can fit in a single vector register.
 178
 179    We currently support vectorization of loops in which all types operated upon
 180    are of the same size.  Therefore this function currently sets VF according to
 181    the size of the types operated upon, and fails if there are multiple sizes
 182    in the loop.
 183
 184    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 185    original loop:
 186         for (i=0; i<N; i++){
 187           a[i] = b[i] + c[i];
 188         }
 189
 190    vectorized loop:
 191         for (i=0; i<N; i+=VF){
 192           a[i:VF] = b[i:VF] + c[i:VF];
 193         }
 194 */
 195
 196 static bool
 197 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 198 {
 199   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 200   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 201   int nbbs = loop->num_nodes;
 202   unsigned int vectorization_factor = 0;
 203   tree scalar_type;
 204   gphi *phi;
 205   tree vectype;
 206   unsigned int nunits;
 207   stmt_vec_info stmt_info;
 208   int i;
 209   HOST_WIDE_INT dummy;
 210   gimple stmt, pattern_stmt = NULL;
 211   gimple_seq pattern_def_seq = NULL;
 212   gimple_stmt_iterator pattern_def_si = gsi_none ();
 213   bool analyze_pattern_stmt = false;
 214
 215   if (dump_enabled_p ())
 216     dump_printf_loc (MSG_NOTE, vect_location,
 217                      "=== vect_determine_vectorization_factor ===\n");
 218
 219   for (i = 0; i < nbbs; i++)
 220     {
 221       basic_block bb = bbs[i];
 222
 223       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 224            gsi_next (&si))
 225         {
 226           phi = si.phi ();
 227           stmt_info = vinfo_for_stmt (phi);
 228           if (dump_enabled_p ())
 229             {
 230               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 231               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 232               dump_printf (MSG_NOTE, "\n");
 233             }
 234
 235           gcc_assert (stmt_info);
 236
 237           if (STMT_VINFO_RELEVANT_P (stmt_info))
 238             {
 239               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 240               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 241
 242               if (dump_enabled_p ())
 243                 {
 244                   dump_printf_loc (MSG_NOTE, vect_location,
 245                                    "get vectype for scalar type:  ");
 246                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 247                   dump_printf (MSG_NOTE, "\n");
 248                 }
 249
 250               vectype = get_vectype_for_scalar_type (scalar_type);
 251               if (!vectype)
 252                 {
 253                   if (dump_enabled_p ())
 254                     {
 255                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 256                                        "not vectorized: unsupported "
 257                                        "data-type ");
 258                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 259                                          scalar_type);
 260                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 261                     }
 262                   return false;
 263                 }
 264               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 265
 266               if (dump_enabled_p ())
 267                 {
 268                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 269                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 270                   dump_printf (MSG_NOTE, "\n");
 271                 }
 272
 273               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 274               if (dump_enabled_p ())
 275                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 276                                  nunits);
 277
 278               if (!vectorization_factor
 279                   || (nunits > vectorization_factor))
 280                 vectorization_factor = nunits;
 281             }
 282         }
 283
 284       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 285            !gsi_end_p (si) || analyze_pattern_stmt;)
 286         {
 287           tree vf_vectype;
 288
 289           if (analyze_pattern_stmt)
 290             stmt = pattern_stmt;
 291           else
 292             stmt = gsi_stmt (si);
 293
 294           stmt_info = vinfo_for_stmt (stmt);
 295
 296           if (dump_enabled_p ())
 297             {
 298               dump_printf_loc (MSG_NOTE, vect_location,
 299                                "==> examining statement: ");
 300               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 301               dump_printf (MSG_NOTE, "\n");
 302             }
 303
 304           gcc_assert (stmt_info);
 305
 306           /* Skip stmts which do not need to be vectorized.  */
 307           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 308                && !STMT_VINFO_LIVE_P (stmt_info))
 309               || gimple_clobber_p (stmt))
 310             {
 311               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 312                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 313                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 314                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 315                 {
 316                   stmt = pattern_stmt;
 317                   stmt_info = vinfo_for_stmt (pattern_stmt);
 318                   if (dump_enabled_p ())
 319                     {
 320                       dump_printf_loc (MSG_NOTE, vect_location,
 321                                        "==> examining pattern statement: ");
 322                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 323                       dump_printf (MSG_NOTE, "\n");
 324                     }
 325                 }
 326               else
 327                 {
 328                   if (dump_enabled_p ())
 329                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 330                   gsi_next (&si);
 331                   continue;
 332                 }
 333             }
 334           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 335                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 336                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 337                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 338             analyze_pattern_stmt = true;
 339
 340           /* If a pattern statement has def stmts, analyze them too.  */
 341           if (is_pattern_stmt_p (stmt_info))
 342             {
 343               if (pattern_def_seq == NULL)
 344                 {
 345                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 346                   pattern_def_si = gsi_start (pattern_def_seq);
 347                 }
 348               else if (!gsi_end_p (pattern_def_si))
 349                 gsi_next (&pattern_def_si);
 350               if (pattern_def_seq != NULL)
 351                 {
 352                   gimple pattern_def_stmt = NULL;
 353                   stmt_vec_info pattern_def_stmt_info = NULL;
 354
 355                   while (!gsi_end_p (pattern_def_si))
 356                     {
 357                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 358                       pattern_def_stmt_info
 359                         = vinfo_for_stmt (pattern_def_stmt);
 360                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 361                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 362                         break;
 363                       gsi_next (&pattern_def_si);
 364                     }
 365
 366                   if (!gsi_end_p (pattern_def_si))
 367                     {
 368                       if (dump_enabled_p ())
 369                         {
 370                           dump_printf_loc (MSG_NOTE, vect_location,
 371                                            "==> examining pattern def stmt: ");
 372                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 373                                             pattern_def_stmt, 0);
 374                           dump_printf (MSG_NOTE, "\n");
 375                         }
 376
 377                       stmt = pattern_def_stmt;
 378                       stmt_info = pattern_def_stmt_info;
 379                     }
 380                   else
 381                     {
 382                       pattern_def_si = gsi_none ();
 383                       analyze_pattern_stmt = false;
 384                     }
 385                 }
 386               else
 387                 analyze_pattern_stmt = false;
 388             }
 389
 390           if (gimple_get_lhs (stmt) == NULL_TREE
 391               /* MASK_STORE has no lhs, but is ok.  */
 392               && (!is_gimple_call (stmt)
 393                   || !gimple_call_internal_p (stmt)
 394                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 395             {
 396               if (is_gimple_call (stmt))
 397                 {
 398                   /* Ignore calls with no lhs.  These must be calls to
 399                      #pragma omp simd functions, and what vectorization factor
 400                      it really needs can't be determined until
 401                      vectorizable_simd_clone_call.  */
 402                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 403                     {
 404                       pattern_def_seq = NULL;
 405                       gsi_next (&si);
 406                     }
 407                   continue;
 408                 }
 409               if (dump_enabled_p ())
 410                 {
 411                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 412                                    "not vectorized: irregular stmt.");
 413                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 414                                     0);
 415                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 416                 }
 417               return false;
 418             }
 419
 420           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 421             {
 422               if (dump_enabled_p ())
 423                 {
 424                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 425                                    "not vectorized: vector stmt in loop:");
 426                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 427                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 428                 }
 429               return false;
 430             }
 431
 432           if (STMT_VINFO_VECTYPE (stmt_info))
 433             {
 434               /* The only case when a vectype had been already set is for stmts
 435                  that contain a dataref, or for "pattern-stmts" (stmts
 436                  generated by the vectorizer to represent/replace a certain
 437                  idiom).  */
 438               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 439                           || is_pattern_stmt_p (stmt_info)
 440                           || !gsi_end_p (pattern_def_si));
 441               vectype = STMT_VINFO_VECTYPE (stmt_info);
 442             }
 443           else
 444             {
 445               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 446               if (is_gimple_call (stmt)
 447                   && gimple_call_internal_p (stmt)
 448                   && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
 449                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 450               else
 451                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 452               if (dump_enabled_p ())
 453                 {
 454                   dump_printf_loc (MSG_NOTE, vect_location,
 455                                    "get vectype for scalar type:  ");
 456                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 457                   dump_printf (MSG_NOTE, "\n");
 458                 }
 459               vectype = get_vectype_for_scalar_type (scalar_type);
 460               if (!vectype)
 461                 {
 462                   if (dump_enabled_p ())
 463                     {
 464                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 465                                        "not vectorized: unsupported "
 466                                        "data-type ");
 467                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 468                                          scalar_type);
 469                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 470                     }
 471                   return false;
 472                 }
 473
 474               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 475
 476               if (dump_enabled_p ())
 477                 {
 478                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 479                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 480                   dump_printf (MSG_NOTE, "\n");
 481                 }
 482             }
 483
 484           /* The vectorization factor is according to the smallest
 485              scalar type (or the largest vector size, but we only
 486              support one vector size per loop).  */
 487           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 488                                                        &dummy);
 489           if (dump_enabled_p ())
 490             {
 491               dump_printf_loc (MSG_NOTE, vect_location,
 492                                "get vectype for scalar type:  ");
 493               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 494               dump_printf (MSG_NOTE, "\n");
 495             }
 496           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 497           if (!vf_vectype)
 498             {
 499               if (dump_enabled_p ())
 500                 {
 501                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 502                                    "not vectorized: unsupported data-type ");
 503                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 504                                      scalar_type);
 505                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 506                 }
 507               return false;
 508             }
 509
 510           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 511                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 512             {
 513               if (dump_enabled_p ())
 514                 {
 515                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 516                                    "not vectorized: different sized vector "
 517                                    "types in statement, ");
 518                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 519                                      vectype);
 520                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 521                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 522                                      vf_vectype);
 523                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 524                 }
 525               return false;
 526             }
 527
 528           if (dump_enabled_p ())
 529             {
 530               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 531               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 532               dump_printf (MSG_NOTE, "\n");
 533             }
 534
 535           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 536           if (dump_enabled_p ())
 537             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 538           if (!vectorization_factor
 539               || (nunits > vectorization_factor))
 540             vectorization_factor = nunits;
 541
 542           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 543             {
 544               pattern_def_seq = NULL;
 545               gsi_next (&si);
 546             }
 547         }
 548     }
 549
 550   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 551   if (dump_enabled_p ())
 552     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 553                      vectorization_factor);
 554   if (vectorization_factor <= 1)
 555     {
 556       if (dump_enabled_p ())
 557         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 558                          "not vectorized: unsupported data-type\n");
 559       return false;
 560     }
 561   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 562
 563   return true;
 564 }
 565
 566
 567 /* Function vect_is_simple_iv_evolution.
 568
 569    FORNOW: A simple evolution of an induction variables in the loop is
 570    considered a polynomial evolution.  */
 571
 572 static bool
 573 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 574                              tree * step)
 575 {
 576   tree init_expr;
 577   tree step_expr;
 578   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 579   basic_block bb;
 580
 581   /* When there is no evolution in this loop, the evolution function
 582      is not "simple".  */
 583   if (evolution_part == NULL_TREE)
 584     return false;
 585
 586   /* When the evolution is a polynomial of degree >= 2
 587      the evolution function is not "simple".  */
 588   if (tree_is_chrec (evolution_part))
 589     return false;
 590
 591   step_expr = evolution_part;
 592   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 593
 594   if (dump_enabled_p ())
 595     {
 596       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 597       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 598       dump_printf (MSG_NOTE, ",  init: ");
 599       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 600       dump_printf (MSG_NOTE, "\n");
 601     }
 602
 603   *init = init_expr;
 604   *step = step_expr;
 605
 606   if (TREE_CODE (step_expr) != INTEGER_CST
 607       && (TREE_CODE (step_expr) != SSA_NAME
 608           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 609               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 610           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 611               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 612                   || !flag_associative_math)))
 613       && (TREE_CODE (step_expr) != REAL_CST
 614           || !flag_associative_math))
 615     {
 616       if (dump_enabled_p ())
 617         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 618                          "step unknown.\n");
 619       return false;
 620     }
 621
 622   return true;
 623 }
 624
 625 /* Function vect_analyze_scalar_cycles_1.
 626
 627    Examine the cross iteration def-use cycles of scalar variables
 628    in LOOP.  LOOP_VINFO represents the loop that is now being
 629    considered for vectorization (can be LOOP, or an outer-loop
 630    enclosing LOOP).  */
 631
 632 static void
 633 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 634 {
 635   basic_block bb = loop->header;
 636   tree init, step;
 637   auto_vec<gimple, 64> worklist;
 638   gphi_iterator gsi;
 639   bool double_reduc;
 640
 641   if (dump_enabled_p ())
 642     dump_printf_loc (MSG_NOTE, vect_location,
 643                      "=== vect_analyze_scalar_cycles ===\n");
 644
 645   /* First - identify all inductions.  Reduction detection assumes that all the
 646      inductions have been identified, therefore, this order must not be
 647      changed.  */
 648   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 649     {
 650       gphi *phi = gsi.phi ();
 651       tree access_fn = NULL;
 652       tree def = PHI_RESULT (phi);
 653       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 654
 655       if (dump_enabled_p ())
 656         {
 657           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 658           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 659           dump_printf (MSG_NOTE, "\n");
 660         }
 661
 662       /* Skip virtual phi's.  The data dependences that are associated with
 663          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 664       if (virtual_operand_p (def))
 665         continue;
 666
 667       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 668
 669       /* Analyze the evolution function.  */
 670       access_fn = analyze_scalar_evolution (loop, def);
 671       if (access_fn)
 672         {
 673           STRIP_NOPS (access_fn);
 674           if (dump_enabled_p ())
 675             {
 676               dump_printf_loc (MSG_NOTE, vect_location,
 677                                "Access function of PHI: ");
 678               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 679               dump_printf (MSG_NOTE, "\n");
 680             }
 681           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 682             = evolution_part_in_loop_num (access_fn, loop->num);
 683         }
 684
 685       if (!access_fn
 686           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 687           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 688               && TREE_CODE (step) != INTEGER_CST))
 689         {
 690           worklist.safe_push (phi);
 691           continue;
 692         }
 693
 694       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 695
 696       if (dump_enabled_p ())
 697         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 698       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 699     }
 700
 701
 702   /* Second - identify all reductions and nested cycles.  */
 703   while (worklist.length () > 0)
 704     {
 705       gimple phi = worklist.pop ();
 706       tree def = PHI_RESULT (phi);
 707       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 708       gimple reduc_stmt;
 709       bool nested_cycle;
 710
 711       if (dump_enabled_p ())
 712         {
 713           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 714           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 715           dump_printf (MSG_NOTE, "\n");
 716         }
 717
 718       gcc_assert (!virtual_operand_p (def)
 719                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 720
 721       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 722       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 723                                                 &double_reduc);
 724       if (reduc_stmt)
 725         {
 726           if (double_reduc)
 727             {
 728               if (dump_enabled_p ())
 729                 dump_printf_loc (MSG_NOTE, vect_location,
 730                                  "Detected double reduction.\n");
 731
 732               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 733               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 734                                                     vect_double_reduction_def;
 735             }
 736           else
 737             {
 738               if (nested_cycle)
 739                 {
 740                   if (dump_enabled_p ())
 741                     dump_printf_loc (MSG_NOTE, vect_location,
 742                                      "Detected vectorizable nested cycle.\n");
 743
 744                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 745                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 746                                                              vect_nested_cycle;
 747                 }
 748               else
 749                 {
 750                   if (dump_enabled_p ())
 751                     dump_printf_loc (MSG_NOTE, vect_location,
 752                                      "Detected reduction.\n");
 753
 754                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 755                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 756                                                            vect_reduction_def;
 757                   /* Store the reduction cycles for possible vectorization in
 758                      loop-aware SLP.  */
 759                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 760                 }
 761             }
 762         }
 763       else
 764         if (dump_enabled_p ())
 765           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 766                            "Unknown def-use cycle pattern.\n");
 767     }
 768 }
 769
 770
 771 /* Function vect_analyze_scalar_cycles.
 772
 773    Examine the cross iteration def-use cycles of scalar variables, by
 774    analyzing the loop-header PHIs of scalar variables.  Classify each
 775    cycle as one of the following: invariant, induction, reduction, unknown.
 776    We do that for the loop represented by LOOP_VINFO, and also to its
 777    inner-loop, if exists.
 778    Examples for scalar cycles:
 779
 780    Example1: reduction:
 781
 782               loop1:
 783               for (i=0; i<N; i++)
 784                  sum += a[i];
 785
 786    Example2: induction:
 787
 788               loop2:
 789               for (i=0; i<N; i++)
 790                  a[i] = i;  */
 791
 792 static void
 793 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 794 {
 795   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 796
 797   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 798
 799   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 800      Reductions in such inner-loop therefore have different properties than
 801      the reductions in the nest that gets vectorized:
 802      1. When vectorized, they are executed in the same order as in the original
 803         scalar loop, so we can't change the order of computation when
 804         vectorizing them.
 805      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 806         current checks are too strict.  */
 807
 808   if (loop->inner)
 809     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 810 }
 811
 812
 813 /* Function vect_get_loop_niters.
 814
 815    Determine how many iterations the loop is executed and place it
 816    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 817    in NUMBER_OF_ITERATIONSM1.
 818
 819    Return the loop exit condition.  */
 820
 821
 822 static gcond *
 823 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
 824                       tree *number_of_iterationsm1)
 825 {
 826   tree niters;
 827
 828   if (dump_enabled_p ())
 829     dump_printf_loc (MSG_NOTE, vect_location,
 830                      "=== get_loop_niters ===\n");
 831
 832   niters = number_of_latch_executions (loop);
 833   *number_of_iterationsm1 = niters;
 834
 835   /* We want the number of loop header executions which is the number
 836      of latch executions plus one.
 837      ???  For UINT_MAX latch executions this number overflows to zero
 838      for loops like do { n++; } while (n != 0);  */
 839   if (niters && !chrec_contains_undetermined (niters))
 840     niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
 841                           build_int_cst (TREE_TYPE (niters), 1));
 842   *number_of_iterations = niters;
 843
 844   return get_loop_exit_condition (loop);
 845 }
 846
 847
 848 /* Function bb_in_loop_p
 849
 850    Used as predicate for dfs order traversal of the loop bbs.  */
 851
 852 static bool
 853 bb_in_loop_p (const_basic_block bb, const void *data)
 854 {
 855   const struct loop *const loop = (const struct loop *)data;
 856   if (flow_bb_inside_loop_p (loop, bb))
 857     return true;
 858   return false;
 859 }
 860
 861
 862 /* Function new_loop_vec_info.
 863
 864    Create and initialize a new loop_vec_info struct for LOOP, as well as
 865    stmt_vec_info structs for all the stmts in LOOP.  */
 866
 867 static loop_vec_info
 868 new_loop_vec_info (struct loop *loop)
 869 {
 870   loop_vec_info res;
 871   basic_block *bbs;
 872   gimple_stmt_iterator si;
 873   unsigned int i, nbbs;
 874
 875   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 876   LOOP_VINFO_LOOP (res) = loop;
 877
 878   bbs = get_loop_body (loop);
 879
 880   /* Create/Update stmt_info for all stmts in the loop.  */
 881   for (i = 0; i < loop->num_nodes; i++)
 882     {
 883       basic_block bb = bbs[i];
 884
 885       /* BBs in a nested inner-loop will have been already processed (because
 886          we will have called vect_analyze_loop_form for any nested inner-loop).
 887          Therefore, for stmts in an inner-loop we just want to update the
 888          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 889          loop_info of the outer-loop we are currently considering to vectorize
 890          (instead of the loop_info of the inner-loop).
 891          For stmts in other BBs we need to create a stmt_info from scratch.  */
 892       if (bb->loop_father != loop)
 893         {
 894           /* Inner-loop bb.  */
 895           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 896           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 897             {
 898               gimple phi = gsi_stmt (si);
 899               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 900               loop_vec_info inner_loop_vinfo =
 901                 STMT_VINFO_LOOP_VINFO (stmt_info);
 902               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 903               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 904             }
 905           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 906            {
 907               gimple stmt = gsi_stmt (si);
 908               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 909               loop_vec_info inner_loop_vinfo =
 910                  STMT_VINFO_LOOP_VINFO (stmt_info);
 911               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 912               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 913            }
 914         }
 915       else
 916         {
 917           /* bb in current nest.  */
 918           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 919             {
 920               gimple phi = gsi_stmt (si);
 921               gimple_set_uid (phi, 0);
 922               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 923             }
 924
 925           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 926             {
 927               gimple stmt = gsi_stmt (si);
 928               gimple_set_uid (stmt, 0);
 929               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 930             }
 931         }
 932     }
 933
 934   /* CHECKME: We want to visit all BBs before their successors (except for
 935      latch blocks, for which this assertion wouldn't hold).  In the simple
 936      case of the loop forms we allow, a dfs order of the BBs would the same
 937      as reversed postorder traversal, so we are safe.  */
 938
 939    free (bbs);
 940    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 941    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 942                               bbs, loop->num_nodes, loop);
 943    gcc_assert (nbbs == loop->num_nodes);
 944
 945   LOOP_VINFO_BBS (res) = bbs;
 946   LOOP_VINFO_NITERSM1 (res) = NULL;
 947   LOOP_VINFO_NITERS (res) = NULL;
 948   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 949   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 950   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
 951   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 952   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
 953   LOOP_VINFO_VECT_FACTOR (res) = 0;
 954   LOOP_VINFO_LOOP_NEST (res).create (3);
 955   LOOP_VINFO_DATAREFS (res).create (10);
 956   LOOP_VINFO_DDRS (res).create (10 * 10);
 957   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 958   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
 959              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 960   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
 961              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 962   LOOP_VINFO_GROUPED_STORES (res).create (10);
 963   LOOP_VINFO_REDUCTIONS (res).create (10);
 964   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
 965   LOOP_VINFO_SLP_INSTANCES (res).create (10);
 966   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 967   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 968   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 969   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
 970   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 971
 972   return res;
 973 }
 974
 975
 976 /* Function destroy_loop_vec_info.
 977
 978    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 979    stmts in the loop.  */
 980
 981 void
 982 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 983 {
 984   struct loop *loop;
 985   basic_block *bbs;
 986   int nbbs;
 987   gimple_stmt_iterator si;
 988   int j;
 989   vec<slp_instance> slp_instances;
 990   slp_instance instance;
 991   bool swapped;
 992
 993   if (!loop_vinfo)
 994     return;
 995
 996   loop = LOOP_VINFO_LOOP (loop_vinfo);
 997
 998   bbs = LOOP_VINFO_BBS (loop_vinfo);
 999   nbbs = clean_stmts ? loop->num_nodes : 0;
1000   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1001
1002   for (j = 0; j < nbbs; j++)
1003     {
1004       basic_block bb = bbs[j];
1005       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1006         free_stmt_vec_info (gsi_stmt (si));
1007
1008       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1009         {
1010           gimple stmt = gsi_stmt (si);
1011
1012           /* We may have broken canonical form by moving a constant
1013              into RHS1 of a commutative op.  Fix such occurrences.  */
1014           if (swapped && is_gimple_assign (stmt))
1015             {
1016               enum tree_code code = gimple_assign_rhs_code (stmt);
1017
1018               if ((code == PLUS_EXPR
1019                    || code == POINTER_PLUS_EXPR
1020                    || code == MULT_EXPR)
1021                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1022                 swap_ssa_operands (stmt,
1023                                    gimple_assign_rhs1_ptr (stmt),
1024                                    gimple_assign_rhs2_ptr (stmt));
1025             }
1026
1027           /* Free stmt_vec_info.  */
1028           free_stmt_vec_info (stmt);
1029           gsi_next (&si);
1030         }
1031     }
1032
1033   free (LOOP_VINFO_BBS (loop_vinfo));
1034   vect_destroy_datarefs (loop_vinfo, NULL);
1035   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1036   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1037   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1038   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1039   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1040   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1041     vect_free_slp_instance (instance);
1042
1043   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1044   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1045   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1046   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1047
1048   delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1049   LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1050
1051   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1052
1053   free (loop_vinfo);
1054   loop->aux = NULL;
1055 }
1056
1057
1058 /* Function vect_analyze_loop_1.
1059
1060    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1061    for it. The different analyses will record information in the
1062    loop_vec_info struct.  This is a subset of the analyses applied in
1063    vect_analyze_loop, to be applied on an inner-loop nested in the loop
1064    that is now considered for (outer-loop) vectorization.  */
1065
1066 static loop_vec_info
1067 vect_analyze_loop_1 (struct loop *loop)
1068 {
1069   loop_vec_info loop_vinfo;
1070
1071   if (dump_enabled_p ())
1072     dump_printf_loc (MSG_NOTE, vect_location,
1073                      "===== analyze_loop_nest_1 =====\n");
1074
1075   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1076
1077   loop_vinfo = vect_analyze_loop_form (loop);
1078   if (!loop_vinfo)
1079     {
1080       if (dump_enabled_p ())
1081         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1082                          "bad inner-loop form.\n");
1083       return NULL;
1084     }
1085
1086   return loop_vinfo;
1087 }
1088
1089
1090 /* Function vect_analyze_loop_form.
1091
1092    Verify that certain CFG restrictions hold, including:
1093    - the loop has a pre-header
1094    - the loop has a single entry and exit
1095    - the loop exit condition is simple enough, and the number of iterations
1096      can be analyzed (a countable loop).  */
1097
1098 loop_vec_info
1099 vect_analyze_loop_form (struct loop *loop)
1100 {
1101   loop_vec_info loop_vinfo;
1102   gcond *loop_cond;
1103   tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1104   loop_vec_info inner_loop_vinfo = NULL;
1105
1106   if (dump_enabled_p ())
1107     dump_printf_loc (MSG_NOTE, vect_location,
1108                      "=== vect_analyze_loop_form ===\n");
1109
1110   /* Different restrictions apply when we are considering an inner-most loop,
1111      vs. an outer (nested) loop.
1112      (FORNOW. May want to relax some of these restrictions in the future).  */
1113
1114   if (!loop->inner)
1115     {
1116       /* Inner-most loop.  We currently require that the number of BBs is
1117          exactly 2 (the header and latch).  Vectorizable inner-most loops
1118          look like this:
1119
1120                         (pre-header)
1121                            |
1122                           header <--------+
1123                            | |            |
1124                            | +--> latch --+
1125                            |
1126                         (exit-bb)  */
1127
1128       if (loop->num_nodes != 2)
1129         {
1130           if (dump_enabled_p ())
1131             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1132                              "not vectorized: control flow in loop.\n");
1133           return NULL;
1134         }
1135
1136       if (empty_block_p (loop->header))
1137         {
1138           if (dump_enabled_p ())
1139             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1140                              "not vectorized: empty loop.\n");
1141           return NULL;
1142         }
1143     }
1144   else
1145     {
1146       struct loop *innerloop = loop->inner;
1147       edge entryedge;
1148
1149       /* Nested loop. We currently require that the loop is doubly-nested,
1150          contains a single inner loop, and the number of BBs is exactly 5.
1151          Vectorizable outer-loops look like this:
1152
1153                         (pre-header)
1154                            |
1155                           header <---+
1156                            |         |
1157                           inner-loop |
1158                            |         |
1159                           tail ------+
1160                            |
1161                         (exit-bb)
1162
1163          The inner-loop has the properties expected of inner-most loops
1164          as described above.  */
1165
1166       if ((loop->inner)->inner || (loop->inner)->next)
1167         {
1168           if (dump_enabled_p ())
1169             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1170                              "not vectorized: multiple nested loops.\n");
1171           return NULL;
1172         }
1173
1174       /* Analyze the inner-loop.  */
1175       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1176       if (!inner_loop_vinfo)
1177         {
1178           if (dump_enabled_p ())
1179             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1180                              "not vectorized: Bad inner loop.\n");
1181           return NULL;
1182         }
1183
1184       if (!expr_invariant_in_loop_p (loop,
1185                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1186         {
1187           if (dump_enabled_p ())
1188             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1189                              "not vectorized: inner-loop count not"
1190                              " invariant.\n");
1191           destroy_loop_vec_info (inner_loop_vinfo, true);
1192           return NULL;
1193         }
1194
1195       if (loop->num_nodes != 5)
1196         {
1197           if (dump_enabled_p ())
1198             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199                              "not vectorized: control flow in loop.\n");
1200           destroy_loop_vec_info (inner_loop_vinfo, true);
1201           return NULL;
1202         }
1203
1204       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1205       entryedge = EDGE_PRED (innerloop->header, 0);
1206       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1207         entryedge = EDGE_PRED (innerloop->header, 1);
1208
1209       if (entryedge->src != loop->header
1210           || !single_exit (innerloop)
1211           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1212         {
1213           if (dump_enabled_p ())
1214             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1215                              "not vectorized: unsupported outerloop form.\n");
1216           destroy_loop_vec_info (inner_loop_vinfo, true);
1217           return NULL;
1218         }
1219
1220       if (dump_enabled_p ())
1221         dump_printf_loc (MSG_NOTE, vect_location,
1222                          "Considering outer-loop vectorization.\n");
1223     }
1224
1225   if (!single_exit (loop)
1226       || EDGE_COUNT (loop->header->preds) != 2)
1227     {
1228       if (dump_enabled_p ())
1229         {
1230           if (!single_exit (loop))
1231             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1232                              "not vectorized: multiple exits.\n");
1233           else if (EDGE_COUNT (loop->header->preds) != 2)
1234             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1235                              "not vectorized: too many incoming edges.\n");
1236         }
1237       if (inner_loop_vinfo)
1238         destroy_loop_vec_info (inner_loop_vinfo, true);
1239       return NULL;
1240     }
1241
1242   /* We assume that the loop exit condition is at the end of the loop. i.e,
1243      that the loop is represented as a do-while (with a proper if-guard
1244      before the loop if needed), where the loop header contains all the
1245      executable statements, and the latch is empty.  */
1246   if (!empty_block_p (loop->latch)
1247       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1248     {
1249       if (dump_enabled_p ())
1250         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1251                          "not vectorized: latch block not empty.\n");
1252       if (inner_loop_vinfo)
1253         destroy_loop_vec_info (inner_loop_vinfo, true);
1254       return NULL;
1255     }
1256
1257   /* Make sure there exists a single-predecessor exit bb:  */
1258   if (!single_pred_p (single_exit (loop)->dest))
1259     {
1260       edge e = single_exit (loop);
1261       if (!(e->flags & EDGE_ABNORMAL))
1262         {
1263           split_loop_exit_edge (e);
1264           if (dump_enabled_p ())
1265             dump_printf (MSG_NOTE, "split exit edge.\n");
1266         }
1267       else
1268         {
1269           if (dump_enabled_p ())
1270             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1271                              "not vectorized: abnormal loop exit edge.\n");
1272           if (inner_loop_vinfo)
1273             destroy_loop_vec_info (inner_loop_vinfo, true);
1274           return NULL;
1275         }
1276     }
1277
1278   loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1279                                     &number_of_iterationsm1);
1280   if (!loop_cond)
1281     {
1282       if (dump_enabled_p ())
1283         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1284                          "not vectorized: complicated exit condition.\n");
1285       if (inner_loop_vinfo)
1286         destroy_loop_vec_info (inner_loop_vinfo, true);
1287       return NULL;
1288     }
1289
1290   if (!number_of_iterations
1291       || chrec_contains_undetermined (number_of_iterations))
1292     {
1293       if (dump_enabled_p ())
1294         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1295                          "not vectorized: number of iterations cannot be "
1296                          "computed.\n");
1297       if (inner_loop_vinfo)
1298         destroy_loop_vec_info (inner_loop_vinfo, true);
1299       return NULL;
1300     }
1301
1302   if (integer_zerop (number_of_iterations))
1303     {
1304       if (dump_enabled_p ())
1305         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1306                          "not vectorized: number of iterations = 0.\n");
1307       if (inner_loop_vinfo)
1308         destroy_loop_vec_info (inner_loop_vinfo, true);
1309       return NULL;
1310     }
1311
1312   loop_vinfo = new_loop_vec_info (loop);
1313   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1314   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1315   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1316
1317   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1318     {
1319       if (dump_enabled_p ())
1320         {
1321           dump_printf_loc (MSG_NOTE, vect_location,
1322                            "Symbolic number of iterations is ");
1323           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1324           dump_printf (MSG_NOTE, "\n");
1325         }
1326     }
1327
1328   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1329
1330   /* CHECKME: May want to keep it around it in the future.  */
1331   if (inner_loop_vinfo)
1332     destroy_loop_vec_info (inner_loop_vinfo, false);
1333
1334   gcc_assert (!loop->aux);
1335   loop->aux = loop_vinfo;
1336   return loop_vinfo;
1337 }
1338
1339
1340 /* Function vect_analyze_loop_operations.
1341
1342    Scan the loop stmts and make sure they are all vectorizable.  */
1343
1344 static bool
1345 vect_analyze_loop_operations (loop_vec_info loop_vinfo, bool slp)
1346 {
1347   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1348   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1349   int nbbs = loop->num_nodes;
1350   unsigned int vectorization_factor = 0;
1351   int i;
1352   stmt_vec_info stmt_info;
1353   bool need_to_vectorize = false;
1354   int min_profitable_iters;
1355   int min_scalar_loop_bound;
1356   unsigned int th;
1357   bool only_slp_in_loop = true, ok;
1358   HOST_WIDE_INT max_niter;
1359   HOST_WIDE_INT estimated_niter;
1360   int min_profitable_estimate;
1361
1362   if (dump_enabled_p ())
1363     dump_printf_loc (MSG_NOTE, vect_location,
1364                      "=== vect_analyze_loop_operations ===\n");
1365
1366   gcc_assert (LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1367   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1368   if (slp)
1369     {
1370       /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1371          vectorization factor of the loop is the unrolling factor required by
1372          the SLP instances.  If that unrolling factor is 1, we say, that we
1373          perform pure SLP on loop - cross iteration parallelism is not
1374          exploited.  */
1375       for (i = 0; i < nbbs; i++)
1376         {
1377           basic_block bb = bbs[i];
1378           for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1379                gsi_next (&si))
1380             {
1381               gimple stmt = gsi_stmt (si);
1382               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1383               gcc_assert (stmt_info);
1384               if ((STMT_VINFO_RELEVANT_P (stmt_info)
1385                    || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1386                   && !PURE_SLP_STMT (stmt_info))
1387                 /* STMT needs both SLP and loop-based vectorization.  */
1388                 only_slp_in_loop = false;
1389             }
1390         }
1391
1392       if (only_slp_in_loop)
1393         vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1394       else
1395         vectorization_factor = least_common_multiple (vectorization_factor,
1396                                 LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1397
1398       LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1399       if (dump_enabled_p ())
1400         dump_printf_loc (MSG_NOTE, vect_location,
1401                          "Updating vectorization factor to %d\n",
1402                          vectorization_factor);
1403     }
1404
1405   for (i = 0; i < nbbs; i++)
1406     {
1407       basic_block bb = bbs[i];
1408
1409       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1410            gsi_next (&si))
1411         {
1412           gphi *phi = si.phi ();
1413           ok = true;
1414
1415           stmt_info = vinfo_for_stmt (phi);
1416           if (dump_enabled_p ())
1417             {
1418               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1419               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1420               dump_printf (MSG_NOTE, "\n");
1421             }
1422
1423           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1424              (i.e., a phi in the tail of the outer-loop).  */
1425           if (! is_loop_header_bb_p (bb))
1426             {
1427               /* FORNOW: we currently don't support the case that these phis
1428                  are not used in the outerloop (unless it is double reduction,
1429                  i.e., this phi is vect_reduction_def), cause this case
1430                  requires to actually do something here.  */
1431               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1432                    || STMT_VINFO_LIVE_P (stmt_info))
1433                   && STMT_VINFO_DEF_TYPE (stmt_info)
1434                      != vect_double_reduction_def)
1435                 {
1436                   if (dump_enabled_p ())
1437                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1438                                      "Unsupported loop-closed phi in "
1439                                      "outer-loop.\n");
1440                   return false;
1441                 }
1442
1443               /* If PHI is used in the outer loop, we check that its operand
1444                  is defined in the inner loop.  */
1445               if (STMT_VINFO_RELEVANT_P (stmt_info))
1446                 {
1447                   tree phi_op;
1448                   gimple op_def_stmt;
1449
1450                   if (gimple_phi_num_args (phi) != 1)
1451                     return false;
1452
1453                   phi_op = PHI_ARG_DEF (phi, 0);
1454                   if (TREE_CODE (phi_op) != SSA_NAME)
1455                     return false;
1456
1457                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1458                   if (gimple_nop_p (op_def_stmt)
1459                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1460                       || !vinfo_for_stmt (op_def_stmt))
1461                     return false;
1462
1463                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1464                         != vect_used_in_outer
1465                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1466                            != vect_used_in_outer_by_reduction)
1467                     return false;
1468                 }
1469
1470               continue;
1471             }
1472
1473           gcc_assert (stmt_info);
1474
1475           if (STMT_VINFO_LIVE_P (stmt_info))
1476             {
1477               /* FORNOW: not yet supported.  */
1478               if (dump_enabled_p ())
1479                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1480                                  "not vectorized: value used after loop.\n");
1481               return false;
1482             }
1483
1484           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1485               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1486             {
1487               /* A scalar-dependence cycle that we don't support.  */
1488               if (dump_enabled_p ())
1489                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1490                                  "not vectorized: scalar dependence cycle.\n");
1491               return false;
1492             }
1493
1494           if (STMT_VINFO_RELEVANT_P (stmt_info))
1495             {
1496               need_to_vectorize = true;
1497               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1498                 ok = vectorizable_induction (phi, NULL, NULL);
1499             }
1500
1501           if (!ok)
1502             {
1503               if (dump_enabled_p ())
1504                 {
1505                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1506                                    "not vectorized: relevant phi not "
1507                                    "supported: ");
1508                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1509                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1510                 }
1511               return false;
1512             }
1513         }
1514
1515       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1516            gsi_next (&si))
1517         {
1518           gimple stmt = gsi_stmt (si);
1519           if (!gimple_clobber_p (stmt)
1520               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1521             return false;
1522         }
1523     } /* bbs */
1524
1525   /* All operations in the loop are either irrelevant (deal with loop
1526      control, or dead), or only used outside the loop and can be moved
1527      out of the loop (e.g. invariants, inductions).  The loop can be
1528      optimized away by scalar optimizations.  We're better off not
1529      touching this loop.  */
1530   if (!need_to_vectorize)
1531     {
1532       if (dump_enabled_p ())
1533         dump_printf_loc (MSG_NOTE, vect_location,
1534                          "All the computation can be taken out of the loop.\n");
1535       if (dump_enabled_p ())
1536         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1537                          "not vectorized: redundant loop. no profit to "
1538                          "vectorize.\n");
1539       return false;
1540     }
1541
1542   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1543     dump_printf_loc (MSG_NOTE, vect_location,
1544                      "vectorization_factor = %d, niters = "
1545                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1546                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1547
1548   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1549        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1550       || ((max_niter = max_stmt_executions_int (loop)) != -1
1551           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1552     {
1553       if (dump_enabled_p ())
1554         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1555                          "not vectorized: iteration count too small.\n");
1556       if (dump_enabled_p ())
1557         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1558                          "not vectorized: iteration count smaller than "
1559                          "vectorization factor.\n");
1560       return false;
1561     }
1562
1563   /* Analyze cost.  Decide if worth while to vectorize.  */
1564
1565   /* Once VF is set, SLP costs should be updated since the number of created
1566      vector stmts depends on VF.  */
1567   vect_update_slp_costs_according_to_vf (loop_vinfo);
1568
1569   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1570                                       &min_profitable_estimate);
1571   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1572
1573   if (min_profitable_iters < 0)
1574     {
1575       if (dump_enabled_p ())
1576         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1577                          "not vectorized: vectorization not profitable.\n");
1578       if (dump_enabled_p ())
1579         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1580                          "not vectorized: vector version will never be "
1581                          "profitable.\n");
1582       return false;
1583     }
1584
1585   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1586                             * vectorization_factor) - 1);
1587
1588
1589   /* Use the cost model only if it is more conservative than user specified
1590      threshold.  */
1591
1592   th = (unsigned) min_scalar_loop_bound;
1593   if (min_profitable_iters
1594       && (!min_scalar_loop_bound
1595           || min_profitable_iters > min_scalar_loop_bound))
1596     th = (unsigned) min_profitable_iters;
1597
1598   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1599
1600   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1601       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1602     {
1603       if (dump_enabled_p ())
1604         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605                          "not vectorized: vectorization not profitable.\n");
1606       if (dump_enabled_p ())
1607         dump_printf_loc (MSG_NOTE, vect_location,
1608                          "not vectorized: iteration count smaller than user "
1609                          "specified loop bound parameter or minimum profitable "
1610                          "iterations (whichever is more conservative).\n");
1611       return false;
1612     }
1613
1614   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1615       && ((unsigned HOST_WIDE_INT) estimated_niter
1616           <= MAX (th, (unsigned)min_profitable_estimate)))
1617     {
1618       if (dump_enabled_p ())
1619         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1620                          "not vectorized: estimated iteration count too "
1621                          "small.\n");
1622       if (dump_enabled_p ())
1623         dump_printf_loc (MSG_NOTE, vect_location,
1624                          "not vectorized: estimated iteration count smaller "
1625                          "than specified loop bound parameter or minimum "
1626                          "profitable iterations (whichever is more "
1627                          "conservative).\n");
1628       return false;
1629     }
1630
1631   return true;
1632 }
1633
1634
1635 /* Function vect_analyze_loop_2.
1636
1637    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1638    for it.  The different analyses will record information in the
1639    loop_vec_info struct.  */
1640 static bool
1641 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1642 {
1643   bool ok, slp = false;
1644   int max_vf = MAX_VECTORIZATION_FACTOR;
1645   int min_vf = 2;
1646   unsigned int th;
1647   unsigned int n_stmts = 0;
1648
1649   /* Find all data references in the loop (which correspond to vdefs/vuses)
1650      and analyze their evolution in the loop.  Also adjust the minimal
1651      vectorization factor according to the loads and stores.
1652
1653      FORNOW: Handle only simple, array references, which
1654      alignment can be forced, and aligned pointer-references.  */
1655
1656   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1657   if (!ok)
1658     {
1659       if (dump_enabled_p ())
1660         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1661                          "bad data references.\n");
1662       return false;
1663     }
1664
1665   /* Classify all cross-iteration scalar data-flow cycles.
1666      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1667
1668   vect_analyze_scalar_cycles (loop_vinfo);
1669
1670   vect_pattern_recog (loop_vinfo, NULL);
1671
1672   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1673      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1674
1675   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1676   if (!ok)
1677     {
1678       if (dump_enabled_p ())
1679         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1680                          "bad data access.\n");
1681       return false;
1682     }
1683
1684   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1685
1686   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1687   if (!ok)
1688     {
1689       if (dump_enabled_p ())
1690         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1691                          "unexpected pattern.\n");
1692       return false;
1693     }
1694
1695   /* Analyze data dependences between the data-refs in the loop
1696      and adjust the maximum vectorization factor according to
1697      the dependences.
1698      FORNOW: fail at the first data dependence that we encounter.  */
1699
1700   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1701   if (!ok
1702       || max_vf < min_vf)
1703     {
1704       if (dump_enabled_p ())
1705             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1706                              "bad data dependence.\n");
1707       return false;
1708     }
1709
1710   ok = vect_determine_vectorization_factor (loop_vinfo);
1711   if (!ok)
1712     {
1713       if (dump_enabled_p ())
1714         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1715                          "can't determine vectorization factor.\n");
1716       return false;
1717     }
1718   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1719     {
1720       if (dump_enabled_p ())
1721         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1722                          "bad data dependence.\n");
1723       return false;
1724     }
1725
1726   /* Analyze the alignment of the data-refs in the loop.
1727      Fail if a data reference is found that cannot be vectorized.  */
1728
1729   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1730   if (!ok)
1731     {
1732       if (dump_enabled_p ())
1733         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1734                          "bad data alignment.\n");
1735       return false;
1736     }
1737
1738   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1739      It is important to call pruning after vect_analyze_data_ref_accesses,
1740      since we use grouping information gathered by interleaving analysis.  */
1741   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1742   if (!ok)
1743     {
1744       if (dump_enabled_p ())
1745         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1746                          "number of versioning for alias "
1747                          "run-time tests exceeds %d "
1748                          "(--param vect-max-version-for-alias-checks)\n",
1749                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1750       return false;
1751     }
1752
1753   /* This pass will decide on using loop versioning and/or loop peeling in
1754      order to enhance the alignment of data references in the loop.  */
1755
1756   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1757   if (!ok)
1758     {
1759       if (dump_enabled_p ())
1760         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1761                          "bad data alignment.\n");
1762       return false;
1763     }
1764
1765   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1766   ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1767   if (ok)
1768     {
1769       /* Decide which possible SLP instances to SLP.  */
1770       slp = vect_make_slp_decision (loop_vinfo);
1771
1772       /* Find stmts that need to be both vectorized and SLPed.  */
1773       vect_detect_hybrid_slp (loop_vinfo);
1774     }
1775   else
1776     return false;
1777
1778   /* Scan all the operations in the loop and make sure they are
1779      vectorizable.  */
1780
1781   ok = vect_analyze_loop_operations (loop_vinfo, slp);
1782   if (!ok)
1783     {
1784       if (dump_enabled_p ())
1785         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1786                          "bad operation or unsupported loop bound.\n");
1787       return false;
1788     }
1789
1790   /* Decide whether we need to create an epilogue loop to handle
1791      remaining scalar iterations.  */
1792   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1793         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1794        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1795
1796   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1797       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1798     {
1799       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1800                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1801           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1802         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1803     }
1804   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1805            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1806                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1807                /* In case of versioning, check if the maximum number of
1808                   iterations is greater than th.  If they are identical,
1809                   the epilogue is unnecessary.  */
1810                && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1811                     && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1812                    || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1813                         (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1814     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1815
1816   /* If an epilogue loop is required make sure we can create one.  */
1817   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1818       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1819     {
1820       if (dump_enabled_p ())
1821         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1822       if (!vect_can_advance_ivs_p (loop_vinfo)
1823           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1824                                            single_exit (LOOP_VINFO_LOOP
1825                                                          (loop_vinfo))))
1826         {
1827           if (dump_enabled_p ())
1828             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1829                              "not vectorized: can't create required "
1830                              "epilog loop\n");
1831           return false;
1832         }
1833     }
1834
1835   return true;
1836 }
1837
1838 /* Function vect_analyze_loop.
1839
1840    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1841    for it.  The different analyses will record information in the
1842    loop_vec_info struct.  */
1843 loop_vec_info
1844 vect_analyze_loop (struct loop *loop)
1845 {
1846   loop_vec_info loop_vinfo;
1847   unsigned int vector_sizes;
1848
1849   /* Autodetect first vector size we try.  */
1850   current_vector_size = 0;
1851   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1852
1853   if (dump_enabled_p ())
1854     dump_printf_loc (MSG_NOTE, vect_location,
1855                      "===== analyze_loop_nest =====\n");
1856
1857   if (loop_outer (loop)
1858       && loop_vec_info_for_loop (loop_outer (loop))
1859       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1860     {
1861       if (dump_enabled_p ())
1862         dump_printf_loc (MSG_NOTE, vect_location,
1863                          "outer-loop already vectorized.\n");
1864       return NULL;
1865     }
1866
1867   while (1)
1868     {
1869       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1870       loop_vinfo = vect_analyze_loop_form (loop);
1871       if (!loop_vinfo)
1872         {
1873           if (dump_enabled_p ())
1874             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1875                              "bad loop form.\n");
1876           return NULL;
1877         }
1878
1879       if (vect_analyze_loop_2 (loop_vinfo))
1880         {
1881           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1882
1883           return loop_vinfo;
1884         }
1885
1886       destroy_loop_vec_info (loop_vinfo, true);
1887
1888       vector_sizes &= ~current_vector_size;
1889       if (vector_sizes == 0
1890           || current_vector_size == 0)
1891         return NULL;
1892
1893       /* Try the next biggest vector size.  */
1894       current_vector_size = 1 << floor_log2 (vector_sizes);
1895       if (dump_enabled_p ())
1896         dump_printf_loc (MSG_NOTE, vect_location,
1897                          "***** Re-trying analysis with "
1898                          "vector size %d\n", current_vector_size);
1899     }
1900 }
1901
1902
1903 /* Function reduction_code_for_scalar_code
1904
1905    Input:
1906    CODE - tree_code of a reduction operations.
1907
1908    Output:
1909    REDUC_CODE - the corresponding tree-code to be used to reduce the
1910       vector of partial results into a single scalar result, or ERROR_MARK
1911       if the operation is a supported reduction operation, but does not have
1912       such a tree-code.
1913
1914    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1915
1916 static bool
1917 reduction_code_for_scalar_code (enum tree_code code,
1918                                 enum tree_code *reduc_code)
1919 {
1920   switch (code)
1921     {
1922       case MAX_EXPR:
1923         *reduc_code = REDUC_MAX_EXPR;
1924         return true;
1925
1926       case MIN_EXPR:
1927         *reduc_code = REDUC_MIN_EXPR;
1928         return true;
1929
1930       case PLUS_EXPR:
1931         *reduc_code = REDUC_PLUS_EXPR;
1932         return true;
1933
1934       case MULT_EXPR:
1935       case MINUS_EXPR:
1936       case BIT_IOR_EXPR:
1937       case BIT_XOR_EXPR:
1938       case BIT_AND_EXPR:
1939         *reduc_code = ERROR_MARK;
1940         return true;
1941
1942       default:
1943        return false;
1944     }
1945 }
1946
1947
1948 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
1949    STMT is printed with a message MSG. */
1950
1951 static void
1952 report_vect_op (int msg_type, gimple stmt, const char *msg)
1953 {
1954   dump_printf_loc (msg_type, vect_location, "%s", msg);
1955   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
1956   dump_printf (msg_type, "\n");
1957 }
1958
1959
1960 /* Detect SLP reduction of the form:
1961
1962    #a1 = phi <a5, a0>
1963    a2 = operation (a1)
1964    a3 = operation (a2)
1965    a4 = operation (a3)
1966    a5 = operation (a4)
1967
1968    #a = phi <a5>
1969
1970    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
1971    FIRST_STMT is the first reduction stmt in the chain
1972    (a2 = operation (a1)).
1973
1974    Return TRUE if a reduction chain was detected.  */
1975
1976 static bool
1977 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
1978 {
1979   struct loop *loop = (gimple_bb (phi))->loop_father;
1980   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
1981   enum tree_code code;
1982   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
1983   stmt_vec_info use_stmt_info, current_stmt_info;
1984   tree lhs;
1985   imm_use_iterator imm_iter;
1986   use_operand_p use_p;
1987   int nloop_uses, size = 0, n_out_of_loop_uses;
1988   bool found = false;
1989
1990   if (loop != vect_loop)
1991     return false;
1992
1993   lhs = PHI_RESULT (phi);
1994   code = gimple_assign_rhs_code (first_stmt);
1995   while (1)
1996     {
1997       nloop_uses = 0;
1998       n_out_of_loop_uses = 0;
1999       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2000         {
2001           gimple use_stmt = USE_STMT (use_p);
2002           if (is_gimple_debug (use_stmt))
2003             continue;
2004
2005           /* Check if we got back to the reduction phi.  */
2006           if (use_stmt == phi)
2007             {
2008               loop_use_stmt = use_stmt;
2009               found = true;
2010               break;
2011             }
2012
2013           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2014             {
2015               if (vinfo_for_stmt (use_stmt)
2016                   && !STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (use_stmt)))
2017                 {
2018                   loop_use_stmt = use_stmt;
2019                   nloop_uses++;
2020                 }
2021             }
2022            else
2023              n_out_of_loop_uses++;
2024
2025            /* There are can be either a single use in the loop or two uses in
2026               phi nodes.  */
2027            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2028              return false;
2029         }
2030
2031       if (found)
2032         break;
2033
2034       /* We reached a statement with no loop uses.  */
2035       if (nloop_uses == 0)
2036         return false;
2037
2038       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2039       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2040         return false;
2041
2042       if (!is_gimple_assign (loop_use_stmt)
2043           || code != gimple_assign_rhs_code (loop_use_stmt)
2044           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2045         return false;
2046
2047       /* Insert USE_STMT into reduction chain.  */
2048       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2049       if (current_stmt)
2050         {
2051           current_stmt_info = vinfo_for_stmt (current_stmt);
2052           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2053           GROUP_FIRST_ELEMENT (use_stmt_info)
2054             = GROUP_FIRST_ELEMENT (current_stmt_info);
2055         }
2056       else
2057         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2058
2059       lhs = gimple_assign_lhs (loop_use_stmt);
2060       current_stmt = loop_use_stmt;
2061       size++;
2062    }
2063
2064   if (!found || loop_use_stmt != phi || size < 2)
2065     return false;
2066
2067   /* Swap the operands, if needed, to make the reduction operand be the second
2068      operand.  */
2069   lhs = PHI_RESULT (phi);
2070   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2071   while (next_stmt)
2072     {
2073       if (gimple_assign_rhs2 (next_stmt) == lhs)
2074         {
2075           tree op = gimple_assign_rhs1 (next_stmt);
2076           gimple def_stmt = NULL;
2077
2078           if (TREE_CODE (op) == SSA_NAME)
2079             def_stmt = SSA_NAME_DEF_STMT (op);
2080
2081           /* Check that the other def is either defined in the loop
2082              ("vect_internal_def"), or it's an induction (defined by a
2083              loop-header phi-node).  */
2084           if (def_stmt
2085               && gimple_bb (def_stmt)
2086               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2087               && (is_gimple_assign (def_stmt)
2088                   || is_gimple_call (def_stmt)
2089                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2090                            == vect_induction_def
2091                   || (gimple_code (def_stmt) == GIMPLE_PHI
2092                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2093                                   == vect_internal_def
2094                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2095             {
2096               lhs = gimple_assign_lhs (next_stmt);
2097               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2098               continue;
2099             }
2100
2101           return false;
2102         }
2103       else
2104         {
2105           tree op = gimple_assign_rhs2 (next_stmt);
2106           gimple def_stmt = NULL;
2107
2108           if (TREE_CODE (op) == SSA_NAME)
2109             def_stmt = SSA_NAME_DEF_STMT (op);
2110
2111           /* Check that the other def is either defined in the loop
2112             ("vect_internal_def"), or it's an induction (defined by a
2113             loop-header phi-node).  */
2114           if (def_stmt
2115               && gimple_bb (def_stmt)
2116               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2117               && (is_gimple_assign (def_stmt)
2118                   || is_gimple_call (def_stmt)
2119                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2120                               == vect_induction_def
2121                   || (gimple_code (def_stmt) == GIMPLE_PHI
2122                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2123                                   == vect_internal_def
2124                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2125             {
2126               if (dump_enabled_p ())
2127                 {
2128                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2129                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2130                   dump_printf (MSG_NOTE, "\n");
2131                 }
2132
2133               swap_ssa_operands (next_stmt,
2134                                  gimple_assign_rhs1_ptr (next_stmt),
2135                                  gimple_assign_rhs2_ptr (next_stmt));
2136               update_stmt (next_stmt);
2137
2138               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2139                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2140             }
2141           else
2142             return false;
2143         }
2144
2145       lhs = gimple_assign_lhs (next_stmt);
2146       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2147     }
2148
2149   /* Save the chain for further analysis in SLP detection.  */
2150   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2151   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2152   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2153
2154   return true;
2155 }
2156
2157
2158 /* Function vect_is_simple_reduction_1
2159
2160    (1) Detect a cross-iteration def-use cycle that represents a simple
2161    reduction computation.  We look for the following pattern:
2162
2163    loop_header:
2164      a1 = phi < a0, a2 >
2165      a3 = ...
2166      a2 = operation (a3, a1)
2167
2168    or
2169
2170    a3 = ...
2171    loop_header:
2172      a1 = phi < a0, a2 >
2173      a2 = operation (a3, a1)
2174
2175    such that:
2176    1. operation is commutative and associative and it is safe to
2177       change the order of the computation (if CHECK_REDUCTION is true)
2178    2. no uses for a2 in the loop (a2 is used out of the loop)
2179    3. no uses of a1 in the loop besides the reduction operation
2180    4. no uses of a1 outside the loop.
2181
2182    Conditions 1,4 are tested here.
2183    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2184
2185    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2186    nested cycles, if CHECK_REDUCTION is false.
2187
2188    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2189    reductions:
2190
2191      a1 = phi < a0, a2 >
2192      inner loop (def of a3)
2193      a2 = phi < a3 >
2194
2195    If MODIFY is true it tries also to rework the code in-place to enable
2196    detection of more reduction patterns.  For the time being we rewrite
2197    "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2198 */
2199
2200 static gimple
2201 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2202                             bool check_reduction, bool *double_reduc,
2203                             bool modify)
2204 {
2205   struct loop *loop = (gimple_bb (phi))->loop_father;
2206   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2207   edge latch_e = loop_latch_edge (loop);
2208   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2209   gimple def_stmt, def1 = NULL, def2 = NULL;
2210   enum tree_code orig_code, code;
2211   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2212   tree type;
2213   int nloop_uses;
2214   tree name;
2215   imm_use_iterator imm_iter;
2216   use_operand_p use_p;
2217   bool phi_def;
2218
2219   *double_reduc = false;
2220
2221   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2222      otherwise, we assume outer loop vectorization.  */
2223   gcc_assert ((check_reduction && loop == vect_loop)
2224               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2225
2226   name = PHI_RESULT (phi);
2227   /* ???  If there are no uses of the PHI result the inner loop reduction
2228      won't be detected as possibly double-reduction by vectorizable_reduction
2229      because that tries to walk the PHI arg from the preheader edge which
2230      can be constant.  See PR60382.  */
2231   if (has_zero_uses (name))
2232     return NULL;
2233   nloop_uses = 0;
2234   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2235     {
2236       gimple use_stmt = USE_STMT (use_p);
2237       if (is_gimple_debug (use_stmt))
2238         continue;
2239
2240       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2241         {
2242           if (dump_enabled_p ())
2243             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2244                              "intermediate value used outside loop.\n");
2245
2246           return NULL;
2247         }
2248
2249       if (vinfo_for_stmt (use_stmt)
2250           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2251         nloop_uses++;
2252       if (nloop_uses > 1)
2253         {
2254           if (dump_enabled_p ())
2255             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2256                              "reduction used in loop.\n");
2257           return NULL;
2258         }
2259     }
2260
2261   if (TREE_CODE (loop_arg) != SSA_NAME)
2262     {
2263       if (dump_enabled_p ())
2264         {
2265           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2266                            "reduction: not ssa_name: ");
2267           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2268           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2269         }
2270       return NULL;
2271     }
2272
2273   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2274   if (!def_stmt)
2275     {
2276       if (dump_enabled_p ())
2277         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2278                          "reduction: no def_stmt.\n");
2279       return NULL;
2280     }
2281
2282   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2283     {
2284       if (dump_enabled_p ())
2285         {
2286           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2287           dump_printf (MSG_NOTE, "\n");
2288         }
2289       return NULL;
2290     }
2291
2292   if (is_gimple_assign (def_stmt))
2293     {
2294       name = gimple_assign_lhs (def_stmt);
2295       phi_def = false;
2296     }
2297   else
2298     {
2299       name = PHI_RESULT (def_stmt);
2300       phi_def = true;
2301     }
2302
2303   nloop_uses = 0;
2304   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2305     {
2306       gimple use_stmt = USE_STMT (use_p);
2307       if (is_gimple_debug (use_stmt))
2308         continue;
2309       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt))
2310           && vinfo_for_stmt (use_stmt)
2311           && !is_pattern_stmt_p (vinfo_for_stmt (use_stmt)))
2312         nloop_uses++;
2313       if (nloop_uses > 1)
2314         {
2315           if (dump_enabled_p ())
2316             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2317                              "reduction used in loop.\n");
2318           return NULL;
2319         }
2320     }
2321
2322   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2323      defined in the inner loop.  */
2324   if (phi_def)
2325     {
2326       op1 = PHI_ARG_DEF (def_stmt, 0);
2327
2328       if (gimple_phi_num_args (def_stmt) != 1
2329           || TREE_CODE (op1) != SSA_NAME)
2330         {
2331           if (dump_enabled_p ())
2332             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2333                              "unsupported phi node definition.\n");
2334
2335           return NULL;
2336         }
2337
2338       def1 = SSA_NAME_DEF_STMT (op1);
2339       if (gimple_bb (def1)
2340           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2341           && loop->inner
2342           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2343           && is_gimple_assign (def1))
2344         {
2345           if (dump_enabled_p ())
2346             report_vect_op (MSG_NOTE, def_stmt,
2347                             "detected double reduction: ");
2348
2349           *double_reduc = true;
2350           return def_stmt;
2351         }
2352
2353       return NULL;
2354     }
2355
2356   code = orig_code = gimple_assign_rhs_code (def_stmt);
2357
2358   /* We can handle "res -= x[i]", which is non-associative by
2359      simply rewriting this into "res += -x[i]".  Avoid changing
2360      gimple instruction for the first simple tests and only do this
2361      if we're allowed to change code at all.  */
2362   if (code == MINUS_EXPR
2363       && modify
2364       && (op1 = gimple_assign_rhs1 (def_stmt))
2365       && TREE_CODE (op1) == SSA_NAME
2366       && SSA_NAME_DEF_STMT (op1) == phi)
2367     code = PLUS_EXPR;
2368
2369   if (check_reduction
2370       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2371     {
2372       if (dump_enabled_p ())
2373         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2374                         "reduction: not commutative/associative: ");
2375       return NULL;
2376     }
2377
2378   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2379     {
2380       if (code != COND_EXPR)
2381         {
2382           if (dump_enabled_p ())
2383             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2384                             "reduction: not binary operation: ");
2385
2386           return NULL;
2387         }
2388
2389       op3 = gimple_assign_rhs1 (def_stmt);
2390       if (COMPARISON_CLASS_P (op3))
2391         {
2392           op4 = TREE_OPERAND (op3, 1);
2393           op3 = TREE_OPERAND (op3, 0);
2394         }
2395
2396       op1 = gimple_assign_rhs2 (def_stmt);
2397       op2 = gimple_assign_rhs3 (def_stmt);
2398
2399       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2400         {
2401           if (dump_enabled_p ())
2402             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2403                             "reduction: uses not ssa_names: ");
2404
2405           return NULL;
2406         }
2407     }
2408   else
2409     {
2410       op1 = gimple_assign_rhs1 (def_stmt);
2411       op2 = gimple_assign_rhs2 (def_stmt);
2412
2413       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2414         {
2415           if (dump_enabled_p ())
2416             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2417                             "reduction: uses not ssa_names: ");
2418
2419           return NULL;
2420         }
2421    }
2422
2423   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2424   if ((TREE_CODE (op1) == SSA_NAME
2425        && !types_compatible_p (type,TREE_TYPE (op1)))
2426       || (TREE_CODE (op2) == SSA_NAME
2427           && !types_compatible_p (type, TREE_TYPE (op2)))
2428       || (op3 && TREE_CODE (op3) == SSA_NAME
2429           && !types_compatible_p (type, TREE_TYPE (op3)))
2430       || (op4 && TREE_CODE (op4) == SSA_NAME
2431           && !types_compatible_p (type, TREE_TYPE (op4))))
2432     {
2433       if (dump_enabled_p ())
2434         {
2435           dump_printf_loc (MSG_NOTE, vect_location,
2436                            "reduction: multiple types: operation type: ");
2437           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2438           dump_printf (MSG_NOTE, ", operands types: ");
2439           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2440                              TREE_TYPE (op1));
2441           dump_printf (MSG_NOTE, ",");
2442           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2443                              TREE_TYPE (op2));
2444           if (op3)
2445             {
2446               dump_printf (MSG_NOTE, ",");
2447               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2448                                  TREE_TYPE (op3));
2449             }
2450
2451           if (op4)
2452             {
2453               dump_printf (MSG_NOTE, ",");
2454               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2455                                  TREE_TYPE (op4));
2456             }
2457           dump_printf (MSG_NOTE, "\n");
2458         }
2459
2460       return NULL;
2461     }
2462
2463   /* Check that it's ok to change the order of the computation.
2464      Generally, when vectorizing a reduction we change the order of the
2465      computation.  This may change the behavior of the program in some
2466      cases, so we need to check that this is ok.  One exception is when
2467      vectorizing an outer-loop: the inner-loop is executed sequentially,
2468      and therefore vectorizing reductions in the inner-loop during
2469      outer-loop vectorization is safe.  */
2470
2471   /* CHECKME: check for !flag_finite_math_only too?  */
2472   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2473       && check_reduction)
2474     {
2475       /* Changing the order of operations changes the semantics.  */
2476       if (dump_enabled_p ())
2477         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2478                         "reduction: unsafe fp math optimization: ");
2479       return NULL;
2480     }
2481   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2482            && check_reduction)
2483     {
2484       /* Changing the order of operations changes the semantics.  */
2485       if (dump_enabled_p ())
2486         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2487                         "reduction: unsafe int math optimization: ");
2488       return NULL;
2489     }
2490   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2491     {
2492       /* Changing the order of operations changes the semantics.  */
2493       if (dump_enabled_p ())
2494         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2495                         "reduction: unsafe fixed-point math optimization: ");
2496       return NULL;
2497     }
2498
2499   /* If we detected "res -= x[i]" earlier, rewrite it into
2500      "res += -x[i]" now.  If this turns out to be useless reassoc
2501      will clean it up again.  */
2502   if (orig_code == MINUS_EXPR)
2503     {
2504       tree rhs = gimple_assign_rhs2 (def_stmt);
2505       tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2506       gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2507       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2508       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2509                                                           loop_info, NULL));
2510       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2511       gimple_assign_set_rhs2 (def_stmt, negrhs);
2512       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2513       update_stmt (def_stmt);
2514     }
2515
2516   /* Reduction is safe. We're dealing with one of the following:
2517      1) integer arithmetic and no trapv
2518      2) floating point arithmetic, and special flags permit this optimization
2519      3) nested cycle (i.e., outer loop vectorization).  */
2520   if (TREE_CODE (op1) == SSA_NAME)
2521     def1 = SSA_NAME_DEF_STMT (op1);
2522
2523   if (TREE_CODE (op2) == SSA_NAME)
2524     def2 = SSA_NAME_DEF_STMT (op2);
2525
2526   if (code != COND_EXPR
2527       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2528     {
2529       if (dump_enabled_p ())
2530         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2531       return NULL;
2532     }
2533
2534   /* Check that one def is the reduction def, defined by PHI,
2535      the other def is either defined in the loop ("vect_internal_def"),
2536      or it's an induction (defined by a loop-header phi-node).  */
2537
2538   if (def2 && def2 == phi
2539       && (code == COND_EXPR
2540           || !def1 || gimple_nop_p (def1)
2541           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2542           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2543               && (is_gimple_assign (def1)
2544                   || is_gimple_call (def1)
2545                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2546                       == vect_induction_def
2547                   || (gimple_code (def1) == GIMPLE_PHI
2548                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2549                           == vect_internal_def
2550                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2551     {
2552       if (dump_enabled_p ())
2553         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2554       return def_stmt;
2555     }
2556
2557   if (def1 && def1 == phi
2558       && (code == COND_EXPR
2559           || !def2 || gimple_nop_p (def2)
2560           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2561           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2562               && (is_gimple_assign (def2)
2563                   || is_gimple_call (def2)
2564                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2565                       == vect_induction_def
2566                   || (gimple_code (def2) == GIMPLE_PHI
2567                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2568                           == vect_internal_def
2569                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2570     {
2571       if (check_reduction)
2572         {
2573           /* Swap operands (just for simplicity - so that the rest of the code
2574              can assume that the reduction variable is always the last (second)
2575              argument).  */
2576           if (dump_enabled_p ())
2577             report_vect_op (MSG_NOTE, def_stmt,
2578                             "detected reduction: need to swap operands: ");
2579
2580           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2581                              gimple_assign_rhs2_ptr (def_stmt));
2582
2583           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2584             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2585         }
2586       else
2587         {
2588           if (dump_enabled_p ())
2589             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2590         }
2591
2592       return def_stmt;
2593     }
2594
2595   /* Try to find SLP reduction chain.  */
2596   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2597     {
2598       if (dump_enabled_p ())
2599         report_vect_op (MSG_NOTE, def_stmt,
2600                         "reduction: detected reduction chain: ");
2601
2602       return def_stmt;
2603     }
2604
2605   if (dump_enabled_p ())
2606     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2607                     "reduction: unknown pattern: ");
2608
2609   return NULL;
2610 }
2611
2612 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2613    in-place.  Arguments as there.  */
2614
2615 static gimple
2616 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2617                           bool check_reduction, bool *double_reduc)
2618 {
2619   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2620                                      double_reduc, false);
2621 }
2622
2623 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2624    in-place if it enables detection of more reductions.  Arguments
2625    as there.  */
2626
2627 gimple
2628 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2629                           bool check_reduction, bool *double_reduc)
2630 {
2631   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2632                                      double_reduc, true);
2633 }
2634
2635 /* Calculate the cost of one scalar iteration of the loop.  */
2636 int
2637 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
2638 {
2639   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
2640   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
2641   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
2642   int innerloop_iters, i, stmt_cost;
2643
2644   /* Count statements in scalar loop.  Using this as scalar cost for a single
2645      iteration for now.
2646
2647      TODO: Add outer loop support.
2648
2649      TODO: Consider assigning different costs to different scalar
2650      statements.  */
2651
2652   /* FORNOW.  */
2653   innerloop_iters = 1;
2654   if (loop->inner)
2655     innerloop_iters = 50; /* FIXME */
2656
2657   for (i = 0; i < nbbs; i++)
2658     {
2659       gimple_stmt_iterator si;
2660       basic_block bb = bbs[i];
2661
2662       if (bb->loop_father == loop->inner)
2663         factor = innerloop_iters;
2664       else
2665         factor = 1;
2666
2667       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
2668         {
2669           gimple stmt = gsi_stmt (si);
2670           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
2671
2672           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
2673             continue;
2674
2675           /* Skip stmts that are not vectorized inside the loop.  */
2676           if (stmt_info
2677               && !STMT_VINFO_RELEVANT_P (stmt_info)
2678               && (!STMT_VINFO_LIVE_P (stmt_info)
2679                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
2680               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
2681             continue;
2682
2683           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
2684             {
2685               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
2686                stmt_cost = vect_get_stmt_cost (scalar_load);
2687              else
2688                stmt_cost = vect_get_stmt_cost (scalar_store);
2689             }
2690           else
2691             stmt_cost = vect_get_stmt_cost (scalar_stmt);
2692
2693           scalar_single_iter_cost += stmt_cost * factor;
2694         }
2695     }
2696   return scalar_single_iter_cost;
2697 }
2698
2699 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2700 int
2701 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2702                              int *peel_iters_epilogue,
2703                              int scalar_single_iter_cost,
2704                              stmt_vector_for_cost *prologue_cost_vec,
2705                              stmt_vector_for_cost *epilogue_cost_vec)
2706 {
2707   int retval = 0;
2708   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2709
2710   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2711     {
2712       *peel_iters_epilogue = vf/2;
2713       if (dump_enabled_p ())
2714         dump_printf_loc (MSG_NOTE, vect_location,
2715                          "cost model: epilogue peel iters set to vf/2 "
2716                          "because loop iterations are unknown .\n");
2717
2718       /* If peeled iterations are known but number of scalar loop
2719          iterations are unknown, count a taken branch per peeled loop.  */
2720       retval = record_stmt_cost (prologue_cost_vec, 2, cond_branch_taken,
2721                                  NULL, 0, vect_prologue);
2722     }
2723   else
2724     {
2725       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2726       peel_iters_prologue = niters < peel_iters_prologue ?
2727                             niters : peel_iters_prologue;
2728       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2729       /* If we need to peel for gaps, but no peeling is required, we have to
2730          peel VF iterations.  */
2731       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2732         *peel_iters_epilogue = vf;
2733     }
2734
2735   if (peel_iters_prologue)
2736     retval += record_stmt_cost (prologue_cost_vec,
2737                                 peel_iters_prologue * scalar_single_iter_cost,
2738                                 scalar_stmt, NULL, 0, vect_prologue);
2739   if (*peel_iters_epilogue)
2740     retval += record_stmt_cost (epilogue_cost_vec,
2741                                 *peel_iters_epilogue * scalar_single_iter_cost,
2742                                 scalar_stmt, NULL, 0, vect_epilogue);
2743   return retval;
2744 }
2745
2746 /* Function vect_estimate_min_profitable_iters
2747
2748    Return the number of iterations required for the vector version of the
2749    loop to be profitable relative to the cost of the scalar version of the
2750    loop.  */
2751
2752 static void
2753 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2754                                     int *ret_min_profitable_niters,
2755                                     int *ret_min_profitable_estimate)
2756 {
2757   int min_profitable_iters;
2758   int min_profitable_estimate;
2759   int peel_iters_prologue;
2760   int peel_iters_epilogue;
2761   unsigned vec_inside_cost = 0;
2762   int vec_outside_cost = 0;
2763   unsigned vec_prologue_cost = 0;
2764   unsigned vec_epilogue_cost = 0;
2765   int scalar_single_iter_cost = 0;
2766   int scalar_outside_cost = 0;
2767   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2768   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2769   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2770
2771   /* Cost model disabled.  */
2772   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2773     {
2774       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2775       *ret_min_profitable_niters = 0;
2776       *ret_min_profitable_estimate = 0;
2777       return;
2778     }
2779
2780   /* Requires loop versioning tests to handle misalignment.  */
2781   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2782     {
2783       /*  FIXME: Make cost depend on complexity of individual check.  */
2784       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2785       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2786                             vect_prologue);
2787       dump_printf (MSG_NOTE,
2788                    "cost model: Adding cost of checks for loop "
2789                    "versioning to treat misalignment.\n");
2790     }
2791
2792   /* Requires loop versioning with alias checks.  */
2793   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2794     {
2795       /*  FIXME: Make cost depend on complexity of individual check.  */
2796       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2797       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2798                             vect_prologue);
2799       dump_printf (MSG_NOTE,
2800                    "cost model: Adding cost of checks for loop "
2801                    "versioning aliasing.\n");
2802     }
2803
2804   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2805       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2806     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2807                           vect_prologue);
2808
2809   /* Count statements in scalar loop.  Using this as scalar cost for a single
2810      iteration for now.
2811
2812      TODO: Add outer loop support.
2813
2814      TODO: Consider assigning different costs to different scalar
2815      statements.  */
2816
2817   scalar_single_iter_cost = vect_get_single_scalar_iteration_cost (loop_vinfo);
2818
2819   /* Add additional cost for the peeled instructions in prologue and epilogue
2820      loop.
2821
2822      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2823      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2824
2825      TODO: Build an expression that represents peel_iters for prologue and
2826      epilogue to be used in a run-time test.  */
2827
2828   if (npeel  < 0)
2829     {
2830       peel_iters_prologue = vf/2;
2831       dump_printf (MSG_NOTE, "cost model: "
2832                    "prologue peel iters set to vf/2.\n");
2833
2834       /* If peeling for alignment is unknown, loop bound of main loop becomes
2835          unknown.  */
2836       peel_iters_epilogue = vf/2;
2837       dump_printf (MSG_NOTE, "cost model: "
2838                    "epilogue peel iters set to vf/2 because "
2839                    "peeling for alignment is unknown.\n");
2840
2841       /* If peeled iterations are unknown, count a taken branch and a not taken
2842          branch per peeled loop. Even if scalar loop iterations are known,
2843          vector iterations are not known since peeled prologue iterations are
2844          not known. Hence guards remain the same.  */
2845       (void) add_stmt_cost (target_cost_data, 2, cond_branch_taken,
2846                             NULL, 0, vect_prologue);
2847       (void) add_stmt_cost (target_cost_data, 2, cond_branch_not_taken,
2848                             NULL, 0, vect_prologue);
2849       /* FORNOW: Don't attempt to pass individual scalar instructions to
2850          the model; just assume linear cost for scalar iterations.  */
2851       (void) add_stmt_cost (target_cost_data,
2852                             peel_iters_prologue * scalar_single_iter_cost,
2853                             scalar_stmt, NULL, 0, vect_prologue);
2854       (void) add_stmt_cost (target_cost_data,
2855                             peel_iters_epilogue * scalar_single_iter_cost,
2856                             scalar_stmt, NULL, 0, vect_epilogue);
2857     }
2858   else
2859     {
2860       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2861       stmt_info_for_cost *si;
2862       int j;
2863       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2864
2865       prologue_cost_vec.create (2);
2866       epilogue_cost_vec.create (2);
2867       peel_iters_prologue = npeel;
2868
2869       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2870                                           &peel_iters_epilogue,
2871                                           scalar_single_iter_cost,
2872                                           &prologue_cost_vec,
2873                                           &epilogue_cost_vec);
2874
2875       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2876         {
2877           struct _stmt_vec_info *stmt_info
2878             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2879           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2880                                 si->misalign, vect_prologue);
2881         }
2882
2883       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2884         {
2885           struct _stmt_vec_info *stmt_info
2886             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2887           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2888                                 si->misalign, vect_epilogue);
2889         }
2890
2891       prologue_cost_vec.release ();
2892       epilogue_cost_vec.release ();
2893     }
2894
2895   /* FORNOW: The scalar outside cost is incremented in one of the
2896      following ways:
2897
2898      1. The vectorizer checks for alignment and aliasing and generates
2899      a condition that allows dynamic vectorization.  A cost model
2900      check is ANDED with the versioning condition.  Hence scalar code
2901      path now has the added cost of the versioning check.
2902
2903        if (cost > th & versioning_check)
2904          jmp to vector code
2905
2906      Hence run-time scalar is incremented by not-taken branch cost.
2907
2908      2. The vectorizer then checks if a prologue is required.  If the
2909      cost model check was not done before during versioning, it has to
2910      be done before the prologue check.
2911
2912        if (cost <= th)
2913          prologue = scalar_iters
2914        if (prologue == 0)
2915          jmp to vector code
2916        else
2917          execute prologue
2918        if (prologue == num_iters)
2919          go to exit
2920
2921      Hence the run-time scalar cost is incremented by a taken branch,
2922      plus a not-taken branch, plus a taken branch cost.
2923
2924      3. The vectorizer then checks if an epilogue is required.  If the
2925      cost model check was not done before during prologue check, it
2926      has to be done with the epilogue check.
2927
2928        if (prologue == 0)
2929          jmp to vector code
2930        else
2931          execute prologue
2932        if (prologue == num_iters)
2933          go to exit
2934        vector code:
2935          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2936            jmp to epilogue
2937
2938      Hence the run-time scalar cost should be incremented by 2 taken
2939      branches.
2940
2941      TODO: The back end may reorder the BBS's differently and reverse
2942      conditions/branch directions.  Change the estimates below to
2943      something more reasonable.  */
2944
2945   /* If the number of iterations is known and we do not do versioning, we can
2946      decide whether to vectorize at compile time.  Hence the scalar version
2947      do not carry cost model guard costs.  */
2948   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2949       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2950       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2951     {
2952       /* Cost model check occurs at versioning.  */
2953       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2954           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2955         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2956       else
2957         {
2958           /* Cost model check occurs at prologue generation.  */
2959           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2960             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2961               + vect_get_stmt_cost (cond_branch_not_taken);
2962           /* Cost model check occurs at epilogue generation.  */
2963           else
2964             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2965         }
2966     }
2967
2968   /* Complete the target-specific cost calculations.  */
2969   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2970                &vec_inside_cost, &vec_epilogue_cost);
2971
2972   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2973
2974   /* Calculate number of iterations required to make the vector version
2975      profitable, relative to the loop bodies only.  The following condition
2976      must hold true:
2977      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
2978      where
2979      SIC = scalar iteration cost, VIC = vector iteration cost,
2980      VOC = vector outside cost, VF = vectorization factor,
2981      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
2982      SOC = scalar outside cost for run time cost model check.  */
2983
2984   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
2985     {
2986       if (vec_outside_cost <= 0)
2987         min_profitable_iters = 1;
2988       else
2989         {
2990           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
2991                                   - vec_inside_cost * peel_iters_prologue
2992                                   - vec_inside_cost * peel_iters_epilogue)
2993                                  / ((scalar_single_iter_cost * vf)
2994                                     - vec_inside_cost);
2995
2996           if ((scalar_single_iter_cost * vf * min_profitable_iters)
2997               <= (((int) vec_inside_cost * min_profitable_iters)
2998                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
2999             min_profitable_iters++;
3000         }
3001     }
3002   /* vector version will never be profitable.  */
3003   else
3004     {
3005       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3006         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3007                     "did not happen for a simd loop");
3008
3009       if (dump_enabled_p ())
3010         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3011                          "cost model: the vector iteration cost = %d "
3012                          "divided by the scalar iteration cost = %d "
3013                          "is greater or equal to the vectorization factor = %d"
3014                          ".\n",
3015                          vec_inside_cost, scalar_single_iter_cost, vf);
3016       *ret_min_profitable_niters = -1;
3017       *ret_min_profitable_estimate = -1;
3018       return;
3019     }
3020
3021   if (dump_enabled_p ())
3022     {
3023       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3024       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3025                    vec_inside_cost);
3026       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3027                    vec_prologue_cost);
3028       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3029                    vec_epilogue_cost);
3030       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3031                    scalar_single_iter_cost);
3032       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3033                    scalar_outside_cost);
3034       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3035                    vec_outside_cost);
3036       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3037                    peel_iters_prologue);
3038       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3039                    peel_iters_epilogue);
3040       dump_printf (MSG_NOTE,
3041                    "  Calculated minimum iters for profitability: %d\n",
3042                    min_profitable_iters);
3043       dump_printf (MSG_NOTE, "\n");
3044     }
3045
3046   min_profitable_iters =
3047         min_profitable_iters < vf ? vf : min_profitable_iters;
3048
3049   /* Because the condition we create is:
3050      if (niters <= min_profitable_iters)
3051        then skip the vectorized loop.  */
3052   min_profitable_iters--;
3053
3054   if (dump_enabled_p ())
3055     dump_printf_loc (MSG_NOTE, vect_location,
3056                      "  Runtime profitability threshold = %d\n",
3057                      min_profitable_iters);
3058
3059   *ret_min_profitable_niters = min_profitable_iters;
3060
3061   /* Calculate number of iterations required to make the vector version
3062      profitable, relative to the loop bodies only.
3063
3064      Non-vectorized variant is SIC * niters and it must win over vector
3065      variant on the expected loop trip count.  The following condition must hold true:
3066      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3067
3068   if (vec_outside_cost <= 0)
3069     min_profitable_estimate = 1;
3070   else
3071     {
3072       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3073                                  - vec_inside_cost * peel_iters_prologue
3074                                  - vec_inside_cost * peel_iters_epilogue)
3075                                  / ((scalar_single_iter_cost * vf)
3076                                    - vec_inside_cost);
3077     }
3078   min_profitable_estimate --;
3079   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3080   if (dump_enabled_p ())
3081     dump_printf_loc (MSG_NOTE, vect_location,
3082                      "  Static estimate profitability threshold = %d\n",
3083                       min_profitable_iters);
3084
3085   *ret_min_profitable_estimate = min_profitable_estimate;
3086 }
3087
3088 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3089    vector elements (not bits) for a vector of mode MODE.  */
3090 static void
3091 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3092                               unsigned char *sel)
3093 {
3094   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3095
3096   for (i = 0; i < nelt; i++)
3097     sel[i] = (i + offset) & (2*nelt - 1);
3098 }
3099
3100 /* Checks whether the target supports whole-vector shifts for vectors of mode
3101    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3102    it supports vec_perm_const with masks for all necessary shift amounts.  */
3103 static bool
3104 have_whole_vector_shift (enum machine_mode mode)
3105 {
3106   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3107     return true;
3108
3109   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3110     return false;
3111
3112   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3113   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3114
3115   for (i = nelt/2; i >= 1; i/=2)
3116     {
3117       calc_vec_perm_mask_for_shift (mode, i, sel);
3118       if (!can_vec_perm_p (mode, false, sel))
3119         return false;
3120     }
3121   return true;
3122 }
3123
3124 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3125    functions. Design better to avoid maintenance issues.  */
3126
3127 /* Function vect_model_reduction_cost.
3128
3129    Models cost for a reduction operation, including the vector ops
3130    generated within the strip-mine loop, the initial definition before
3131    the loop, and the epilogue code that must be generated.  */
3132
3133 static bool
3134 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3135                            int ncopies)
3136 {
3137   int prologue_cost = 0, epilogue_cost = 0;
3138   enum tree_code code;
3139   optab optab;
3140   tree vectype;
3141   gimple stmt, orig_stmt;
3142   tree reduction_op;
3143   machine_mode mode;
3144   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3145   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3146   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3147
3148   /* Cost of reduction op inside loop.  */
3149   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3150                                         stmt_info, 0, vect_body);
3151   stmt = STMT_VINFO_STMT (stmt_info);
3152
3153   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3154     {
3155     case GIMPLE_SINGLE_RHS:
3156       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt)) == ternary_op);
3157       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), 2);
3158       break;
3159     case GIMPLE_UNARY_RHS:
3160       reduction_op = gimple_assign_rhs1 (stmt);
3161       break;
3162     case GIMPLE_BINARY_RHS:
3163       reduction_op = gimple_assign_rhs2 (stmt);
3164       break;
3165     case GIMPLE_TERNARY_RHS:
3166       reduction_op = gimple_assign_rhs3 (stmt);
3167       break;
3168     default:
3169       gcc_unreachable ();
3170     }
3171
3172   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3173   if (!vectype)
3174     {
3175       if (dump_enabled_p ())
3176         {
3177           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3178                            "unsupported data-type ");
3179           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3180                              TREE_TYPE (reduction_op));
3181           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3182         }
3183       return false;
3184    }
3185
3186   mode = TYPE_MODE (vectype);
3187   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3188
3189   if (!orig_stmt)
3190     orig_stmt = STMT_VINFO_STMT (stmt_info);
3191
3192   code = gimple_assign_rhs_code (orig_stmt);
3193
3194   /* Add in cost for initial definition.  */
3195   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3196                                   stmt_info, 0, vect_prologue);
3197
3198   /* Determine cost of epilogue code.
3199
3200      We have a reduction operator that will reduce the vector in one statement.
3201      Also requires scalar extract.  */
3202
3203   if (!nested_in_vect_loop_p (loop, orig_stmt))
3204     {
3205       if (reduc_code != ERROR_MARK)
3206         {
3207           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3208                                           stmt_info, 0, vect_epilogue);
3209           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3210                                           stmt_info, 0, vect_epilogue);
3211         }
3212       else
3213         {
3214           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3215           tree bitsize =
3216             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3217           int element_bitsize = tree_to_uhwi (bitsize);
3218           int nelements = vec_size_in_bits / element_bitsize;
3219
3220           optab = optab_for_tree_code (code, vectype, optab_default);
3221
3222           /* We have a whole vector shift available.  */
3223           if (VECTOR_MODE_P (mode)
3224               && optab_handler (optab, mode) != CODE_FOR_nothing
3225               && have_whole_vector_shift (mode))
3226             {
3227               /* Final reduction via vector shifts and the reduction operator.
3228                  Also requires scalar extract.  */
3229               epilogue_cost += add_stmt_cost (target_cost_data,
3230                                               exact_log2 (nelements) * 2,
3231                                               vector_stmt, stmt_info, 0,
3232                                               vect_epilogue);
3233               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3234                                               vec_to_scalar, stmt_info, 0,
3235                                               vect_epilogue);
3236             }
3237           else
3238             /* Use extracts and reduction op for final reduction.  For N
3239                elements, we have N extracts and N-1 reduction ops.  */
3240             epilogue_cost += add_stmt_cost (target_cost_data,
3241                                             nelements + nelements - 1,
3242                                             vector_stmt, stmt_info, 0,
3243                                             vect_epilogue);
3244         }
3245     }
3246
3247   if (dump_enabled_p ())
3248     dump_printf (MSG_NOTE,
3249                  "vect_model_reduction_cost: inside_cost = %d, "
3250                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3251                  prologue_cost, epilogue_cost);
3252
3253   return true;
3254 }
3255
3256
3257 /* Function vect_model_induction_cost.
3258
3259    Models cost for induction operations.  */
3260
3261 static void
3262 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3263 {
3264   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3265   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3266   unsigned inside_cost, prologue_cost;
3267
3268   /* loop cost for vec_loop.  */
3269   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3270                                stmt_info, 0, vect_body);
3271
3272   /* prologue cost for vec_init and vec_step.  */
3273   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3274                                  stmt_info, 0, vect_prologue);
3275
3276   if (dump_enabled_p ())
3277     dump_printf_loc (MSG_NOTE, vect_location,
3278                      "vect_model_induction_cost: inside_cost = %d, "
3279                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3280 }
3281
3282
3283 /* Function get_initial_def_for_induction
3284
3285    Input:
3286    STMT - a stmt that performs an induction operation in the loop.
3287    IV_PHI - the initial value of the induction variable
3288
3289    Output:
3290    Return a vector variable, initialized with the first VF values of
3291    the induction variable.  E.g., for an iv with IV_PHI='X' and
3292    evolution S, for a vector of 4 units, we want to return:
3293    [X, X + S, X + 2*S, X + 3*S].  */
3294
3295 static tree
3296 get_initial_def_for_induction (gimple iv_phi)
3297 {
3298   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3299   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3300   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3301   tree vectype;
3302   int nunits;
3303   edge pe = loop_preheader_edge (loop);
3304   struct loop *iv_loop;
3305   basic_block new_bb;
3306   tree new_vec, vec_init, vec_step, t;
3307   tree new_var;
3308   tree new_name;
3309   gimple init_stmt, new_stmt;
3310   gphi *induction_phi;
3311   tree induc_def, vec_def, vec_dest;
3312   tree init_expr, step_expr;
3313   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3314   int i;
3315   int ncopies;
3316   tree expr;
3317   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3318   bool nested_in_vect_loop = false;
3319   gimple_seq stmts = NULL;
3320   imm_use_iterator imm_iter;
3321   use_operand_p use_p;
3322   gimple exit_phi;
3323   edge latch_e;
3324   tree loop_arg;
3325   gimple_stmt_iterator si;
3326   basic_block bb = gimple_bb (iv_phi);
3327   tree stepvectype;
3328   tree resvectype;
3329
3330   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3331   if (nested_in_vect_loop_p (loop, iv_phi))
3332     {
3333       nested_in_vect_loop = true;
3334       iv_loop = loop->inner;
3335     }
3336   else
3337     iv_loop = loop;
3338   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3339
3340   latch_e = loop_latch_edge (iv_loop);
3341   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3342
3343   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3344   gcc_assert (step_expr != NULL_TREE);
3345
3346   pe = loop_preheader_edge (iv_loop);
3347   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3348                                      loop_preheader_edge (iv_loop));
3349
3350   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3351   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3352   gcc_assert (vectype);
3353   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3354   ncopies = vf / nunits;
3355
3356   gcc_assert (phi_info);
3357   gcc_assert (ncopies >= 1);
3358
3359   /* Convert the step to the desired type.  */
3360   step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3361                                                   step_expr),
3362                                     &stmts, true, NULL_TREE);
3363   if (stmts)
3364     {
3365       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3366       gcc_assert (!new_bb);
3367     }
3368
3369   /* Find the first insertion point in the BB.  */
3370   si = gsi_after_labels (bb);
3371
3372   /* Create the vector that holds the initial_value of the induction.  */
3373   if (nested_in_vect_loop)
3374     {
3375       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3376          been created during vectorization of previous stmts.  We obtain it
3377          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3378       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3379       /* If the initial value is not of proper type, convert it.  */
3380       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3381         {
3382           new_stmt
3383             = gimple_build_assign (vect_get_new_vect_var (vectype,
3384                                                           vect_simple_var,
3385                                                           "vec_iv_"),
3386                                    VIEW_CONVERT_EXPR,
3387                                    build1 (VIEW_CONVERT_EXPR, vectype,
3388                                            vec_init));
3389           vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3390           gimple_assign_set_lhs (new_stmt, vec_init);
3391           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3392                                                  new_stmt);
3393           gcc_assert (!new_bb);
3394           set_vinfo_for_stmt (new_stmt,
3395                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3396         }
3397     }
3398   else
3399     {
3400       vec<constructor_elt, va_gc> *v;
3401
3402       /* iv_loop is the loop to be vectorized. Create:
3403          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3404       new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3405                                        vect_scalar_var, "var_");
3406       new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3407                                                      init_expr),
3408                                        &stmts, false, new_var);
3409       if (stmts)
3410         {
3411           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3412           gcc_assert (!new_bb);
3413         }
3414
3415       vec_alloc (v, nunits);
3416       bool constant_p = is_gimple_min_invariant (new_name);
3417       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3418       for (i = 1; i < nunits; i++)
3419         {
3420           /* Create: new_name_i = new_name + step_expr  */
3421           new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3422                                   new_name, step_expr);
3423           if (!is_gimple_min_invariant (new_name))
3424             {
3425               init_stmt = gimple_build_assign (new_var, new_name);
3426               new_name = make_ssa_name (new_var, init_stmt);
3427               gimple_assign_set_lhs (init_stmt, new_name);
3428               new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3429               gcc_assert (!new_bb);
3430               if (dump_enabled_p ())
3431                 {
3432                   dump_printf_loc (MSG_NOTE, vect_location,
3433                                    "created new init_stmt: ");
3434                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3435                   dump_printf (MSG_NOTE, "\n");
3436                 }
3437               constant_p = false;
3438             }
3439           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3440         }
3441       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3442       if (constant_p)
3443         new_vec = build_vector_from_ctor (vectype, v);
3444       else
3445         new_vec = build_constructor (vectype, v);
3446       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3447     }
3448
3449
3450   /* Create the vector that holds the step of the induction.  */
3451   if (nested_in_vect_loop)
3452     /* iv_loop is nested in the loop to be vectorized. Generate:
3453        vec_step = [S, S, S, S]  */
3454     new_name = step_expr;
3455   else
3456     {
3457       /* iv_loop is the loop to be vectorized. Generate:
3458           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3459       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3460         {
3461           expr = build_int_cst (integer_type_node, vf);
3462           expr = fold_convert (TREE_TYPE (step_expr), expr);
3463         }
3464       else
3465         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3466       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3467                               expr, step_expr);
3468       if (TREE_CODE (step_expr) == SSA_NAME)
3469         new_name = vect_init_vector (iv_phi, new_name,
3470                                      TREE_TYPE (step_expr), NULL);
3471     }
3472
3473   t = unshare_expr (new_name);
3474   gcc_assert (CONSTANT_CLASS_P (new_name)
3475               || TREE_CODE (new_name) == SSA_NAME);
3476   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3477   gcc_assert (stepvectype);
3478   new_vec = build_vector_from_val (stepvectype, t);
3479   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3480
3481
3482   /* Create the following def-use cycle:
3483      loop prolog:
3484          vec_init = ...
3485          vec_step = ...
3486      loop:
3487          vec_iv = PHI <vec_init, vec_loop>
3488          ...
3489          STMT
3490          ...
3491          vec_loop = vec_iv + vec_step;  */
3492
3493   /* Create the induction-phi that defines the induction-operand.  */
3494   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3495   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3496   set_vinfo_for_stmt (induction_phi,
3497                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3498   induc_def = PHI_RESULT (induction_phi);
3499
3500   /* Create the iv update inside the loop  */
3501   new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3502   vec_def = make_ssa_name (vec_dest, new_stmt);
3503   gimple_assign_set_lhs (new_stmt, vec_def);
3504   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3505   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3506                                                    NULL));
3507
3508   /* Set the arguments of the phi node:  */
3509   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3510   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3511                UNKNOWN_LOCATION);
3512
3513
3514   /* In case that vectorization factor (VF) is bigger than the number
3515      of elements that we can fit in a vectype (nunits), we have to generate
3516      more than one vector stmt - i.e - we need to "unroll" the
3517      vector stmt by a factor VF/nunits.  For more details see documentation
3518      in vectorizable_operation.  */
3519
3520   if (ncopies > 1)
3521     {
3522       stmt_vec_info prev_stmt_vinfo;
3523       /* FORNOW. This restriction should be relaxed.  */
3524       gcc_assert (!nested_in_vect_loop);
3525
3526       /* Create the vector that holds the step of the induction.  */
3527       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3528         {
3529           expr = build_int_cst (integer_type_node, nunits);
3530           expr = fold_convert (TREE_TYPE (step_expr), expr);
3531         }
3532       else
3533         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3534       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3535                               expr, step_expr);
3536       if (TREE_CODE (step_expr) == SSA_NAME)
3537         new_name = vect_init_vector (iv_phi, new_name,
3538                                      TREE_TYPE (step_expr), NULL);
3539       t = unshare_expr (new_name);
3540       gcc_assert (CONSTANT_CLASS_P (new_name)
3541                   || TREE_CODE (new_name) == SSA_NAME);
3542       new_vec = build_vector_from_val (stepvectype, t);
3543       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3544
3545       vec_def = induc_def;
3546       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3547       for (i = 1; i < ncopies; i++)
3548         {
3549           /* vec_i = vec_prev + vec_step  */
3550           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3551                                           vec_def, vec_step);
3552           vec_def = make_ssa_name (vec_dest, new_stmt);
3553           gimple_assign_set_lhs (new_stmt, vec_def);
3554
3555           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3556           if (!useless_type_conversion_p (resvectype, vectype))
3557             {
3558               new_stmt
3559                 = gimple_build_assign
3560                         (vect_get_new_vect_var (resvectype, vect_simple_var,
3561                                                 "vec_iv_"),
3562                          VIEW_CONVERT_EXPR,
3563                          build1 (VIEW_CONVERT_EXPR, resvectype,
3564                                  gimple_assign_lhs (new_stmt)));
3565               gimple_assign_set_lhs (new_stmt,
3566                                      make_ssa_name
3567                                        (gimple_assign_lhs (new_stmt), new_stmt));
3568               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3569             }
3570           set_vinfo_for_stmt (new_stmt,
3571                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3572           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3573           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3574         }
3575     }
3576
3577   if (nested_in_vect_loop)
3578     {
3579       /* Find the loop-closed exit-phi of the induction, and record
3580          the final vector of induction results:  */
3581       exit_phi = NULL;
3582       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3583         {
3584           gimple use_stmt = USE_STMT (use_p);
3585           if (is_gimple_debug (use_stmt))
3586             continue;
3587
3588           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3589             {
3590               exit_phi = use_stmt;
3591               break;
3592             }
3593         }
3594       if (exit_phi)
3595         {
3596           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3597           /* FORNOW. Currently not supporting the case that an inner-loop induction
3598              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3599           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3600                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3601
3602           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3603           if (dump_enabled_p ())
3604             {
3605               dump_printf_loc (MSG_NOTE, vect_location,
3606                                "vector of inductions after inner-loop:");
3607               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3608               dump_printf (MSG_NOTE, "\n");
3609             }
3610         }
3611     }
3612
3613
3614   if (dump_enabled_p ())
3615     {
3616       dump_printf_loc (MSG_NOTE, vect_location,
3617                        "transform induction: created def-use cycle: ");
3618       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3619       dump_printf (MSG_NOTE, "\n");
3620       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3621                         SSA_NAME_DEF_STMT (vec_def), 0);
3622       dump_printf (MSG_NOTE, "\n");
3623     }
3624
3625   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3626   if (!useless_type_conversion_p (resvectype, vectype))
3627     {
3628       new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3629                                                              vect_simple_var,
3630                                                              "vec_iv_"),
3631                                       VIEW_CONVERT_EXPR,
3632                                       build1 (VIEW_CONVERT_EXPR, resvectype,
3633                                               induc_def));
3634       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3635       gimple_assign_set_lhs (new_stmt, induc_def);
3636       si = gsi_after_labels (bb);
3637       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3638       set_vinfo_for_stmt (new_stmt,
3639                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3640       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3641         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3642     }
3643
3644   return induc_def;
3645 }
3646
3647
3648 /* Function get_initial_def_for_reduction
3649
3650    Input:
3651    STMT - a stmt that performs a reduction operation in the loop.
3652    INIT_VAL - the initial value of the reduction variable
3653
3654    Output:
3655    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3656         of the reduction (used for adjusting the epilog - see below).
3657    Return a vector variable, initialized according to the operation that STMT
3658         performs. This vector will be used as the initial value of the
3659         vector of partial results.
3660
3661    Option1 (adjust in epilog): Initialize the vector as follows:
3662      add/bit or/xor:    [0,0,...,0,0]
3663      mult/bit and:      [1,1,...,1,1]
3664      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3665    and when necessary (e.g. add/mult case) let the caller know
3666    that it needs to adjust the result by init_val.
3667
3668    Option2: Initialize the vector as follows:
3669      add/bit or/xor:    [init_val,0,0,...,0]
3670      mult/bit and:      [init_val,1,1,...,1]
3671      min/max/cond_expr: [init_val,init_val,...,init_val]
3672    and no adjustments are needed.
3673
3674    For example, for the following code:
3675
3676    s = init_val;
3677    for (i=0;i<n;i++)
3678      s = s + a[i];
3679
3680    STMT is 's = s + a[i]', and the reduction variable is 's'.
3681    For a vector of 4 units, we want to return either [0,0,0,init_val],
3682    or [0,0,0,0] and let the caller know that it needs to adjust
3683    the result at the end by 'init_val'.
3684
3685    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3686    initialization vector is simpler (same element in all entries), if
3687    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3688
3689    A cost model should help decide between these two schemes.  */
3690
3691 tree
3692 get_initial_def_for_reduction (gimple stmt, tree init_val,
3693                                tree *adjustment_def)
3694 {
3695   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3696   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3697   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3698   tree scalar_type = TREE_TYPE (init_val);
3699   tree vectype = get_vectype_for_scalar_type (scalar_type);
3700   int nunits;
3701   enum tree_code code = gimple_assign_rhs_code (stmt);
3702   tree def_for_init;
3703   tree init_def;
3704   tree *elts;
3705   int i;
3706   bool nested_in_vect_loop = false;
3707   tree init_value;
3708   REAL_VALUE_TYPE real_init_val = dconst0;
3709   int int_init_val = 0;
3710   gimple def_stmt = NULL;
3711
3712   gcc_assert (vectype);
3713   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3714
3715   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3716               || SCALAR_FLOAT_TYPE_P (scalar_type));
3717
3718   if (nested_in_vect_loop_p (loop, stmt))
3719     nested_in_vect_loop = true;
3720   else
3721     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3722
3723   /* In case of double reduction we only create a vector variable to be put
3724      in the reduction phi node.  The actual statement creation is done in
3725      vect_create_epilog_for_reduction.  */
3726   if (adjustment_def && nested_in_vect_loop
3727       && TREE_CODE (init_val) == SSA_NAME
3728       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3729       && gimple_code (def_stmt) == GIMPLE_PHI
3730       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3731       && vinfo_for_stmt (def_stmt)
3732       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3733           == vect_double_reduction_def)
3734     {
3735       *adjustment_def = NULL;
3736       return vect_create_destination_var (init_val, vectype);
3737     }
3738
3739   if (TREE_CONSTANT (init_val))
3740     {
3741       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3742         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3743       else
3744         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3745     }
3746   else
3747     init_value = init_val;
3748
3749   switch (code)
3750     {
3751       case WIDEN_SUM_EXPR:
3752       case DOT_PROD_EXPR:
3753       case SAD_EXPR:
3754       case PLUS_EXPR:
3755       case MINUS_EXPR:
3756       case BIT_IOR_EXPR:
3757       case BIT_XOR_EXPR:
3758       case MULT_EXPR:
3759       case BIT_AND_EXPR:
3760         /* ADJUSMENT_DEF is NULL when called from
3761            vect_create_epilog_for_reduction to vectorize double reduction.  */
3762         if (adjustment_def)
3763           {
3764             if (nested_in_vect_loop)
3765               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3766                                                               NULL);
3767             else
3768               *adjustment_def = init_val;
3769           }
3770
3771         if (code == MULT_EXPR)
3772           {
3773             real_init_val = dconst1;
3774             int_init_val = 1;
3775           }
3776
3777         if (code == BIT_AND_EXPR)
3778           int_init_val = -1;
3779
3780         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3781           def_for_init = build_real (scalar_type, real_init_val);
3782         else
3783           def_for_init = build_int_cst (scalar_type, int_init_val);
3784
3785         /* Create a vector of '0' or '1' except the first element.  */
3786         elts = XALLOCAVEC (tree, nunits);
3787         for (i = nunits - 2; i >= 0; --i)
3788           elts[i + 1] = def_for_init;
3789
3790         /* Option1: the first element is '0' or '1' as well.  */
3791         if (adjustment_def)
3792           {
3793             elts[0] = def_for_init;
3794             init_def = build_vector (vectype, elts);
3795             break;
3796           }
3797
3798         /* Option2: the first element is INIT_VAL.  */
3799         elts[0] = init_val;
3800         if (TREE_CONSTANT (init_val))
3801           init_def = build_vector (vectype, elts);
3802         else
3803           {
3804             vec<constructor_elt, va_gc> *v;
3805             vec_alloc (v, nunits);
3806             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3807             for (i = 1; i < nunits; ++i)
3808               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3809             init_def = build_constructor (vectype, v);
3810           }
3811
3812         break;
3813
3814       case MIN_EXPR:
3815       case MAX_EXPR:
3816       case COND_EXPR:
3817         if (adjustment_def)
3818           {
3819             *adjustment_def = NULL_TREE;
3820             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3821             break;
3822           }
3823
3824         init_def = build_vector_from_val (vectype, init_value);
3825         break;
3826
3827       default:
3828         gcc_unreachable ();
3829     }
3830
3831   return init_def;
3832 }
3833
3834 /* Function vect_create_epilog_for_reduction
3835
3836    Create code at the loop-epilog to finalize the result of a reduction
3837    computation.
3838
3839    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3840      reduction statements.
3841    STMT is the scalar reduction stmt that is being vectorized.
3842    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3843      number of elements that we can fit in a vectype (nunits).  In this case
3844      we have to generate more than one vector stmt - i.e - we need to "unroll"
3845      the vector stmt by a factor VF/nunits.  For more details see documentation
3846      in vectorizable_operation.
3847    REDUC_CODE is the tree-code for the epilog reduction.
3848    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3849      computation.
3850    REDUC_INDEX is the index of the operand in the right hand side of the
3851      statement that is defined by REDUCTION_PHI.
3852    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3853    SLP_NODE is an SLP node containing a group of reduction statements. The
3854      first one in this group is STMT.
3855
3856    This function:
3857    1. Creates the reduction def-use cycles: sets the arguments for
3858       REDUCTION_PHIS:
3859       The loop-entry argument is the vectorized initial-value of the reduction.
3860       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3861       sums.
3862    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3863       by applying the operation specified by REDUC_CODE if available, or by
3864       other means (whole-vector shifts or a scalar loop).
3865       The function also creates a new phi node at the loop exit to preserve
3866       loop-closed form, as illustrated below.
3867
3868      The flow at the entry to this function:
3869
3870         loop:
3871           vec_def = phi <null, null>            # REDUCTION_PHI
3872           VECT_DEF = vector_stmt                # vectorized form of STMT
3873           s_loop = scalar_stmt                  # (scalar) STMT
3874         loop_exit:
3875           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3876           use <s_out0>
3877           use <s_out0>
3878
3879      The above is transformed by this function into:
3880
3881         loop:
3882           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3883           VECT_DEF = vector_stmt                # vectorized form of STMT
3884           s_loop = scalar_stmt                  # (scalar) STMT
3885         loop_exit:
3886           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3887           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3888           v_out2 = reduce <v_out1>
3889           s_out3 = extract_field <v_out2, 0>
3890           s_out4 = adjust_result <s_out3>
3891           use <s_out4>
3892           use <s_out4>
3893 */
3894
3895 static void
3896 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
3897                                   int ncopies, enum tree_code reduc_code,
3898                                   vec<gimple> reduction_phis,
3899                                   int reduc_index, bool double_reduc,
3900                                   slp_tree slp_node)
3901 {
3902   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3903   stmt_vec_info prev_phi_info;
3904   tree vectype;
3905   machine_mode mode;
3906   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3907   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3908   basic_block exit_bb;
3909   tree scalar_dest;
3910   tree scalar_type;
3911   gimple new_phi = NULL, phi;
3912   gimple_stmt_iterator exit_gsi;
3913   tree vec_dest;
3914   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3915   gimple epilog_stmt = NULL;
3916   enum tree_code code = gimple_assign_rhs_code (stmt);
3917   gimple exit_phi;
3918   tree bitsize;
3919   tree adjustment_def = NULL;
3920   tree vec_initial_def = NULL;
3921   tree reduction_op, expr, def;
3922   tree orig_name, scalar_result;
3923   imm_use_iterator imm_iter, phi_imm_iter;
3924   use_operand_p use_p, phi_use_p;
3925   gimple use_stmt, orig_stmt, reduction_phi = NULL;
3926   bool nested_in_vect_loop = false;
3927   auto_vec<gimple> new_phis;
3928   auto_vec<gimple> inner_phis;
3929   enum vect_def_type dt = vect_unknown_def_type;
3930   int j, i;
3931   auto_vec<tree> scalar_results;
3932   unsigned int group_size = 1, k, ratio;
3933   auto_vec<tree> vec_initial_defs;
3934   auto_vec<gimple> phis;
3935   bool slp_reduc = false;
3936   tree new_phi_result;
3937   gimple inner_phi = NULL;
3938
3939   if (slp_node)
3940     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3941
3942   if (nested_in_vect_loop_p (loop, stmt))
3943     {
3944       outer_loop = loop;
3945       loop = loop->inner;
3946       nested_in_vect_loop = true;
3947       gcc_assert (!slp_node);
3948     }
3949
3950   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3951     {
3952     case GIMPLE_SINGLE_RHS:
3953       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3954                   == ternary_op);
3955       reduction_op = TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3956       break;
3957     case GIMPLE_UNARY_RHS:
3958       reduction_op = gimple_assign_rhs1 (stmt);
3959       break;
3960     case GIMPLE_BINARY_RHS:
3961       reduction_op = reduc_index ?
3962                      gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt);
3963       break;
3964     case GIMPLE_TERNARY_RHS:
3965       reduction_op = gimple_op (stmt, reduc_index + 1);
3966       break;
3967     default:
3968       gcc_unreachable ();
3969     }
3970
3971   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3972   gcc_assert (vectype);
3973   mode = TYPE_MODE (vectype);
3974
3975   /* 1. Create the reduction def-use cycle:
3976      Set the arguments of REDUCTION_PHIS, i.e., transform
3977
3978         loop:
3979           vec_def = phi <null, null>            # REDUCTION_PHI
3980           VECT_DEF = vector_stmt                # vectorized form of STMT
3981           ...
3982
3983      into:
3984
3985         loop:
3986           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3987           VECT_DEF = vector_stmt                # vectorized form of STMT
3988           ...
3989
3990      (in case of SLP, do it for all the phis). */
3991
3992   /* Get the loop-entry arguments.  */
3993   if (slp_node)
3994     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3995                        NULL, slp_node, reduc_index);
3996   else
3997     {
3998       vec_initial_defs.create (1);
3999      /* For the case of reduction, vect_get_vec_def_for_operand returns
4000         the scalar def before the loop, that defines the initial value
4001         of the reduction variable.  */
4002       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4003                                                       &adjustment_def);
4004       vec_initial_defs.quick_push (vec_initial_def);
4005     }
4006
4007   /* Set phi nodes arguments.  */
4008   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4009     {
4010       tree vec_init_def, def;
4011       gimple_seq stmts;
4012       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4013                                            true, NULL_TREE);
4014       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4015       def = vect_defs[i];
4016       for (j = 0; j < ncopies; j++)
4017         {
4018           /* Set the loop-entry arg of the reduction-phi.  */
4019           add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4020                        loop_preheader_edge (loop), UNKNOWN_LOCATION);
4021
4022           /* Set the loop-latch arg for the reduction-phi.  */
4023           if (j > 0)
4024             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4025
4026           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4027                        UNKNOWN_LOCATION);
4028
4029           if (dump_enabled_p ())
4030             {
4031               dump_printf_loc (MSG_NOTE, vect_location,
4032                                "transform reduction: created def-use cycle: ");
4033               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4034               dump_printf (MSG_NOTE, "\n");
4035               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4036               dump_printf (MSG_NOTE, "\n");
4037             }
4038
4039           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4040         }
4041     }
4042
4043   /* 2. Create epilog code.
4044         The reduction epilog code operates across the elements of the vector
4045         of partial results computed by the vectorized loop.
4046         The reduction epilog code consists of:
4047
4048         step 1: compute the scalar result in a vector (v_out2)
4049         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4050         step 3: adjust the scalar result (s_out3) if needed.
4051
4052         Step 1 can be accomplished using one the following three schemes:
4053           (scheme 1) using reduc_code, if available.
4054           (scheme 2) using whole-vector shifts, if available.
4055           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4056                      combined.
4057
4058           The overall epilog code looks like this:
4059
4060           s_out0 = phi <s_loop>         # original EXIT_PHI
4061           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4062           v_out2 = reduce <v_out1>              # step 1
4063           s_out3 = extract_field <v_out2, 0>    # step 2
4064           s_out4 = adjust_result <s_out3>       # step 3
4065
4066           (step 3 is optional, and steps 1 and 2 may be combined).
4067           Lastly, the uses of s_out0 are replaced by s_out4.  */
4068
4069
4070   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4071          v_out1 = phi <VECT_DEF>
4072          Store them in NEW_PHIS.  */
4073
4074   exit_bb = single_exit (loop)->dest;
4075   prev_phi_info = NULL;
4076   new_phis.create (vect_defs.length ());
4077   FOR_EACH_VEC_ELT (vect_defs, i, def)
4078     {
4079       for (j = 0; j < ncopies; j++)
4080         {
4081           tree new_def = copy_ssa_name (def);
4082           phi = create_phi_node (new_def, exit_bb);
4083           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4084           if (j == 0)
4085             new_phis.quick_push (phi);
4086           else
4087             {
4088               def = vect_get_vec_def_for_stmt_copy (dt, def);
4089               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4090             }
4091
4092           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4093           prev_phi_info = vinfo_for_stmt (phi);
4094         }
4095     }
4096
4097   /* The epilogue is created for the outer-loop, i.e., for the loop being
4098      vectorized.  Create exit phis for the outer loop.  */
4099   if (double_reduc)
4100     {
4101       loop = outer_loop;
4102       exit_bb = single_exit (loop)->dest;
4103       inner_phis.create (vect_defs.length ());
4104       FOR_EACH_VEC_ELT (new_phis, i, phi)
4105         {
4106           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4107           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4108           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4109                            PHI_RESULT (phi));
4110           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4111                                                             loop_vinfo, NULL));
4112           inner_phis.quick_push (phi);
4113           new_phis[i] = outer_phi;
4114           prev_phi_info = vinfo_for_stmt (outer_phi);
4115           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4116             {
4117               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4118               new_result = copy_ssa_name (PHI_RESULT (phi));
4119               outer_phi = create_phi_node (new_result, exit_bb);
4120               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4121                                PHI_RESULT (phi));
4122               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4123                                                         loop_vinfo, NULL));
4124               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4125               prev_phi_info = vinfo_for_stmt (outer_phi);
4126             }
4127         }
4128     }
4129
4130   exit_gsi = gsi_after_labels (exit_bb);
4131
4132   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4133          (i.e. when reduc_code is not available) and in the final adjustment
4134          code (if needed).  Also get the original scalar reduction variable as
4135          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4136          represents a reduction pattern), the tree-code and scalar-def are
4137          taken from the original stmt that the pattern-stmt (STMT) replaces.
4138          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4139          are taken from STMT.  */
4140
4141   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4142   if (!orig_stmt)
4143     {
4144       /* Regular reduction  */
4145       orig_stmt = stmt;
4146     }
4147   else
4148     {
4149       /* Reduction pattern  */
4150       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4151       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4152       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4153     }
4154
4155   code = gimple_assign_rhs_code (orig_stmt);
4156   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4157      partial results are added and not subtracted.  */
4158   if (code == MINUS_EXPR)
4159     code = PLUS_EXPR;
4160
4161   scalar_dest = gimple_assign_lhs (orig_stmt);
4162   scalar_type = TREE_TYPE (scalar_dest);
4163   scalar_results.create (group_size);
4164   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4165   bitsize = TYPE_SIZE (scalar_type);
4166
4167   /* In case this is a reduction in an inner-loop while vectorizing an outer
4168      loop - we don't need to extract a single scalar result at the end of the
4169      inner-loop (unless it is double reduction, i.e., the use of reduction is
4170      outside the outer-loop).  The final vector of partial results will be used
4171      in the vectorized outer-loop, or reduced to a scalar result at the end of
4172      the outer-loop.  */
4173   if (nested_in_vect_loop && !double_reduc)
4174     goto vect_finalize_reduction;
4175
4176   /* SLP reduction without reduction chain, e.g.,
4177      # a1 = phi <a2, a0>
4178      # b1 = phi <b2, b0>
4179      a2 = operation (a1)
4180      b2 = operation (b1)  */
4181   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4182
4183   /* In case of reduction chain, e.g.,
4184      # a1 = phi <a3, a0>
4185      a2 = operation (a1)
4186      a3 = operation (a2),
4187
4188      we may end up with more than one vector result.  Here we reduce them to
4189      one vector.  */
4190   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4191     {
4192       tree first_vect = PHI_RESULT (new_phis[0]);
4193       tree tmp;
4194       gassign *new_vec_stmt = NULL;
4195
4196       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4197       for (k = 1; k < new_phis.length (); k++)
4198         {
4199           gimple next_phi = new_phis[k];
4200           tree second_vect = PHI_RESULT (next_phi);
4201
4202           tmp = build2 (code, vectype,  first_vect, second_vect);
4203           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4204           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4205           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4206           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4207         }
4208
4209       new_phi_result = first_vect;
4210       if (new_vec_stmt)
4211         {
4212           new_phis.truncate (0);
4213           new_phis.safe_push (new_vec_stmt);
4214         }
4215     }
4216   else
4217     new_phi_result = PHI_RESULT (new_phis[0]);
4218
4219   /* 2.3 Create the reduction code, using one of the three schemes described
4220          above. In SLP we simply need to extract all the elements from the
4221          vector (without reducing them), so we use scalar shifts.  */
4222   if (reduc_code != ERROR_MARK && !slp_reduc)
4223     {
4224       tree tmp;
4225       tree vec_elem_type;
4226
4227       /*** Case 1:  Create:
4228            v_out2 = reduc_expr <v_out1>  */
4229
4230       if (dump_enabled_p ())
4231         dump_printf_loc (MSG_NOTE, vect_location,
4232                          "Reduce using direct vector reduction.\n");
4233
4234       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4235       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4236         {
4237           tree tmp_dest =
4238               vect_create_destination_var (scalar_dest, vec_elem_type);
4239           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4240           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4241           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4242           gimple_assign_set_lhs (epilog_stmt, new_temp);
4243           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4244
4245           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4246         }
4247       else
4248         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4249       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4250       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4251       gimple_assign_set_lhs (epilog_stmt, new_temp);
4252       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4253       scalar_results.safe_push (new_temp);
4254     }
4255   else
4256     {
4257       bool reduce_with_shift = have_whole_vector_shift (mode);
4258       int element_bitsize = tree_to_uhwi (bitsize);
4259       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4260       tree vec_temp;
4261
4262       /* Regardless of whether we have a whole vector shift, if we're
4263          emulating the operation via tree-vect-generic, we don't want
4264          to use it.  Only the first round of the reduction is likely
4265          to still be profitable via emulation.  */
4266       /* ??? It might be better to emit a reduction tree code here, so that
4267          tree-vect-generic can expand the first round via bit tricks.  */
4268       if (!VECTOR_MODE_P (mode))
4269         reduce_with_shift = false;
4270       else
4271         {
4272           optab optab = optab_for_tree_code (code, vectype, optab_default);
4273           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4274             reduce_with_shift = false;
4275         }
4276
4277       if (reduce_with_shift && !slp_reduc)
4278         {
4279           int nelements = vec_size_in_bits / element_bitsize;
4280           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4281
4282           int elt_offset;
4283
4284           tree zero_vec = build_zero_cst (vectype);
4285           /*** Case 2: Create:
4286              for (offset = nelements/2; offset >= 1; offset/=2)
4287                 {
4288                   Create:  va' = vec_shift <va, offset>
4289                   Create:  va = vop <va, va'>
4290                 }  */
4291
4292           tree rhs;
4293
4294           if (dump_enabled_p ())
4295             dump_printf_loc (MSG_NOTE, vect_location,
4296                              "Reduce using vector shifts\n");
4297
4298           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4299           new_temp = new_phi_result;
4300           for (elt_offset = nelements / 2;
4301                elt_offset >= 1;
4302                elt_offset /= 2)
4303             {
4304               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4305               tree mask = vect_gen_perm_mask_any (vectype, sel);
4306               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4307                                                  new_temp, zero_vec, mask);
4308               new_name = make_ssa_name (vec_dest, epilog_stmt);
4309               gimple_assign_set_lhs (epilog_stmt, new_name);
4310               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4311
4312               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4313                                                  new_temp);
4314               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4315               gimple_assign_set_lhs (epilog_stmt, new_temp);
4316               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4317             }
4318
4319           /* 2.4  Extract the final scalar result.  Create:
4320              s_out3 = extract_field <v_out2, bitpos>  */
4321
4322           if (dump_enabled_p ())
4323             dump_printf_loc (MSG_NOTE, vect_location,
4324                              "extract scalar result\n");
4325
4326           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4327                         bitsize, bitsize_zero_node);
4328           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4329           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4330           gimple_assign_set_lhs (epilog_stmt, new_temp);
4331           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4332           scalar_results.safe_push (new_temp);
4333         }
4334       else
4335         {
4336           /*** Case 3: Create:
4337              s = extract_field <v_out2, 0>
4338              for (offset = element_size;
4339                   offset < vector_size;
4340                   offset += element_size;)
4341                {
4342                  Create:  s' = extract_field <v_out2, offset>
4343                  Create:  s = op <s, s'>  // For non SLP cases
4344                }  */
4345
4346           if (dump_enabled_p ())
4347             dump_printf_loc (MSG_NOTE, vect_location,
4348                              "Reduce using scalar code.\n");
4349
4350           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4351           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4352             {
4353               int bit_offset;
4354               if (gimple_code (new_phi) == GIMPLE_PHI)
4355                 vec_temp = PHI_RESULT (new_phi);
4356               else
4357                 vec_temp = gimple_assign_lhs (new_phi);
4358               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4359                             bitsize_zero_node);
4360               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4361               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4362               gimple_assign_set_lhs (epilog_stmt, new_temp);
4363               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4364
4365               /* In SLP we don't need to apply reduction operation, so we just
4366                  collect s' values in SCALAR_RESULTS.  */
4367               if (slp_reduc)
4368                 scalar_results.safe_push (new_temp);
4369
4370               for (bit_offset = element_bitsize;
4371                    bit_offset < vec_size_in_bits;
4372                    bit_offset += element_bitsize)
4373                 {
4374                   tree bitpos = bitsize_int (bit_offset);
4375                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4376                                      bitsize, bitpos);
4377
4378                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4379                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4380                   gimple_assign_set_lhs (epilog_stmt, new_name);
4381                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4382
4383                   if (slp_reduc)
4384                     {
4385                       /* In SLP we don't need to apply reduction operation, so
4386                          we just collect s' values in SCALAR_RESULTS.  */
4387                       new_temp = new_name;
4388                       scalar_results.safe_push (new_name);
4389                     }
4390                   else
4391                     {
4392                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4393                                                          new_name, new_temp);
4394                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4395                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4396                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4397                     }
4398                 }
4399             }
4400
4401           /* The only case where we need to reduce scalar results in SLP, is
4402              unrolling.  If the size of SCALAR_RESULTS is greater than
4403              GROUP_SIZE, we reduce them combining elements modulo
4404              GROUP_SIZE.  */
4405           if (slp_reduc)
4406             {
4407               tree res, first_res, new_res;
4408               gimple new_stmt;
4409
4410               /* Reduce multiple scalar results in case of SLP unrolling.  */
4411               for (j = group_size; scalar_results.iterate (j, &res);
4412                    j++)
4413                 {
4414                   first_res = scalar_results[j % group_size];
4415                   new_stmt = gimple_build_assign (new_scalar_dest, code,
4416                                                   first_res, res);
4417                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4418                   gimple_assign_set_lhs (new_stmt, new_res);
4419                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4420                   scalar_results[j % group_size] = new_res;
4421                 }
4422             }
4423           else
4424             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4425             scalar_results.safe_push (new_temp);
4426         }
4427     }
4428
4429 vect_finalize_reduction:
4430
4431   if (double_reduc)
4432     loop = loop->inner;
4433
4434   /* 2.5 Adjust the final result by the initial value of the reduction
4435          variable. (When such adjustment is not needed, then
4436          'adjustment_def' is zero).  For example, if code is PLUS we create:
4437          new_temp = loop_exit_def + adjustment_def  */
4438
4439   if (adjustment_def)
4440     {
4441       gcc_assert (!slp_reduc);
4442       if (nested_in_vect_loop)
4443         {
4444           new_phi = new_phis[0];
4445           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4446           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4447           new_dest = vect_create_destination_var (scalar_dest, vectype);
4448         }
4449       else
4450         {
4451           new_temp = scalar_results[0];
4452           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4453           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4454           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4455         }
4456
4457       epilog_stmt = gimple_build_assign (new_dest, expr);
4458       new_temp = make_ssa_name (new_dest, epilog_stmt);
4459       gimple_assign_set_lhs (epilog_stmt, new_temp);
4460       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4461       if (nested_in_vect_loop)
4462         {
4463           set_vinfo_for_stmt (epilog_stmt,
4464                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4465                                                  NULL));
4466           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4467                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4468
4469           if (!double_reduc)
4470             scalar_results.quick_push (new_temp);
4471           else
4472             scalar_results[0] = new_temp;
4473         }
4474       else
4475         scalar_results[0] = new_temp;
4476
4477       new_phis[0] = epilog_stmt;
4478     }
4479
4480   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4481           phis with new adjusted scalar results, i.e., replace use <s_out0>
4482           with use <s_out4>.
4483
4484      Transform:
4485         loop_exit:
4486           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4487           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4488           v_out2 = reduce <v_out1>
4489           s_out3 = extract_field <v_out2, 0>
4490           s_out4 = adjust_result <s_out3>
4491           use <s_out0>
4492           use <s_out0>
4493
4494      into:
4495
4496         loop_exit:
4497           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4498           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4499           v_out2 = reduce <v_out1>
4500           s_out3 = extract_field <v_out2, 0>
4501           s_out4 = adjust_result <s_out3>
4502           use <s_out4>
4503           use <s_out4> */
4504
4505
4506   /* In SLP reduction chain we reduce vector results into one vector if
4507      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4508      the last stmt in the reduction chain, since we are looking for the loop
4509      exit phi node.  */
4510   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4511     {
4512       scalar_dest = gimple_assign_lhs (
4513                         SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1]);
4514       group_size = 1;
4515     }
4516
4517   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4518      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4519      need to match SCALAR_RESULTS with corresponding statements.  The first
4520      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4521      the first vector stmt, etc.
4522      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4523   if (group_size > new_phis.length ())
4524     {
4525       ratio = group_size / new_phis.length ();
4526       gcc_assert (!(group_size % new_phis.length ()));
4527     }
4528   else
4529     ratio = 1;
4530
4531   for (k = 0; k < group_size; k++)
4532     {
4533       if (k % ratio == 0)
4534         {
4535           epilog_stmt = new_phis[k / ratio];
4536           reduction_phi = reduction_phis[k / ratio];
4537           if (double_reduc)
4538             inner_phi = inner_phis[k / ratio];
4539         }
4540
4541       if (slp_reduc)
4542         {
4543           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4544
4545           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4546           /* SLP statements can't participate in patterns.  */
4547           gcc_assert (!orig_stmt);
4548           scalar_dest = gimple_assign_lhs (current_stmt);
4549         }
4550
4551       phis.create (3);
4552       /* Find the loop-closed-use at the loop exit of the original scalar
4553          result.  (The reduction result is expected to have two immediate uses -
4554          one at the latch block, and one at the loop exit).  */
4555       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4556         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4557             && !is_gimple_debug (USE_STMT (use_p)))
4558           phis.safe_push (USE_STMT (use_p));
4559
4560       /* While we expect to have found an exit_phi because of loop-closed-ssa
4561          form we can end up without one if the scalar cycle is dead.  */
4562
4563       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4564         {
4565           if (outer_loop)
4566             {
4567               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4568               gphi *vect_phi;
4569
4570               /* FORNOW. Currently not supporting the case that an inner-loop
4571                  reduction is not used in the outer-loop (but only outside the
4572                  outer-loop), unless it is double reduction.  */
4573               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4574                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4575                           || double_reduc);
4576
4577               STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4578               if (!double_reduc
4579                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4580                       != vect_double_reduction_def)
4581                 continue;
4582
4583               /* Handle double reduction:
4584
4585                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4586                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4587                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4588                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4589
4590                  At that point the regular reduction (stmt2 and stmt3) is
4591                  already vectorized, as well as the exit phi node, stmt4.
4592                  Here we vectorize the phi node of double reduction, stmt1, and
4593                  update all relevant statements.  */
4594
4595               /* Go through all the uses of s2 to find double reduction phi
4596                  node, i.e., stmt1 above.  */
4597               orig_name = PHI_RESULT (exit_phi);
4598               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4599                 {
4600                   stmt_vec_info use_stmt_vinfo;
4601                   stmt_vec_info new_phi_vinfo;
4602                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4603                   basic_block bb = gimple_bb (use_stmt);
4604                   gimple use;
4605
4606                   /* Check that USE_STMT is really double reduction phi
4607                      node.  */
4608                   if (gimple_code (use_stmt) != GIMPLE_PHI
4609                       || gimple_phi_num_args (use_stmt) != 2
4610                       || bb->loop_father != outer_loop)
4611                     continue;
4612                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4613                   if (!use_stmt_vinfo
4614                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4615                           != vect_double_reduction_def)
4616                     continue;
4617
4618                   /* Create vector phi node for double reduction:
4619                      vs1 = phi <vs0, vs2>
4620                      vs1 was created previously in this function by a call to
4621                        vect_get_vec_def_for_operand and is stored in
4622                        vec_initial_def;
4623                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4624                      vs0 is created here.  */
4625
4626                   /* Create vector phi node.  */
4627                   vect_phi = create_phi_node (vec_initial_def, bb);
4628                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4629                                     loop_vec_info_for_loop (outer_loop), NULL);
4630                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4631
4632                   /* Create vs0 - initial def of the double reduction phi.  */
4633                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4634                                              loop_preheader_edge (outer_loop));
4635                   init_def = get_initial_def_for_reduction (stmt,
4636                                                           preheader_arg, NULL);
4637                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4638                                                     vectype, NULL);
4639
4640                   /* Update phi node arguments with vs0 and vs2.  */
4641                   add_phi_arg (vect_phi, vect_phi_init,
4642                                loop_preheader_edge (outer_loop),
4643                                UNKNOWN_LOCATION);
4644                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4645                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4646                   if (dump_enabled_p ())
4647                     {
4648                       dump_printf_loc (MSG_NOTE, vect_location,
4649                                        "created double reduction phi node: ");
4650                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4651                       dump_printf (MSG_NOTE, "\n");
4652                     }
4653
4654                   vect_phi_res = PHI_RESULT (vect_phi);
4655
4656                   /* Replace the use, i.e., set the correct vs1 in the regular
4657                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4658                      loop is redundant.  */
4659                   use = reduction_phi;
4660                   for (j = 0; j < ncopies; j++)
4661                     {
4662                       edge pr_edge = loop_preheader_edge (loop);
4663                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4664                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4665                     }
4666                 }
4667             }
4668         }
4669
4670       phis.release ();
4671       if (nested_in_vect_loop)
4672         {
4673           if (double_reduc)
4674             loop = outer_loop;
4675           else
4676             continue;
4677         }
4678
4679       phis.create (3);
4680       /* Find the loop-closed-use at the loop exit of the original scalar
4681          result.  (The reduction result is expected to have two immediate uses,
4682          one at the latch block, and one at the loop exit).  For double
4683          reductions we are looking for exit phis of the outer loop.  */
4684       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4685         {
4686           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4687             {
4688               if (!is_gimple_debug (USE_STMT (use_p)))
4689                 phis.safe_push (USE_STMT (use_p));
4690             }
4691           else
4692             {
4693               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4694                 {
4695                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4696
4697                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4698                     {
4699                       if (!flow_bb_inside_loop_p (loop,
4700                                              gimple_bb (USE_STMT (phi_use_p)))
4701                           && !is_gimple_debug (USE_STMT (phi_use_p)))
4702                         phis.safe_push (USE_STMT (phi_use_p));
4703                     }
4704                 }
4705             }
4706         }
4707
4708       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4709         {
4710           /* Replace the uses:  */
4711           orig_name = PHI_RESULT (exit_phi);
4712           scalar_result = scalar_results[k];
4713           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4714             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4715               SET_USE (use_p, scalar_result);
4716         }
4717
4718       phis.release ();
4719     }
4720 }
4721
4722
4723 /* Function vectorizable_reduction.
4724
4725    Check if STMT performs a reduction operation that can be vectorized.
4726    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4727    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4728    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4729
4730    This function also handles reduction idioms (patterns) that have been
4731    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4732    of this form:
4733      X = pattern_expr (arg0, arg1, ..., X)
4734    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4735    sequence that had been detected and replaced by the pattern-stmt (STMT).
4736
4737    In some cases of reduction patterns, the type of the reduction variable X is
4738    different than the type of the other arguments of STMT.
4739    In such cases, the vectype that is used when transforming STMT into a vector
4740    stmt is different than the vectype that is used to determine the
4741    vectorization factor, because it consists of a different number of elements
4742    than the actual number of elements that are being operated upon in parallel.
4743
4744    For example, consider an accumulation of shorts into an int accumulator.
4745    On some targets it's possible to vectorize this pattern operating on 8
4746    shorts at a time (hence, the vectype for purposes of determining the
4747    vectorization factor should be V8HI); on the other hand, the vectype that
4748    is used to create the vector form is actually V4SI (the type of the result).
4749
4750    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4751    indicates what is the actual level of parallelism (V8HI in the example), so
4752    that the right vectorization factor would be derived.  This vectype
4753    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4754    be used to create the vectorized stmt.  The right vectype for the vectorized
4755    stmt is obtained from the type of the result X:
4756         get_vectype_for_scalar_type (TREE_TYPE (X))
4757
4758    This means that, contrary to "regular" reductions (or "regular" stmts in
4759    general), the following equation:
4760       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4761    does *NOT* necessarily hold for reduction patterns.  */
4762
4763 bool
4764 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4765                         gimple *vec_stmt, slp_tree slp_node)
4766 {
4767   tree vec_dest;
4768   tree scalar_dest;
4769   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4770   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4771   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4772   tree vectype_in = NULL_TREE;
4773   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4774   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4775   enum tree_code code, orig_code, epilog_reduc_code;
4776   machine_mode vec_mode;
4777   int op_type;
4778   optab optab, reduc_optab;
4779   tree new_temp = NULL_TREE;
4780   tree def;
4781   gimple def_stmt;
4782   enum vect_def_type dt;
4783   gphi *new_phi = NULL;
4784   tree scalar_type;
4785   bool is_simple_use;
4786   gimple orig_stmt;
4787   stmt_vec_info orig_stmt_info;
4788   tree expr = NULL_TREE;
4789   int i;
4790   int ncopies;
4791   int epilog_copies;
4792   stmt_vec_info prev_stmt_info, prev_phi_info;
4793   bool single_defuse_cycle = false;
4794   tree reduc_def = NULL_TREE;
4795   gimple new_stmt = NULL;
4796   int j;
4797   tree ops[3];
4798   bool nested_cycle = false, found_nested_cycle_def = false;
4799   gimple reduc_def_stmt = NULL;
4800   /* The default is that the reduction variable is the last in statement.  */
4801   int reduc_index = 2;
4802   bool double_reduc = false, dummy;
4803   basic_block def_bb;
4804   struct loop * def_stmt_loop, *outer_loop = NULL;
4805   tree def_arg;
4806   gimple def_arg_stmt;
4807   auto_vec<tree> vec_oprnds0;
4808   auto_vec<tree> vec_oprnds1;
4809   auto_vec<tree> vect_defs;
4810   auto_vec<gimple> phis;
4811   int vec_num;
4812   tree def0, def1, tem, op0, op1 = NULL_TREE;
4813
4814   /* In case of reduction chain we switch to the first stmt in the chain, but
4815      we don't update STMT_INFO, since only the last stmt is marked as reduction
4816      and has reduction properties.  */
4817   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4818     stmt = GROUP_FIRST_ELEMENT (stmt_info);
4819
4820   if (nested_in_vect_loop_p (loop, stmt))
4821     {
4822       outer_loop = loop;
4823       loop = loop->inner;
4824       nested_cycle = true;
4825     }
4826
4827   /* 1. Is vectorizable reduction?  */
4828   /* Not supportable if the reduction variable is used in the loop, unless
4829      it's a reduction chain.  */
4830   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4831       && !GROUP_FIRST_ELEMENT (stmt_info))
4832     return false;
4833
4834   /* Reductions that are not used even in an enclosing outer-loop,
4835      are expected to be "live" (used out of the loop).  */
4836   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4837       && !STMT_VINFO_LIVE_P (stmt_info))
4838     return false;
4839
4840   /* Make sure it was already recognized as a reduction computation.  */
4841   if (STMT_VINFO_DEF_TYPE (stmt_info) != vect_reduction_def
4842       && STMT_VINFO_DEF_TYPE (stmt_info) != vect_nested_cycle)
4843     return false;
4844
4845   /* 2. Has this been recognized as a reduction pattern?
4846
4847      Check if STMT represents a pattern that has been recognized
4848      in earlier analysis stages.  For stmts that represent a pattern,
4849      the STMT_VINFO_RELATED_STMT field records the last stmt in
4850      the original sequence that constitutes the pattern.  */
4851
4852   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4853   if (orig_stmt)
4854     {
4855       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4856       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4857       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4858     }
4859
4860   /* 3. Check the operands of the operation.  The first operands are defined
4861         inside the loop body. The last operand is the reduction variable,
4862         which is defined by the loop-header-phi.  */
4863
4864   gcc_assert (is_gimple_assign (stmt));
4865
4866   /* Flatten RHS.  */
4867   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4868     {
4869     case GIMPLE_SINGLE_RHS:
4870       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4871       if (op_type == ternary_op)
4872         {
4873           tree rhs = gimple_assign_rhs1 (stmt);
4874           ops[0] = TREE_OPERAND (rhs, 0);
4875           ops[1] = TREE_OPERAND (rhs, 1);
4876           ops[2] = TREE_OPERAND (rhs, 2);
4877           code = TREE_CODE (rhs);
4878         }
4879       else
4880         return false;
4881       break;
4882
4883     case GIMPLE_BINARY_RHS:
4884       code = gimple_assign_rhs_code (stmt);
4885       op_type = TREE_CODE_LENGTH (code);
4886       gcc_assert (op_type == binary_op);
4887       ops[0] = gimple_assign_rhs1 (stmt);
4888       ops[1] = gimple_assign_rhs2 (stmt);
4889       break;
4890
4891     case GIMPLE_TERNARY_RHS:
4892       code = gimple_assign_rhs_code (stmt);
4893       op_type = TREE_CODE_LENGTH (code);
4894       gcc_assert (op_type == ternary_op);
4895       ops[0] = gimple_assign_rhs1 (stmt);
4896       ops[1] = gimple_assign_rhs2 (stmt);
4897       ops[2] = gimple_assign_rhs3 (stmt);
4898       break;
4899
4900     case GIMPLE_UNARY_RHS:
4901       return false;
4902
4903     default:
4904       gcc_unreachable ();
4905     }
4906
4907   if (code == COND_EXPR && slp_node)
4908     return false;
4909
4910   scalar_dest = gimple_assign_lhs (stmt);
4911   scalar_type = TREE_TYPE (scalar_dest);
4912   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4913       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4914     return false;
4915
4916   /* Do not try to vectorize bit-precision reductions.  */
4917   if ((TYPE_PRECISION (scalar_type)
4918        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4919     return false;
4920
4921   /* All uses but the last are expected to be defined in the loop.
4922      The last use is the reduction variable.  In case of nested cycle this
4923      assumption is not true: we use reduc_index to record the index of the
4924      reduction variable.  */
4925   for (i = 0; i < op_type - 1; i++)
4926     {
4927       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4928       if (i == 0 && code == COND_EXPR)
4929         continue;
4930
4931       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4932                                             &def_stmt, &def, &dt, &tem);
4933       if (!vectype_in)
4934         vectype_in = tem;
4935       gcc_assert (is_simple_use);
4936
4937       if (dt != vect_internal_def
4938           && dt != vect_external_def
4939           && dt != vect_constant_def
4940           && dt != vect_induction_def
4941           && !(dt == vect_nested_cycle && nested_cycle))
4942         return false;
4943
4944       if (dt == vect_nested_cycle)
4945         {
4946           found_nested_cycle_def = true;
4947           reduc_def_stmt = def_stmt;
4948           reduc_index = i;
4949         }
4950     }
4951
4952   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
4953                                         &def_stmt, &def, &dt, &tem);
4954   if (!vectype_in)
4955     vectype_in = tem;
4956   gcc_assert (is_simple_use);
4957   if (!(dt == vect_reduction_def
4958         || dt == vect_nested_cycle
4959         || ((dt == vect_internal_def || dt == vect_external_def
4960              || dt == vect_constant_def || dt == vect_induction_def)
4961             && nested_cycle && found_nested_cycle_def)))
4962     {
4963       /* For pattern recognized stmts, orig_stmt might be a reduction,
4964          but some helper statements for the pattern might not, or
4965          might be COND_EXPRs with reduction uses in the condition.  */
4966       gcc_assert (orig_stmt);
4967       return false;
4968     }
4969   if (!found_nested_cycle_def)
4970     reduc_def_stmt = def_stmt;
4971
4972   gcc_assert (gimple_code (reduc_def_stmt) == GIMPLE_PHI);
4973   if (orig_stmt)
4974     gcc_assert (orig_stmt == vect_is_simple_reduction (loop_vinfo,
4975                                                        reduc_def_stmt,
4976                                                        !nested_cycle,
4977                                                        &dummy));
4978   else
4979     {
4980       gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4981                                              !nested_cycle, &dummy);
4982       /* We changed STMT to be the first stmt in reduction chain, hence we
4983          check that in this case the first element in the chain is STMT.  */
4984       gcc_assert (stmt == tmp
4985                   || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4986     }
4987
4988   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4989     return false;
4990
4991   if (slp_node || PURE_SLP_STMT (stmt_info))
4992     ncopies = 1;
4993   else
4994     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
4995                / TYPE_VECTOR_SUBPARTS (vectype_in));
4996
4997   gcc_assert (ncopies >= 1);
4998
4999   vec_mode = TYPE_MODE (vectype_in);
5000
5001   if (code == COND_EXPR)
5002     {
5003       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5004         {
5005           if (dump_enabled_p ())
5006             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5007                              "unsupported condition in reduction\n");
5008
5009             return false;
5010         }
5011     }
5012   else
5013     {
5014       /* 4. Supportable by target?  */
5015
5016       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5017           || code == LROTATE_EXPR || code == RROTATE_EXPR)
5018         {
5019           /* Shifts and rotates are only supported by vectorizable_shifts,
5020              not vectorizable_reduction.  */
5021           if (dump_enabled_p ())
5022             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5023                              "unsupported shift or rotation.\n");
5024           return false;
5025         }
5026
5027       /* 4.1. check support for the operation in the loop  */
5028       optab = optab_for_tree_code (code, vectype_in, optab_default);
5029       if (!optab)
5030         {
5031           if (dump_enabled_p ())
5032             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5033                              "no optab.\n");
5034
5035           return false;
5036         }
5037
5038       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5039         {
5040           if (dump_enabled_p ())
5041             dump_printf (MSG_NOTE, "op not supported by target.\n");
5042
5043           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5044               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5045                   < vect_min_worthwhile_factor (code))
5046             return false;
5047
5048           if (dump_enabled_p ())
5049             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5050         }
5051
5052       /* Worthwhile without SIMD support?  */
5053       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5054           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5055              < vect_min_worthwhile_factor (code))
5056         {
5057           if (dump_enabled_p ())
5058             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5059                              "not worthwhile without SIMD support.\n");
5060
5061           return false;
5062         }
5063     }
5064
5065   /* 4.2. Check support for the epilog operation.
5066
5067           If STMT represents a reduction pattern, then the type of the
5068           reduction variable may be different than the type of the rest
5069           of the arguments.  For example, consider the case of accumulation
5070           of shorts into an int accumulator; The original code:
5071                         S1: int_a = (int) short_a;
5072           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5073
5074           was replaced with:
5075                         STMT: int_acc = widen_sum <short_a, int_acc>
5076
5077           This means that:
5078           1. The tree-code that is used to create the vector operation in the
5079              epilog code (that reduces the partial results) is not the
5080              tree-code of STMT, but is rather the tree-code of the original
5081              stmt from the pattern that STMT is replacing.  I.e, in the example
5082              above we want to use 'widen_sum' in the loop, but 'plus' in the
5083              epilog.
5084           2. The type (mode) we use to check available target support
5085              for the vector operation to be created in the *epilog*, is
5086              determined by the type of the reduction variable (in the example
5087              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5088              However the type (mode) we use to check available target support
5089              for the vector operation to be created *inside the loop*, is
5090              determined by the type of the other arguments to STMT (in the
5091              example we'd check this: optab_handler (widen_sum_optab,
5092              vect_short_mode)).
5093
5094           This is contrary to "regular" reductions, in which the types of all
5095           the arguments are the same as the type of the reduction variable.
5096           For "regular" reductions we can therefore use the same vector type
5097           (and also the same tree-code) when generating the epilog code and
5098           when generating the code inside the loop.  */
5099
5100   if (orig_stmt)
5101     {
5102       /* This is a reduction pattern: get the vectype from the type of the
5103          reduction variable, and get the tree-code from orig_stmt.  */
5104       orig_code = gimple_assign_rhs_code (orig_stmt);
5105       gcc_assert (vectype_out);
5106       vec_mode = TYPE_MODE (vectype_out);
5107     }
5108   else
5109     {
5110       /* Regular reduction: use the same vectype and tree-code as used for
5111          the vector code inside the loop can be used for the epilog code. */
5112       orig_code = code;
5113     }
5114
5115   if (nested_cycle)
5116     {
5117       def_bb = gimple_bb (reduc_def_stmt);
5118       def_stmt_loop = def_bb->loop_father;
5119       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5120                                        loop_preheader_edge (def_stmt_loop));
5121       if (TREE_CODE (def_arg) == SSA_NAME
5122           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5123           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5124           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5125           && vinfo_for_stmt (def_arg_stmt)
5126           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5127               == vect_double_reduction_def)
5128         double_reduc = true;
5129     }
5130
5131   epilog_reduc_code = ERROR_MARK;
5132   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5133     {
5134       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5135                                          optab_default);
5136       if (!reduc_optab)
5137         {
5138           if (dump_enabled_p ())
5139             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5140                              "no optab for reduction.\n");
5141
5142           epilog_reduc_code = ERROR_MARK;
5143         }
5144       else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5145         {
5146           optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5147           if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5148             {
5149               if (dump_enabled_p ())
5150                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5151                                  "reduc op not supported by target.\n");
5152
5153               epilog_reduc_code = ERROR_MARK;
5154             }
5155         }
5156     }
5157   else
5158     {
5159       if (!nested_cycle || double_reduc)
5160         {
5161           if (dump_enabled_p ())
5162             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5163                              "no reduc code for scalar code.\n");
5164
5165           return false;
5166         }
5167     }
5168
5169   if (double_reduc && ncopies > 1)
5170     {
5171       if (dump_enabled_p ())
5172         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5173                          "multiple types in double reduction\n");
5174
5175       return false;
5176     }
5177
5178   /* In case of widenning multiplication by a constant, we update the type
5179      of the constant to be the type of the other operand.  We check that the
5180      constant fits the type in the pattern recognition pass.  */
5181   if (code == DOT_PROD_EXPR
5182       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5183     {
5184       if (TREE_CODE (ops[0]) == INTEGER_CST)
5185         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5186       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5187         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5188       else
5189         {
5190           if (dump_enabled_p ())
5191             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5192                              "invalid types in dot-prod\n");
5193
5194           return false;
5195         }
5196     }
5197
5198   if (!vec_stmt) /* transformation not required.  */
5199     {
5200       if (!vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies))
5201         return false;
5202       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5203       return true;
5204     }
5205
5206   /** Transform.  **/
5207
5208   if (dump_enabled_p ())
5209     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5210
5211   /* FORNOW: Multiple types are not supported for condition.  */
5212   if (code == COND_EXPR)
5213     gcc_assert (ncopies == 1);
5214
5215   /* Create the destination vector  */
5216   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5217
5218   /* In case the vectorization factor (VF) is bigger than the number
5219      of elements that we can fit in a vectype (nunits), we have to generate
5220      more than one vector stmt - i.e - we need to "unroll" the
5221      vector stmt by a factor VF/nunits.  For more details see documentation
5222      in vectorizable_operation.  */
5223
5224   /* If the reduction is used in an outer loop we need to generate
5225      VF intermediate results, like so (e.g. for ncopies=2):
5226         r0 = phi (init, r0)
5227         r1 = phi (init, r1)
5228         r0 = x0 + r0;
5229         r1 = x1 + r1;
5230     (i.e. we generate VF results in 2 registers).
5231     In this case we have a separate def-use cycle for each copy, and therefore
5232     for each copy we get the vector def for the reduction variable from the
5233     respective phi node created for this copy.
5234
5235     Otherwise (the reduction is unused in the loop nest), we can combine
5236     together intermediate results, like so (e.g. for ncopies=2):
5237         r = phi (init, r)
5238         r = x0 + r;
5239         r = x1 + r;
5240    (i.e. we generate VF/2 results in a single register).
5241    In this case for each copy we get the vector def for the reduction variable
5242    from the vectorized reduction operation generated in the previous iteration.
5243   */
5244
5245   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5246     {
5247       single_defuse_cycle = true;
5248       epilog_copies = 1;
5249     }
5250   else
5251     epilog_copies = ncopies;
5252
5253   prev_stmt_info = NULL;
5254   prev_phi_info = NULL;
5255   if (slp_node)
5256     {
5257       vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5258       gcc_assert (TYPE_VECTOR_SUBPARTS (vectype_out)
5259                   == TYPE_VECTOR_SUBPARTS (vectype_in));
5260     }
5261   else
5262     {
5263       vec_num = 1;
5264       vec_oprnds0.create (1);
5265       if (op_type == ternary_op)
5266         vec_oprnds1.create (1);
5267     }
5268
5269   phis.create (vec_num);
5270   vect_defs.create (vec_num);
5271   if (!slp_node)
5272     vect_defs.quick_push (NULL_TREE);
5273
5274   for (j = 0; j < ncopies; j++)
5275     {
5276       if (j == 0 || !single_defuse_cycle)
5277         {
5278           for (i = 0; i < vec_num; i++)
5279             {
5280               /* Create the reduction-phi that defines the reduction
5281                  operand.  */
5282               new_phi = create_phi_node (vec_dest, loop->header);
5283               set_vinfo_for_stmt (new_phi,
5284                                   new_stmt_vec_info (new_phi, loop_vinfo,
5285                                                      NULL));
5286                if (j == 0 || slp_node)
5287                  phis.quick_push (new_phi);
5288             }
5289         }
5290
5291       if (code == COND_EXPR)
5292         {
5293           gcc_assert (!slp_node);
5294           vectorizable_condition (stmt, gsi, vec_stmt,
5295                                   PHI_RESULT (phis[0]),
5296                                   reduc_index, NULL);
5297           /* Multiple types are not supported for condition.  */
5298           break;
5299         }
5300
5301       /* Handle uses.  */
5302       if (j == 0)
5303         {
5304           op0 = ops[!reduc_index];
5305           if (op_type == ternary_op)
5306             {
5307               if (reduc_index == 0)
5308                 op1 = ops[2];
5309               else
5310                 op1 = ops[1];
5311             }
5312
5313           if (slp_node)
5314             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5315                                slp_node, -1);
5316           else
5317             {
5318               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5319                                                             stmt, NULL);
5320               vec_oprnds0.quick_push (loop_vec_def0);
5321               if (op_type == ternary_op)
5322                {
5323                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5324                                                                NULL);
5325                  vec_oprnds1.quick_push (loop_vec_def1);
5326                }
5327             }
5328         }
5329       else
5330         {
5331           if (!slp_node)
5332             {
5333               enum vect_def_type dt;
5334               gimple dummy_stmt;
5335               tree dummy;
5336
5337               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5338                                   &dummy_stmt, &dummy, &dt);
5339               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5340                                                               loop_vec_def0);
5341               vec_oprnds0[0] = loop_vec_def0;
5342               if (op_type == ternary_op)
5343                 {
5344                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5345                                       &dummy, &dt);
5346                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5347                                                                 loop_vec_def1);
5348                   vec_oprnds1[0] = loop_vec_def1;
5349                 }
5350             }
5351
5352           if (single_defuse_cycle)
5353             reduc_def = gimple_assign_lhs (new_stmt);
5354
5355           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5356         }
5357
5358       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5359         {
5360           if (slp_node)
5361             reduc_def = PHI_RESULT (phis[i]);
5362           else
5363             {
5364               if (!single_defuse_cycle || j == 0)
5365                 reduc_def = PHI_RESULT (new_phi);
5366             }
5367
5368           def1 = ((op_type == ternary_op)
5369                   ? vec_oprnds1[i] : NULL);
5370           if (op_type == binary_op)
5371             {
5372               if (reduc_index == 0)
5373                 expr = build2 (code, vectype_out, reduc_def, def0);
5374               else
5375                 expr = build2 (code, vectype_out, def0, reduc_def);
5376             }
5377           else
5378             {
5379               if (reduc_index == 0)
5380                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5381               else
5382                 {
5383                   if (reduc_index == 1)
5384                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5385                   else
5386                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5387                 }
5388             }
5389
5390           new_stmt = gimple_build_assign (vec_dest, expr);
5391           new_temp = make_ssa_name (vec_dest, new_stmt);
5392           gimple_assign_set_lhs (new_stmt, new_temp);
5393           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5394
5395           if (slp_node)
5396             {
5397               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5398               vect_defs.quick_push (new_temp);
5399             }
5400           else
5401             vect_defs[0] = new_temp;
5402         }
5403
5404       if (slp_node)
5405         continue;
5406
5407       if (j == 0)
5408         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5409       else
5410         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5411
5412       prev_stmt_info = vinfo_for_stmt (new_stmt);
5413       prev_phi_info = vinfo_for_stmt (new_phi);
5414     }
5415
5416   /* Finalize the reduction-phi (set its arguments) and create the
5417      epilog reduction code.  */
5418   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5419     {
5420       new_temp = gimple_assign_lhs (*vec_stmt);
5421       vect_defs[0] = new_temp;
5422     }
5423
5424   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5425                                     epilog_reduc_code, phis, reduc_index,
5426                                     double_reduc, slp_node);
5427
5428   return true;
5429 }
5430
5431 /* Function vect_min_worthwhile_factor.
5432
5433    For a loop where we could vectorize the operation indicated by CODE,
5434    return the minimum vectorization factor that makes it worthwhile
5435    to use generic vectors.  */
5436 int
5437 vect_min_worthwhile_factor (enum tree_code code)
5438 {
5439   switch (code)
5440     {
5441     case PLUS_EXPR:
5442     case MINUS_EXPR:
5443     case NEGATE_EXPR:
5444       return 4;
5445
5446     case BIT_AND_EXPR:
5447     case BIT_IOR_EXPR:
5448     case BIT_XOR_EXPR:
5449     case BIT_NOT_EXPR:
5450       return 2;
5451
5452     default:
5453       return INT_MAX;
5454     }
5455 }
5456
5457
5458 /* Function vectorizable_induction
5459
5460    Check if PHI performs an induction computation that can be vectorized.
5461    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5462    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5463    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5464
5465 bool
5466 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5467                         gimple *vec_stmt)
5468 {
5469   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5470   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5471   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5472   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5473   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5474   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5475   tree vec_def;
5476
5477   gcc_assert (ncopies >= 1);
5478   /* FORNOW. These restrictions should be relaxed.  */
5479   if (nested_in_vect_loop_p (loop, phi))
5480     {
5481       imm_use_iterator imm_iter;
5482       use_operand_p use_p;
5483       gimple exit_phi;
5484       edge latch_e;
5485       tree loop_arg;
5486
5487       if (ncopies > 1)
5488         {
5489           if (dump_enabled_p ())
5490             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5491                              "multiple types in nested loop.\n");
5492           return false;
5493         }
5494
5495       exit_phi = NULL;
5496       latch_e = loop_latch_edge (loop->inner);
5497       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5498       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5499         {
5500           gimple use_stmt = USE_STMT (use_p);
5501           if (is_gimple_debug (use_stmt))
5502             continue;
5503
5504           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5505             {
5506               exit_phi = use_stmt;
5507               break;
5508             }
5509         }
5510       if (exit_phi)
5511         {
5512           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5513           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5514                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5515             {
5516               if (dump_enabled_p ())
5517                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5518                                  "inner-loop induction only used outside "
5519                                  "of the outer vectorized loop.\n");
5520               return false;
5521             }
5522         }
5523     }
5524
5525   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5526     return false;
5527
5528   /* FORNOW: SLP not supported.  */
5529   if (STMT_SLP_TYPE (stmt_info))
5530     return false;
5531
5532   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5533
5534   if (gimple_code (phi) != GIMPLE_PHI)
5535     return false;
5536
5537   if (!vec_stmt) /* transformation not required.  */
5538     {
5539       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5540       if (dump_enabled_p ())
5541         dump_printf_loc (MSG_NOTE, vect_location,
5542                          "=== vectorizable_induction ===\n");
5543       vect_model_induction_cost (stmt_info, ncopies);
5544       return true;
5545     }
5546
5547   /** Transform.  **/
5548
5549   if (dump_enabled_p ())
5550     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5551
5552   vec_def = get_initial_def_for_induction (phi);
5553   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5554   return true;
5555 }
5556
5557 /* Function vectorizable_live_operation.
5558
5559    STMT computes a value that is used outside the loop.  Check if
5560    it can be supported.  */
5561
5562 bool
5563 vectorizable_live_operation (gimple stmt,
5564                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5565                              gimple *vec_stmt)
5566 {
5567   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5568   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5569   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5570   int i;
5571   int op_type;
5572   tree op;
5573   tree def;
5574   gimple def_stmt;
5575   enum vect_def_type dt;
5576   enum tree_code code;
5577   enum gimple_rhs_class rhs_class;
5578
5579   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5580
5581   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5582     return false;
5583
5584   if (!is_gimple_assign (stmt))
5585     {
5586       if (gimple_call_internal_p (stmt)
5587           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5588           && gimple_call_lhs (stmt)
5589           && loop->simduid
5590           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5591           && loop->simduid
5592              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5593         {
5594           edge e = single_exit (loop);
5595           basic_block merge_bb = e->dest;
5596           imm_use_iterator imm_iter;
5597           use_operand_p use_p;
5598           tree lhs = gimple_call_lhs (stmt);
5599
5600           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5601             {
5602               gimple use_stmt = USE_STMT (use_p);
5603               if (gimple_code (use_stmt) == GIMPLE_PHI
5604                   && gimple_bb (use_stmt) == merge_bb)
5605                 {
5606                   if (vec_stmt)
5607                     {
5608                       tree vfm1
5609                         = build_int_cst (unsigned_type_node,
5610                                          loop_vinfo->vectorization_factor - 1);
5611                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5612                     }
5613                   return true;
5614                 }
5615             }
5616         }
5617
5618       return false;
5619     }
5620
5621   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5622     return false;
5623
5624   /* FORNOW. CHECKME. */
5625   if (nested_in_vect_loop_p (loop, stmt))
5626     return false;
5627
5628   code = gimple_assign_rhs_code (stmt);
5629   op_type = TREE_CODE_LENGTH (code);
5630   rhs_class = get_gimple_rhs_class (code);
5631   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5632   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5633
5634   /* FORNOW: support only if all uses are invariant.  This means
5635      that the scalar operations can remain in place, unvectorized.
5636      The original last scalar value that they compute will be used.  */
5637
5638   for (i = 0; i < op_type; i++)
5639     {
5640       if (rhs_class == GIMPLE_SINGLE_RHS)
5641         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5642       else
5643         op = gimple_op (stmt, i + 1);
5644       if (op
5645           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5646                                   &dt))
5647         {
5648           if (dump_enabled_p ())
5649             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5650                              "use not simple.\n");
5651           return false;
5652         }
5653
5654       if (dt != vect_external_def && dt != vect_constant_def)
5655         return false;
5656     }
5657
5658   /* No transformation is required for the cases we currently support.  */
5659   return true;
5660 }
5661
5662 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5663
5664 static void
5665 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5666 {
5667   ssa_op_iter op_iter;
5668   imm_use_iterator imm_iter;
5669   def_operand_p def_p;
5670   gimple ustmt;
5671
5672   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5673     {
5674       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5675         {
5676           basic_block bb;
5677
5678           if (!is_gimple_debug (ustmt))
5679             continue;
5680
5681           bb = gimple_bb (ustmt);
5682
5683           if (!flow_bb_inside_loop_p (loop, bb))
5684             {
5685               if (gimple_debug_bind_p (ustmt))
5686                 {
5687                   if (dump_enabled_p ())
5688                     dump_printf_loc (MSG_NOTE, vect_location,
5689                                      "killing debug use\n");
5690
5691                   gimple_debug_bind_reset_value (ustmt);
5692                   update_stmt (ustmt);
5693                 }
5694               else
5695                 gcc_unreachable ();
5696             }
5697         }
5698     }
5699 }
5700
5701
5702 /* This function builds ni_name = number of iterations.  Statements
5703    are emitted on the loop preheader edge.  */
5704
5705 static tree
5706 vect_build_loop_niters (loop_vec_info loop_vinfo)
5707 {
5708   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5709   if (TREE_CODE (ni) == INTEGER_CST)
5710     return ni;
5711   else
5712     {
5713       tree ni_name, var;
5714       gimple_seq stmts = NULL;
5715       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5716
5717       var = create_tmp_var (TREE_TYPE (ni), "niters");
5718       ni_name = force_gimple_operand (ni, &stmts, false, var);
5719       if (stmts)
5720         gsi_insert_seq_on_edge_immediate (pe, stmts);
5721
5722       return ni_name;
5723     }
5724 }
5725
5726
5727 /* This function generates the following statements:
5728
5729    ni_name = number of iterations loop executes
5730    ratio = ni_name / vf
5731    ratio_mult_vf_name = ratio * vf
5732
5733    and places them on the loop preheader edge.  */
5734
5735 static void
5736 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5737                                  tree ni_name,
5738                                  tree *ratio_mult_vf_name_ptr,
5739                                  tree *ratio_name_ptr)
5740 {
5741   tree ni_minus_gap_name;
5742   tree var;
5743   tree ratio_name;
5744   tree ratio_mult_vf_name;
5745   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5746   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5747   tree log_vf;
5748
5749   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5750
5751   /* If epilogue loop is required because of data accesses with gaps, we
5752      subtract one iteration from the total number of iterations here for
5753      correct calculation of RATIO.  */
5754   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5755     {
5756       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5757                                        ni_name,
5758                                        build_one_cst (TREE_TYPE (ni_name)));
5759       if (!is_gimple_val (ni_minus_gap_name))
5760         {
5761           var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5762           gimple stmts = NULL;
5763           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5764                                                     true, var);
5765           gsi_insert_seq_on_edge_immediate (pe, stmts);
5766         }
5767     }
5768   else
5769     ni_minus_gap_name = ni_name;
5770
5771   /* Create: ratio = ni >> log2(vf) */
5772   /* ???  As we have ni == number of latch executions + 1, ni could
5773      have overflown to zero.  So avoid computing ratio based on ni
5774      but compute it using the fact that we know ratio will be at least
5775      one, thus via (ni - vf) >> log2(vf) + 1.  */
5776   ratio_name
5777     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5778                    fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5779                                 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5780                                              ni_minus_gap_name,
5781                                              build_int_cst
5782                                                (TREE_TYPE (ni_name), vf)),
5783                                 log_vf),
5784                    build_int_cst (TREE_TYPE (ni_name), 1));
5785   if (!is_gimple_val (ratio_name))
5786     {
5787       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5788       gimple stmts = NULL;
5789       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5790       gsi_insert_seq_on_edge_immediate (pe, stmts);
5791     }
5792   *ratio_name_ptr = ratio_name;
5793
5794   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
5795
5796   if (ratio_mult_vf_name_ptr)
5797     {
5798       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5799                                         ratio_name, log_vf);
5800       if (!is_gimple_val (ratio_mult_vf_name))
5801         {
5802           var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5803           gimple stmts = NULL;
5804           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5805                                                      true, var);
5806           gsi_insert_seq_on_edge_immediate (pe, stmts);
5807         }
5808       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5809     }
5810
5811   return;
5812 }
5813
5814
5815 /* Function vect_transform_loop.
5816
5817    The analysis phase has determined that the loop is vectorizable.
5818    Vectorize the loop - created vectorized stmts to replace the scalar
5819    stmts in the loop, and update the loop exit condition.  */
5820
5821 void
5822 vect_transform_loop (loop_vec_info loop_vinfo)
5823 {
5824   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5825   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5826   int nbbs = loop->num_nodes;
5827   int i;
5828   tree ratio = NULL;
5829   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5830   bool grouped_store;
5831   bool slp_scheduled = false;
5832   gimple stmt, pattern_stmt;
5833   gimple_seq pattern_def_seq = NULL;
5834   gimple_stmt_iterator pattern_def_si = gsi_none ();
5835   bool transform_pattern_stmt = false;
5836   bool check_profitability = false;
5837   int th;
5838   /* Record number of iterations before we started tampering with the profile. */
5839   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5840
5841   if (dump_enabled_p ())
5842     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5843
5844   /* If profile is inprecise, we have chance to fix it up.  */
5845   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5846     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5847
5848   /* Use the more conservative vectorization threshold.  If the number
5849      of iterations is constant assume the cost check has been performed
5850      by our caller.  If the threshold makes all loops profitable that
5851      run at least the vectorization factor number of times checking
5852      is pointless, too.  */
5853   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5854   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5855       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5856     {
5857       if (dump_enabled_p ())
5858         dump_printf_loc (MSG_NOTE, vect_location,
5859                          "Profitability threshold is %d loop iterations.\n",
5860                          th);
5861       check_profitability = true;
5862     }
5863
5864   /* Version the loop first, if required, so the profitability check
5865      comes first.  */
5866
5867   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5868       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5869     {
5870       vect_loop_versioning (loop_vinfo, th, check_profitability);
5871       check_profitability = false;
5872     }
5873
5874   tree ni_name = vect_build_loop_niters (loop_vinfo);
5875   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5876
5877   /* Peel the loop if there are data refs with unknown alignment.
5878      Only one data ref with unknown store is allowed.  */
5879
5880   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5881     {
5882       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5883                                      th, check_profitability);
5884       check_profitability = false;
5885       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5886          be re-computed.  */
5887       ni_name = NULL_TREE;
5888     }
5889
5890   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5891      compile time constant), or it is a constant that doesn't divide by the
5892      vectorization factor, then an epilog loop needs to be created.
5893      We therefore duplicate the loop: the original loop will be vectorized,
5894      and will compute the first (n/VF) iterations.  The second copy of the loop
5895      will remain scalar and will compute the remaining (n%VF) iterations.
5896      (VF is the vectorization factor).  */
5897
5898   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5899       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5900     {
5901       tree ratio_mult_vf;
5902       if (!ni_name)
5903         ni_name = vect_build_loop_niters (loop_vinfo);
5904       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5905                                        &ratio);
5906       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5907                                       th, check_profitability);
5908     }
5909   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5910     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5911                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5912   else
5913     {
5914       if (!ni_name)
5915         ni_name = vect_build_loop_niters (loop_vinfo);
5916       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5917     }
5918
5919   /* 1) Make sure the loop header has exactly two entries
5920      2) Make sure we have a preheader basic block.  */
5921
5922   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5923
5924   split_edge (loop_preheader_edge (loop));
5925
5926   /* FORNOW: the vectorizer supports only loops which body consist
5927      of one basic block (header + empty latch). When the vectorizer will
5928      support more involved loop forms, the order by which the BBs are
5929      traversed need to be reconsidered.  */
5930
5931   for (i = 0; i < nbbs; i++)
5932     {
5933       basic_block bb = bbs[i];
5934       stmt_vec_info stmt_info;
5935
5936       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
5937            gsi_next (&si))
5938         {
5939           gphi *phi = si.phi ();
5940           if (dump_enabled_p ())
5941             {
5942               dump_printf_loc (MSG_NOTE, vect_location,
5943                                "------>vectorizing phi: ");
5944               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5945               dump_printf (MSG_NOTE, "\n");
5946             }
5947           stmt_info = vinfo_for_stmt (phi);
5948           if (!stmt_info)
5949             continue;
5950
5951           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5952             vect_loop_kill_debug_uses (loop, phi);
5953
5954           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5955               && !STMT_VINFO_LIVE_P (stmt_info))
5956             continue;
5957
5958           if (STMT_VINFO_VECTYPE (stmt_info)
5959               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5960                   != (unsigned HOST_WIDE_INT) vectorization_factor)
5961               && dump_enabled_p ())
5962             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5963
5964           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5965             {
5966               if (dump_enabled_p ())
5967                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5968               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5969             }
5970         }
5971
5972       pattern_stmt = NULL;
5973       for (gimple_stmt_iterator si = gsi_start_bb (bb);
5974            !gsi_end_p (si) || transform_pattern_stmt;)
5975         {
5976           bool is_store;
5977
5978           if (transform_pattern_stmt)
5979             stmt = pattern_stmt;
5980           else
5981             {
5982               stmt = gsi_stmt (si);
5983               /* During vectorization remove existing clobber stmts.  */
5984               if (gimple_clobber_p (stmt))
5985                 {
5986                   unlink_stmt_vdef (stmt);
5987                   gsi_remove (&si, true);
5988                   release_defs (stmt);
5989                   continue;
5990                 }
5991             }
5992
5993           if (dump_enabled_p ())
5994             {
5995               dump_printf_loc (MSG_NOTE, vect_location,
5996                                "------>vectorizing statement: ");
5997               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5998               dump_printf (MSG_NOTE, "\n");
5999             }
6000
6001           stmt_info = vinfo_for_stmt (stmt);
6002
6003           /* vector stmts created in the outer-loop during vectorization of
6004              stmts in an inner-loop may not have a stmt_info, and do not
6005              need to be vectorized.  */
6006           if (!stmt_info)
6007             {
6008               gsi_next (&si);
6009               continue;
6010             }
6011
6012           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6013             vect_loop_kill_debug_uses (loop, stmt);
6014
6015           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6016               && !STMT_VINFO_LIVE_P (stmt_info))
6017             {
6018               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6019                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6020                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6021                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6022                 {
6023                   stmt = pattern_stmt;
6024                   stmt_info = vinfo_for_stmt (stmt);
6025                 }
6026               else
6027                 {
6028                   gsi_next (&si);
6029                   continue;
6030                 }
6031             }
6032           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6033                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6034                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6035                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6036             transform_pattern_stmt = true;
6037
6038           /* If pattern statement has def stmts, vectorize them too.  */
6039           if (is_pattern_stmt_p (stmt_info))
6040             {
6041               if (pattern_def_seq == NULL)
6042                 {
6043                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6044                   pattern_def_si = gsi_start (pattern_def_seq);
6045                 }
6046               else if (!gsi_end_p (pattern_def_si))
6047                 gsi_next (&pattern_def_si);
6048               if (pattern_def_seq != NULL)
6049                 {
6050                   gimple pattern_def_stmt = NULL;
6051                   stmt_vec_info pattern_def_stmt_info = NULL;
6052
6053                   while (!gsi_end_p (pattern_def_si))
6054                     {
6055                       pattern_def_stmt = gsi_stmt (pattern_def_si);
6056                       pattern_def_stmt_info
6057                         = vinfo_for_stmt (pattern_def_stmt);
6058                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6059                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6060                         break;
6061                       gsi_next (&pattern_def_si);
6062                     }
6063
6064                   if (!gsi_end_p (pattern_def_si))
6065                     {
6066                       if (dump_enabled_p ())
6067                         {
6068                           dump_printf_loc (MSG_NOTE, vect_location,
6069                                            "==> vectorizing pattern def "
6070                                            "stmt: ");
6071                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6072                                             pattern_def_stmt, 0);
6073                           dump_printf (MSG_NOTE, "\n");
6074                         }
6075
6076                       stmt = pattern_def_stmt;
6077                       stmt_info = pattern_def_stmt_info;
6078                     }
6079                   else
6080                     {
6081                       pattern_def_si = gsi_none ();
6082                       transform_pattern_stmt = false;
6083                     }
6084                 }
6085               else
6086                 transform_pattern_stmt = false;
6087             }
6088
6089           if (STMT_VINFO_VECTYPE (stmt_info))
6090             {
6091               unsigned int nunits
6092                 = (unsigned int)
6093                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6094               if (!STMT_SLP_TYPE (stmt_info)
6095                   && nunits != (unsigned int) vectorization_factor
6096                   && dump_enabled_p ())
6097                   /* For SLP VF is set according to unrolling factor, and not
6098                      to vector size, hence for SLP this print is not valid.  */
6099                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6100             }
6101
6102           /* SLP. Schedule all the SLP instances when the first SLP stmt is
6103              reached.  */
6104           if (STMT_SLP_TYPE (stmt_info))
6105             {
6106               if (!slp_scheduled)
6107                 {
6108                   slp_scheduled = true;
6109
6110                   if (dump_enabled_p ())
6111                     dump_printf_loc (MSG_NOTE, vect_location,
6112                                      "=== scheduling SLP instances ===\n");
6113
6114                   vect_schedule_slp (loop_vinfo, NULL);
6115                 }
6116
6117               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6118               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6119                 {
6120                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6121                     {
6122                       pattern_def_seq = NULL;
6123                       gsi_next (&si);
6124                     }
6125                   continue;
6126                 }
6127             }
6128
6129           /* -------- vectorize statement ------------ */
6130           if (dump_enabled_p ())
6131             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6132
6133           grouped_store = false;
6134           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6135           if (is_store)
6136             {
6137               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6138                 {
6139                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6140                      interleaving chain was completed - free all the stores in
6141                      the chain.  */
6142                   gsi_next (&si);
6143                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6144                 }
6145               else
6146                 {
6147                   /* Free the attached stmt_vec_info and remove the stmt.  */
6148                   gimple store = gsi_stmt (si);
6149                   free_stmt_vec_info (store);
6150                   unlink_stmt_vdef (store);
6151                   gsi_remove (&si, true);
6152                   release_defs (store);
6153                 }
6154
6155               /* Stores can only appear at the end of pattern statements.  */
6156               gcc_assert (!transform_pattern_stmt);
6157               pattern_def_seq = NULL;
6158             }
6159           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6160             {
6161               pattern_def_seq = NULL;
6162               gsi_next (&si);
6163             }
6164         }                       /* stmts in BB */
6165     }                           /* BBs in loop */
6166
6167   slpeel_make_loop_iterate_ntimes (loop, ratio);
6168
6169   /* Reduce loop iterations by the vectorization factor.  */
6170   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6171                       expected_iterations / vectorization_factor);
6172   loop->nb_iterations_upper_bound
6173     = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6174   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6175       && loop->nb_iterations_upper_bound != 0)
6176     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6177   if (loop->any_estimate)
6178     {
6179       loop->nb_iterations_estimate
6180         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6181        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6182            && loop->nb_iterations_estimate != 0)
6183          loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6184     }
6185
6186   if (dump_enabled_p ())
6187     {
6188       dump_printf_loc (MSG_NOTE, vect_location,
6189                        "LOOP VECTORIZED\n");
6190       if (loop->inner)
6191         dump_printf_loc (MSG_NOTE, vect_location,
6192                          "OUTER LOOP VECTORIZED\n");
6193       dump_printf (MSG_NOTE, "\n");
6194     }
6195 }