gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2015 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "backend.h"
  27 #include "cfghooks.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "rtl.h"
  31 #include "ssa.h"
  32 #include "alias.h"
  33 #include "fold-const.h"
  34 #include "stor-layout.h"
  35 #include "cfganal.h"
  36 #include "gimple-pretty-print.h"
  37 #include "internal-fn.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-pass.h"
  45 #include "cfgloop.h"
  46 #include "flags.h"
  47 #include "insn-codes.h"
  48 #include "optabs-tree.h"
  49 #include "params.h"
  50 #include "diagnostic-core.h"
  51 #include "tree-chrec.h"
  52 #include "tree-scalar-evolution.h"
  53 #include "tree-vectorizer.h"
  54 #include "target.h"
  55 #include "gimple-fold.h"
  56
  57 /* Loop Vectorization Pass.
  58
  59    This pass tries to vectorize loops.
  60
  61    For example, the vectorizer transforms the following simple loop:
  62
  63         short a[N]; short b[N]; short c[N]; int i;
  64
  65         for (i=0; i<N; i++){
  66           a[i] = b[i] + c[i];
  67         }
  68
  69    as if it was manually vectorized by rewriting the source code into:
  70
  71         typedef int __attribute__((mode(V8HI))) v8hi;
  72         short a[N];  short b[N]; short c[N];   int i;
  73         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  74         v8hi va, vb, vc;
  75
  76         for (i=0; i<N/8; i++){
  77           vb = pb[i];
  78           vc = pc[i];
  79           va = vb + vc;
  80           pa[i] = va;
  81         }
  82
  83         The main entry to this pass is vectorize_loops(), in which
  84    the vectorizer applies a set of analyses on a given set of loops,
  85    followed by the actual vectorization transformation for the loops that
  86    had successfully passed the analysis phase.
  87         Throughout this pass we make a distinction between two types of
  88    data: scalars (which are represented by SSA_NAMES), and memory references
  89    ("data-refs").  These two types of data require different handling both
  90    during analysis and transformation. The types of data-refs that the
  91    vectorizer currently supports are ARRAY_REFS which base is an array DECL
  92    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
  93    accesses are required to have a simple (consecutive) access pattern.
  94
  95    Analysis phase:
  96    ===============
  97         The driver for the analysis phase is vect_analyze_loop().
  98    It applies a set of analyses, some of which rely on the scalar evolution
  99    analyzer (scev) developed by Sebastian Pop.
 100
 101         During the analysis phase the vectorizer records some information
 102    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 103    loop, as well as general information about the loop as a whole, which is
 104    recorded in a "loop_vec_info" struct attached to each loop.
 105
 106    Transformation phase:
 107    =====================
 108         The loop transformation phase scans all the stmts in the loop, and
 109    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 110    the loop that needs to be vectorized.  It inserts the vector code sequence
 111    just before the scalar stmt S, and records a pointer to the vector code
 112    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 113    attached to S).  This pointer will be used for the vectorization of following
 114    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 115    otherwise, we rely on dead code elimination for removing it.
 116
 117         For example, say stmt S1 was vectorized into stmt VS1:
 118
 119    VS1: vb = px[i];
 120    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 121    S2:  a = b;
 122
 123    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 124    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 125    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 126    resulting sequence would be:
 127
 128    VS1: vb = px[i];
 129    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 130    VS2: va = vb;
 131    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 132
 133         Operands that are not SSA_NAMEs, are data-refs that appear in
 134    load/store operations (like 'x[i]' in S1), and are handled differently.
 135
 136    Target modeling:
 137    =================
 138         Currently the only target specific information that is used is the
 139    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 140    Targets that can support different sizes of vectors, for now will need
 141    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 142    flexibility will be added in the future.
 143
 144         Since we only vectorize operations which vector form can be
 145    expressed using existing tree codes, to verify that an operation is
 146    supported, the vectorizer checks the relevant optab at the relevant
 147    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 148    the value found is CODE_FOR_nothing, then there's no target support, and
 149    we can't vectorize the stmt.
 150
 151    For additional information on this project see:
 152    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 153 */
 154
 155 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 156
 157 /* Function vect_determine_vectorization_factor
 158
 159    Determine the vectorization factor (VF).  VF is the number of data elements
 160    that are operated upon in parallel in a single iteration of the vectorized
 161    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 162    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 163    elements can fit in a single vector register.
 164
 165    We currently support vectorization of loops in which all types operated upon
 166    are of the same size.  Therefore this function currently sets VF according to
 167    the size of the types operated upon, and fails if there are multiple sizes
 168    in the loop.
 169
 170    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 171    original loop:
 172         for (i=0; i<N; i++){
 173           a[i] = b[i] + c[i];
 174         }
 175
 176    vectorized loop:
 177         for (i=0; i<N; i+=VF){
 178           a[i:VF] = b[i:VF] + c[i:VF];
 179         }
 180 */
 181
 182 static bool
 183 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 184 {
 185   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 186   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 187   int nbbs = loop->num_nodes;
 188   unsigned int vectorization_factor = 0;
 189   tree scalar_type;
 190   gphi *phi;
 191   tree vectype;
 192   unsigned int nunits;
 193   stmt_vec_info stmt_info;
 194   int i;
 195   HOST_WIDE_INT dummy;
 196   gimple *stmt, *pattern_stmt = NULL;
 197   gimple_seq pattern_def_seq = NULL;
 198   gimple_stmt_iterator pattern_def_si = gsi_none ();
 199   bool analyze_pattern_stmt = false;
 200
 201   if (dump_enabled_p ())
 202     dump_printf_loc (MSG_NOTE, vect_location,
 203                      "=== vect_determine_vectorization_factor ===\n");
 204
 205   for (i = 0; i < nbbs; i++)
 206     {
 207       basic_block bb = bbs[i];
 208
 209       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 210            gsi_next (&si))
 211         {
 212           phi = si.phi ();
 213           stmt_info = vinfo_for_stmt (phi);
 214           if (dump_enabled_p ())
 215             {
 216               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 217               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 218               dump_printf (MSG_NOTE, "\n");
 219             }
 220
 221           gcc_assert (stmt_info);
 222
 223           if (STMT_VINFO_RELEVANT_P (stmt_info))
 224             {
 225               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 226               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 227
 228               if (dump_enabled_p ())
 229                 {
 230                   dump_printf_loc (MSG_NOTE, vect_location,
 231                                    "get vectype for scalar type:  ");
 232                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 233                   dump_printf (MSG_NOTE, "\n");
 234                 }
 235
 236               vectype = get_vectype_for_scalar_type (scalar_type);
 237               if (!vectype)
 238                 {
 239                   if (dump_enabled_p ())
 240                     {
 241                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 242                                        "not vectorized: unsupported "
 243                                        "data-type ");
 244                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 245                                          scalar_type);
 246                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 247                     }
 248                   return false;
 249                 }
 250               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 251
 252               if (dump_enabled_p ())
 253                 {
 254                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 255                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 256                   dump_printf (MSG_NOTE, "\n");
 257                 }
 258
 259               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 260               if (dump_enabled_p ())
 261                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 262                                  nunits);
 263
 264               if (!vectorization_factor
 265                   || (nunits > vectorization_factor))
 266                 vectorization_factor = nunits;
 267             }
 268         }
 269
 270       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 271            !gsi_end_p (si) || analyze_pattern_stmt;)
 272         {
 273           tree vf_vectype;
 274
 275           if (analyze_pattern_stmt)
 276             stmt = pattern_stmt;
 277           else
 278             stmt = gsi_stmt (si);
 279
 280           stmt_info = vinfo_for_stmt (stmt);
 281
 282           if (dump_enabled_p ())
 283             {
 284               dump_printf_loc (MSG_NOTE, vect_location,
 285                                "==> examining statement: ");
 286               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 287               dump_printf (MSG_NOTE, "\n");
 288             }
 289
 290           gcc_assert (stmt_info);
 291
 292           /* Skip stmts which do not need to be vectorized.  */
 293           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 294                && !STMT_VINFO_LIVE_P (stmt_info))
 295               || gimple_clobber_p (stmt))
 296             {
 297               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 298                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 299                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 300                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 301                 {
 302                   stmt = pattern_stmt;
 303                   stmt_info = vinfo_for_stmt (pattern_stmt);
 304                   if (dump_enabled_p ())
 305                     {
 306                       dump_printf_loc (MSG_NOTE, vect_location,
 307                                        "==> examining pattern statement: ");
 308                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 309                       dump_printf (MSG_NOTE, "\n");
 310                     }
 311                 }
 312               else
 313                 {
 314                   if (dump_enabled_p ())
 315                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 316                   gsi_next (&si);
 317                   continue;
 318                 }
 319             }
 320           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 321                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 322                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 323                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 324             analyze_pattern_stmt = true;
 325
 326           /* If a pattern statement has def stmts, analyze them too.  */
 327           if (is_pattern_stmt_p (stmt_info))
 328             {
 329               if (pattern_def_seq == NULL)
 330                 {
 331                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 332                   pattern_def_si = gsi_start (pattern_def_seq);
 333                 }
 334               else if (!gsi_end_p (pattern_def_si))
 335                 gsi_next (&pattern_def_si);
 336               if (pattern_def_seq != NULL)
 337                 {
 338                   gimple *pattern_def_stmt = NULL;
 339                   stmt_vec_info pattern_def_stmt_info = NULL;
 340
 341                   while (!gsi_end_p (pattern_def_si))
 342                     {
 343                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 344                       pattern_def_stmt_info
 345                         = vinfo_for_stmt (pattern_def_stmt);
 346                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 347                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 348                         break;
 349                       gsi_next (&pattern_def_si);
 350                     }
 351
 352                   if (!gsi_end_p (pattern_def_si))
 353                     {
 354                       if (dump_enabled_p ())
 355                         {
 356                           dump_printf_loc (MSG_NOTE, vect_location,
 357                                            "==> examining pattern def stmt: ");
 358                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 359                                             pattern_def_stmt, 0);
 360                           dump_printf (MSG_NOTE, "\n");
 361                         }
 362
 363                       stmt = pattern_def_stmt;
 364                       stmt_info = pattern_def_stmt_info;
 365                     }
 366                   else
 367                     {
 368                       pattern_def_si = gsi_none ();
 369                       analyze_pattern_stmt = false;
 370                     }
 371                 }
 372               else
 373                 analyze_pattern_stmt = false;
 374             }
 375
 376           if (gimple_get_lhs (stmt) == NULL_TREE
 377               /* MASK_STORE has no lhs, but is ok.  */
 378               && (!is_gimple_call (stmt)
 379                   || !gimple_call_internal_p (stmt)
 380                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 381             {
 382               if (is_gimple_call (stmt))
 383                 {
 384                   /* Ignore calls with no lhs.  These must be calls to
 385                      #pragma omp simd functions, and what vectorization factor
 386                      it really needs can't be determined until
 387                      vectorizable_simd_clone_call.  */
 388                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 389                     {
 390                       pattern_def_seq = NULL;
 391                       gsi_next (&si);
 392                     }
 393                   continue;
 394                 }
 395               if (dump_enabled_p ())
 396                 {
 397                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 398                                    "not vectorized: irregular stmt.");
 399                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 400                                     0);
 401                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 402                 }
 403               return false;
 404             }
 405
 406           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 407             {
 408               if (dump_enabled_p ())
 409                 {
 410                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 411                                    "not vectorized: vector stmt in loop:");
 412                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 413                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 414                 }
 415               return false;
 416             }
 417
 418           if (STMT_VINFO_VECTYPE (stmt_info))
 419             {
 420               /* The only case when a vectype had been already set is for stmts
 421                  that contain a dataref, or for "pattern-stmts" (stmts
 422                  generated by the vectorizer to represent/replace a certain
 423                  idiom).  */
 424               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 425                           || is_pattern_stmt_p (stmt_info)
 426                           || !gsi_end_p (pattern_def_si));
 427               vectype = STMT_VINFO_VECTYPE (stmt_info);
 428             }
 429           else
 430             {
 431               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 432               if (is_gimple_call (stmt)
 433                   && gimple_call_internal_p (stmt)
 434                   && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
 435                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 436               else
 437                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 438               if (dump_enabled_p ())
 439                 {
 440                   dump_printf_loc (MSG_NOTE, vect_location,
 441                                    "get vectype for scalar type:  ");
 442                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 443                   dump_printf (MSG_NOTE, "\n");
 444                 }
 445               vectype = get_vectype_for_scalar_type (scalar_type);
 446               if (!vectype)
 447                 {
 448                   if (dump_enabled_p ())
 449                     {
 450                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 451                                        "not vectorized: unsupported "
 452                                        "data-type ");
 453                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 454                                          scalar_type);
 455                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 456                     }
 457                   return false;
 458                 }
 459
 460               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 461
 462               if (dump_enabled_p ())
 463                 {
 464                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 465                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 466                   dump_printf (MSG_NOTE, "\n");
 467                 }
 468             }
 469
 470           /* The vectorization factor is according to the smallest
 471              scalar type (or the largest vector size, but we only
 472              support one vector size per loop).  */
 473           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 474                                                        &dummy);
 475           if (dump_enabled_p ())
 476             {
 477               dump_printf_loc (MSG_NOTE, vect_location,
 478                                "get vectype for scalar type:  ");
 479               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 480               dump_printf (MSG_NOTE, "\n");
 481             }
 482           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 483           if (!vf_vectype)
 484             {
 485               if (dump_enabled_p ())
 486                 {
 487                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 488                                    "not vectorized: unsupported data-type ");
 489                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 490                                      scalar_type);
 491                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 492                 }
 493               return false;
 494             }
 495
 496           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 497                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 498             {
 499               if (dump_enabled_p ())
 500                 {
 501                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 502                                    "not vectorized: different sized vector "
 503                                    "types in statement, ");
 504                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 505                                      vectype);
 506                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 507                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 508                                      vf_vectype);
 509                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 510                 }
 511               return false;
 512             }
 513
 514           if (dump_enabled_p ())
 515             {
 516               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 517               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 518               dump_printf (MSG_NOTE, "\n");
 519             }
 520
 521           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 522           if (dump_enabled_p ())
 523             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 524           if (!vectorization_factor
 525               || (nunits > vectorization_factor))
 526             vectorization_factor = nunits;
 527
 528           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 529             {
 530               pattern_def_seq = NULL;
 531               gsi_next (&si);
 532             }
 533         }
 534     }
 535
 536   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 537   if (dump_enabled_p ())
 538     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 539                      vectorization_factor);
 540   if (vectorization_factor <= 1)
 541     {
 542       if (dump_enabled_p ())
 543         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 544                          "not vectorized: unsupported data-type\n");
 545       return false;
 546     }
 547   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 548
 549   return true;
 550 }
 551
 552
 553 /* Function vect_is_simple_iv_evolution.
 554
 555    FORNOW: A simple evolution of an induction variables in the loop is
 556    considered a polynomial evolution.  */
 557
 558 static bool
 559 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 560                              tree * step)
 561 {
 562   tree init_expr;
 563   tree step_expr;
 564   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 565   basic_block bb;
 566
 567   /* When there is no evolution in this loop, the evolution function
 568      is not "simple".  */
 569   if (evolution_part == NULL_TREE)
 570     return false;
 571
 572   /* When the evolution is a polynomial of degree >= 2
 573      the evolution function is not "simple".  */
 574   if (tree_is_chrec (evolution_part))
 575     return false;
 576
 577   step_expr = evolution_part;
 578   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 579
 580   if (dump_enabled_p ())
 581     {
 582       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 583       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 584       dump_printf (MSG_NOTE, ",  init: ");
 585       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 586       dump_printf (MSG_NOTE, "\n");
 587     }
 588
 589   *init = init_expr;
 590   *step = step_expr;
 591
 592   if (TREE_CODE (step_expr) != INTEGER_CST
 593       && (TREE_CODE (step_expr) != SSA_NAME
 594           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 595               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 596           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 597               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 598                   || !flag_associative_math)))
 599       && (TREE_CODE (step_expr) != REAL_CST
 600           || !flag_associative_math))
 601     {
 602       if (dump_enabled_p ())
 603         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 604                          "step unknown.\n");
 605       return false;
 606     }
 607
 608   return true;
 609 }
 610
 611 /* Function vect_analyze_scalar_cycles_1.
 612
 613    Examine the cross iteration def-use cycles of scalar variables
 614    in LOOP.  LOOP_VINFO represents the loop that is now being
 615    considered for vectorization (can be LOOP, or an outer-loop
 616    enclosing LOOP).  */
 617
 618 static void
 619 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 620 {
 621   basic_block bb = loop->header;
 622   tree init, step;
 623   auto_vec<gimple *, 64> worklist;
 624   gphi_iterator gsi;
 625   bool double_reduc;
 626
 627   if (dump_enabled_p ())
 628     dump_printf_loc (MSG_NOTE, vect_location,
 629                      "=== vect_analyze_scalar_cycles ===\n");
 630
 631   /* First - identify all inductions.  Reduction detection assumes that all the
 632      inductions have been identified, therefore, this order must not be
 633      changed.  */
 634   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 635     {
 636       gphi *phi = gsi.phi ();
 637       tree access_fn = NULL;
 638       tree def = PHI_RESULT (phi);
 639       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 640
 641       if (dump_enabled_p ())
 642         {
 643           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 644           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 645           dump_printf (MSG_NOTE, "\n");
 646         }
 647
 648       /* Skip virtual phi's.  The data dependences that are associated with
 649          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 650       if (virtual_operand_p (def))
 651         continue;
 652
 653       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 654
 655       /* Analyze the evolution function.  */
 656       access_fn = analyze_scalar_evolution (loop, def);
 657       if (access_fn)
 658         {
 659           STRIP_NOPS (access_fn);
 660           if (dump_enabled_p ())
 661             {
 662               dump_printf_loc (MSG_NOTE, vect_location,
 663                                "Access function of PHI: ");
 664               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 665               dump_printf (MSG_NOTE, "\n");
 666             }
 667           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 668             = evolution_part_in_loop_num (access_fn, loop->num);
 669         }
 670
 671       if (!access_fn
 672           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 673           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 674               && TREE_CODE (step) != INTEGER_CST))
 675         {
 676           worklist.safe_push (phi);
 677           continue;
 678         }
 679
 680       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 681
 682       if (dump_enabled_p ())
 683         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 684       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 685     }
 686
 687
 688   /* Second - identify all reductions and nested cycles.  */
 689   while (worklist.length () > 0)
 690     {
 691       gimple *phi = worklist.pop ();
 692       tree def = PHI_RESULT (phi);
 693       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 694       gimple *reduc_stmt;
 695       bool nested_cycle;
 696
 697       if (dump_enabled_p ())
 698         {
 699           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 700           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 701           dump_printf (MSG_NOTE, "\n");
 702         }
 703
 704       gcc_assert (!virtual_operand_p (def)
 705                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 706
 707       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 708       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 709                                                 &double_reduc, false);
 710       if (reduc_stmt)
 711         {
 712           if (double_reduc)
 713             {
 714               if (dump_enabled_p ())
 715                 dump_printf_loc (MSG_NOTE, vect_location,
 716                                  "Detected double reduction.\n");
 717
 718               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 719               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 720                                                     vect_double_reduction_def;
 721             }
 722           else
 723             {
 724               if (nested_cycle)
 725                 {
 726                   if (dump_enabled_p ())
 727                     dump_printf_loc (MSG_NOTE, vect_location,
 728                                      "Detected vectorizable nested cycle.\n");
 729
 730                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 731                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 732                                                              vect_nested_cycle;
 733                 }
 734               else
 735                 {
 736                   if (dump_enabled_p ())
 737                     dump_printf_loc (MSG_NOTE, vect_location,
 738                                      "Detected reduction.\n");
 739
 740                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 741                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 742                                                            vect_reduction_def;
 743                   /* Store the reduction cycles for possible vectorization in
 744                      loop-aware SLP.  */
 745                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 746                 }
 747             }
 748         }
 749       else
 750         if (dump_enabled_p ())
 751           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 752                            "Unknown def-use cycle pattern.\n");
 753     }
 754 }
 755
 756
 757 /* Function vect_analyze_scalar_cycles.
 758
 759    Examine the cross iteration def-use cycles of scalar variables, by
 760    analyzing the loop-header PHIs of scalar variables.  Classify each
 761    cycle as one of the following: invariant, induction, reduction, unknown.
 762    We do that for the loop represented by LOOP_VINFO, and also to its
 763    inner-loop, if exists.
 764    Examples for scalar cycles:
 765
 766    Example1: reduction:
 767
 768               loop1:
 769               for (i=0; i<N; i++)
 770                  sum += a[i];
 771
 772    Example2: induction:
 773
 774               loop2:
 775               for (i=0; i<N; i++)
 776                  a[i] = i;  */
 777
 778 static void
 779 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 780 {
 781   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 782
 783   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 784
 785   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 786      Reductions in such inner-loop therefore have different properties than
 787      the reductions in the nest that gets vectorized:
 788      1. When vectorized, they are executed in the same order as in the original
 789         scalar loop, so we can't change the order of computation when
 790         vectorizing them.
 791      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 792         current checks are too strict.  */
 793
 794   if (loop->inner)
 795     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 796 }
 797
 798 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 799
 800 static void
 801 vect_fixup_reduc_chain (gimple *stmt)
 802 {
 803   gimple *firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 804   gimple *stmtp;
 805   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 806               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 807   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 808   do
 809     {
 810       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 811       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 812       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 813       if (stmt)
 814         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 815           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 816     }
 817   while (stmt);
 818   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 819 }
 820
 821 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 822
 823 static void
 824 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 825 {
 826   gimple *first;
 827   unsigned i;
 828
 829   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 830     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 831       {
 832         vect_fixup_reduc_chain (first);
 833         LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 834           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 835       }
 836 }
 837
 838 /* Function vect_get_loop_niters.
 839
 840    Determine how many iterations the loop is executed and place it
 841    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 842    in NUMBER_OF_ITERATIONSM1.
 843
 844    Return the loop exit condition.  */
 845
 846
 847 static gcond *
 848 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
 849                       tree *number_of_iterationsm1)
 850 {
 851   tree niters;
 852
 853   if (dump_enabled_p ())
 854     dump_printf_loc (MSG_NOTE, vect_location,
 855                      "=== get_loop_niters ===\n");
 856
 857   niters = number_of_latch_executions (loop);
 858   *number_of_iterationsm1 = niters;
 859
 860   /* We want the number of loop header executions which is the number
 861      of latch executions plus one.
 862      ???  For UINT_MAX latch executions this number overflows to zero
 863      for loops like do { n++; } while (n != 0);  */
 864   if (niters && !chrec_contains_undetermined (niters))
 865     niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
 866                           build_int_cst (TREE_TYPE (niters), 1));
 867   *number_of_iterations = niters;
 868
 869   return get_loop_exit_condition (loop);
 870 }
 871
 872
 873 /* Function bb_in_loop_p
 874
 875    Used as predicate for dfs order traversal of the loop bbs.  */
 876
 877 static bool
 878 bb_in_loop_p (const_basic_block bb, const void *data)
 879 {
 880   const struct loop *const loop = (const struct loop *)data;
 881   if (flow_bb_inside_loop_p (loop, bb))
 882     return true;
 883   return false;
 884 }
 885
 886
 887 /* Function new_loop_vec_info.
 888
 889    Create and initialize a new loop_vec_info struct for LOOP, as well as
 890    stmt_vec_info structs for all the stmts in LOOP.  */
 891
 892 static loop_vec_info
 893 new_loop_vec_info (struct loop *loop)
 894 {
 895   loop_vec_info res;
 896   basic_block *bbs;
 897   gimple_stmt_iterator si;
 898   unsigned int i, nbbs;
 899
 900   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 901   res->kind = vec_info::loop;
 902   LOOP_VINFO_LOOP (res) = loop;
 903
 904   bbs = get_loop_body (loop);
 905
 906   /* Create/Update stmt_info for all stmts in the loop.  */
 907   for (i = 0; i < loop->num_nodes; i++)
 908     {
 909       basic_block bb = bbs[i];
 910
 911       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 912         {
 913           gimple *phi = gsi_stmt (si);
 914           gimple_set_uid (phi, 0);
 915           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res));
 916         }
 917
 918       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 919         {
 920           gimple *stmt = gsi_stmt (si);
 921           gimple_set_uid (stmt, 0);
 922           set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res));
 923         }
 924     }
 925
 926   /* CHECKME: We want to visit all BBs before their successors (except for
 927      latch blocks, for which this assertion wouldn't hold).  In the simple
 928      case of the loop forms we allow, a dfs order of the BBs would the same
 929      as reversed postorder traversal, so we are safe.  */
 930
 931    free (bbs);
 932    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 933    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 934                               bbs, loop->num_nodes, loop);
 935    gcc_assert (nbbs == loop->num_nodes);
 936
 937   LOOP_VINFO_BBS (res) = bbs;
 938   LOOP_VINFO_NITERSM1 (res) = NULL;
 939   LOOP_VINFO_NITERS (res) = NULL;
 940   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 941   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
 942   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 943   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
 944   LOOP_VINFO_VECT_FACTOR (res) = 0;
 945   LOOP_VINFO_LOOP_NEST (res) = vNULL;
 946   LOOP_VINFO_DATAREFS (res) = vNULL;
 947   LOOP_VINFO_DDRS (res) = vNULL;
 948   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 949   LOOP_VINFO_MAY_MISALIGN_STMTS (res) = vNULL;
 950   LOOP_VINFO_MAY_ALIAS_DDRS (res) = vNULL;
 951   LOOP_VINFO_GROUPED_STORES (res) = vNULL;
 952   LOOP_VINFO_REDUCTIONS (res) = vNULL;
 953   LOOP_VINFO_REDUCTION_CHAINS (res) = vNULL;
 954   LOOP_VINFO_SLP_INSTANCES (res) = vNULL;
 955   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
 956   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
 957   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
 958   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
 959   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
 960
 961   return res;
 962 }
 963
 964
 965 /* Function destroy_loop_vec_info.
 966
 967    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
 968    stmts in the loop.  */
 969
 970 void
 971 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
 972 {
 973   struct loop *loop;
 974   basic_block *bbs;
 975   int nbbs;
 976   gimple_stmt_iterator si;
 977   int j;
 978   vec<slp_instance> slp_instances;
 979   slp_instance instance;
 980   bool swapped;
 981
 982   if (!loop_vinfo)
 983     return;
 984
 985   loop = LOOP_VINFO_LOOP (loop_vinfo);
 986
 987   bbs = LOOP_VINFO_BBS (loop_vinfo);
 988   nbbs = clean_stmts ? loop->num_nodes : 0;
 989   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
 990
 991   for (j = 0; j < nbbs; j++)
 992     {
 993       basic_block bb = bbs[j];
 994       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 995         free_stmt_vec_info (gsi_stmt (si));
 996
 997       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
 998         {
 999           gimple *stmt = gsi_stmt (si);
1000
1001           /* We may have broken canonical form by moving a constant
1002              into RHS1 of a commutative op.  Fix such occurrences.  */
1003           if (swapped && is_gimple_assign (stmt))
1004             {
1005               enum tree_code code = gimple_assign_rhs_code (stmt);
1006
1007               if ((code == PLUS_EXPR
1008                    || code == POINTER_PLUS_EXPR
1009                    || code == MULT_EXPR)
1010                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1011                 swap_ssa_operands (stmt,
1012                                    gimple_assign_rhs1_ptr (stmt),
1013                                    gimple_assign_rhs2_ptr (stmt));
1014             }
1015
1016           /* Free stmt_vec_info.  */
1017           free_stmt_vec_info (stmt);
1018           gsi_next (&si);
1019         }
1020     }
1021
1022   free (LOOP_VINFO_BBS (loop_vinfo));
1023   vect_destroy_datarefs (loop_vinfo);
1024   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1025   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1026   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1027   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1028   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1029   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1030     vect_free_slp_instance (instance);
1031
1032   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1033   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1034   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1035   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1036
1037   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1038   loop_vinfo->scalar_cost_vec.release ();
1039
1040   free (loop_vinfo);
1041   loop->aux = NULL;
1042 }
1043
1044
1045 /* Calculate the cost of one scalar iteration of the loop.  */
1046 static void
1047 vect_compute_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1048 {
1049   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1050   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1051   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1052   int innerloop_iters, i;
1053
1054   /* Count statements in scalar loop.  Using this as scalar cost for a single
1055      iteration for now.
1056
1057      TODO: Add outer loop support.
1058
1059      TODO: Consider assigning different costs to different scalar
1060      statements.  */
1061
1062   /* FORNOW.  */
1063   innerloop_iters = 1;
1064   if (loop->inner)
1065     innerloop_iters = 50; /* FIXME */
1066
1067   for (i = 0; i < nbbs; i++)
1068     {
1069       gimple_stmt_iterator si;
1070       basic_block bb = bbs[i];
1071
1072       if (bb->loop_father == loop->inner)
1073         factor = innerloop_iters;
1074       else
1075         factor = 1;
1076
1077       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1078         {
1079           gimple *stmt = gsi_stmt (si);
1080           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1081
1082           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1083             continue;
1084
1085           /* Skip stmts that are not vectorized inside the loop.  */
1086           if (stmt_info
1087               && !STMT_VINFO_RELEVANT_P (stmt_info)
1088               && (!STMT_VINFO_LIVE_P (stmt_info)
1089                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1090               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1091             continue;
1092
1093           vect_cost_for_stmt kind;
1094           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1095             {
1096               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1097                kind = scalar_load;
1098              else
1099                kind = scalar_store;
1100             }
1101           else
1102             kind = scalar_stmt;
1103
1104           scalar_single_iter_cost
1105             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1106                                  factor, kind, NULL, 0, vect_prologue);
1107         }
1108     }
1109   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1110     = scalar_single_iter_cost;
1111 }
1112
1113
1114 /* Function vect_analyze_loop_form_1.
1115
1116    Verify that certain CFG restrictions hold, including:
1117    - the loop has a pre-header
1118    - the loop has a single entry and exit
1119    - the loop exit condition is simple enough, and the number of iterations
1120      can be analyzed (a countable loop).  */
1121
1122 bool
1123 vect_analyze_loop_form_1 (struct loop *loop, gcond **loop_cond,
1124                           tree *number_of_iterationsm1,
1125                           tree *number_of_iterations, gcond **inner_loop_cond)
1126 {
1127   if (dump_enabled_p ())
1128     dump_printf_loc (MSG_NOTE, vect_location,
1129                      "=== vect_analyze_loop_form ===\n");
1130
1131   /* Different restrictions apply when we are considering an inner-most loop,
1132      vs. an outer (nested) loop.
1133      (FORNOW. May want to relax some of these restrictions in the future).  */
1134
1135   if (!loop->inner)
1136     {
1137       /* Inner-most loop.  We currently require that the number of BBs is
1138          exactly 2 (the header and latch).  Vectorizable inner-most loops
1139          look like this:
1140
1141                         (pre-header)
1142                            |
1143                           header <--------+
1144                            | |            |
1145                            | +--> latch --+
1146                            |
1147                         (exit-bb)  */
1148
1149       if (loop->num_nodes != 2)
1150         {
1151           if (dump_enabled_p ())
1152             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1153                              "not vectorized: control flow in loop.\n");
1154           return false;
1155         }
1156
1157       if (empty_block_p (loop->header))
1158         {
1159           if (dump_enabled_p ())
1160             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1161                              "not vectorized: empty loop.\n");
1162           return false;
1163         }
1164     }
1165   else
1166     {
1167       struct loop *innerloop = loop->inner;
1168       edge entryedge;
1169
1170       /* Nested loop. We currently require that the loop is doubly-nested,
1171          contains a single inner loop, and the number of BBs is exactly 5.
1172          Vectorizable outer-loops look like this:
1173
1174                         (pre-header)
1175                            |
1176                           header <---+
1177                            |         |
1178                           inner-loop |
1179                            |         |
1180                           tail ------+
1181                            |
1182                         (exit-bb)
1183
1184          The inner-loop has the properties expected of inner-most loops
1185          as described above.  */
1186
1187       if ((loop->inner)->inner || (loop->inner)->next)
1188         {
1189           if (dump_enabled_p ())
1190             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1191                              "not vectorized: multiple nested loops.\n");
1192           return false;
1193         }
1194
1195       if (loop->num_nodes != 5)
1196         {
1197           if (dump_enabled_p ())
1198             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1199                              "not vectorized: control flow in loop.\n");
1200           return false;
1201         }
1202
1203       entryedge = loop_preheader_edge (innerloop);
1204       if (entryedge->src != loop->header
1205           || !single_exit (innerloop)
1206           || single_exit (innerloop)->dest != EDGE_PRED (loop->latch, 0)->src)
1207         {
1208           if (dump_enabled_p ())
1209             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1210                              "not vectorized: unsupported outerloop form.\n");
1211           return false;
1212         }
1213
1214       /* Analyze the inner-loop.  */
1215       tree inner_niterm1, inner_niter;
1216       if (! vect_analyze_loop_form_1 (loop->inner, inner_loop_cond,
1217                                       &inner_niterm1, &inner_niter, NULL))
1218         {
1219           if (dump_enabled_p ())
1220             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1221                              "not vectorized: Bad inner loop.\n");
1222           return false;
1223         }
1224
1225       if (!expr_invariant_in_loop_p (loop, inner_niter))
1226         {
1227           if (dump_enabled_p ())
1228             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1229                              "not vectorized: inner-loop count not"
1230                              " invariant.\n");
1231           return false;
1232         }
1233
1234       if (dump_enabled_p ())
1235         dump_printf_loc (MSG_NOTE, vect_location,
1236                          "Considering outer-loop vectorization.\n");
1237     }
1238
1239   if (!single_exit (loop)
1240       || EDGE_COUNT (loop->header->preds) != 2)
1241     {
1242       if (dump_enabled_p ())
1243         {
1244           if (!single_exit (loop))
1245             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1246                              "not vectorized: multiple exits.\n");
1247           else if (EDGE_COUNT (loop->header->preds) != 2)
1248             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1249                              "not vectorized: too many incoming edges.\n");
1250         }
1251       return false;
1252     }
1253
1254   /* We assume that the loop exit condition is at the end of the loop. i.e,
1255      that the loop is represented as a do-while (with a proper if-guard
1256      before the loop if needed), where the loop header contains all the
1257      executable statements, and the latch is empty.  */
1258   if (!empty_block_p (loop->latch)
1259       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1260     {
1261       if (dump_enabled_p ())
1262         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1263                          "not vectorized: latch block not empty.\n");
1264       return false;
1265     }
1266
1267   /* Make sure there exists a single-predecessor exit bb:  */
1268   if (!single_pred_p (single_exit (loop)->dest))
1269     {
1270       edge e = single_exit (loop);
1271       if (!(e->flags & EDGE_ABNORMAL))
1272         {
1273           split_loop_exit_edge (e);
1274           if (dump_enabled_p ())
1275             dump_printf (MSG_NOTE, "split exit edge.\n");
1276         }
1277       else
1278         {
1279           if (dump_enabled_p ())
1280             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1281                              "not vectorized: abnormal loop exit edge.\n");
1282           return false;
1283         }
1284     }
1285
1286   *loop_cond = vect_get_loop_niters (loop, number_of_iterations,
1287                                      number_of_iterationsm1);
1288   if (!*loop_cond)
1289     {
1290       if (dump_enabled_p ())
1291         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1292                          "not vectorized: complicated exit condition.\n");
1293       return false;
1294     }
1295
1296   if (!*number_of_iterations
1297       || chrec_contains_undetermined (*number_of_iterations))
1298     {
1299       if (dump_enabled_p ())
1300         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1301                          "not vectorized: number of iterations cannot be "
1302                          "computed.\n");
1303       return false;
1304     }
1305
1306   if (integer_zerop (*number_of_iterations))
1307     {
1308       if (dump_enabled_p ())
1309         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1310                          "not vectorized: number of iterations = 0.\n");
1311       return false;
1312     }
1313
1314   return true;
1315 }
1316
1317 /* Analyze LOOP form and return a loop_vec_info if it is of suitable form.  */
1318
1319 loop_vec_info
1320 vect_analyze_loop_form (struct loop *loop)
1321 {
1322   tree number_of_iterations, number_of_iterationsm1;
1323   gcond *loop_cond, *inner_loop_cond = NULL;
1324
1325   if (! vect_analyze_loop_form_1 (loop, &loop_cond, &number_of_iterationsm1,
1326                                   &number_of_iterations, &inner_loop_cond))
1327     return NULL;
1328
1329   loop_vec_info loop_vinfo = new_loop_vec_info (loop);
1330   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1331   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1332   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1333
1334   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1335     {
1336       if (dump_enabled_p ())
1337         {
1338           dump_printf_loc (MSG_NOTE, vect_location,
1339                            "Symbolic number of iterations is ");
1340           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1341           dump_printf (MSG_NOTE, "\n");
1342         }
1343     }
1344
1345   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1346   if (inner_loop_cond)
1347     STMT_VINFO_TYPE (vinfo_for_stmt (inner_loop_cond))
1348       = loop_exit_ctrl_vec_info_type;
1349
1350   gcc_assert (!loop->aux);
1351   loop->aux = loop_vinfo;
1352   return loop_vinfo;
1353 }
1354
1355
1356
1357 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1358    statements update the vectorization factor.  */
1359
1360 static void
1361 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1362 {
1363   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1364   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1365   int nbbs = loop->num_nodes;
1366   unsigned int vectorization_factor;
1367   int i;
1368
1369   if (dump_enabled_p ())
1370     dump_printf_loc (MSG_NOTE, vect_location,
1371                      "=== vect_update_vf_for_slp ===\n");
1372
1373   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1374   gcc_assert (vectorization_factor != 0);
1375
1376   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1377      vectorization factor of the loop is the unrolling factor required by
1378      the SLP instances.  If that unrolling factor is 1, we say, that we
1379      perform pure SLP on loop - cross iteration parallelism is not
1380      exploited.  */
1381   bool only_slp_in_loop = true;
1382   for (i = 0; i < nbbs; i++)
1383     {
1384       basic_block bb = bbs[i];
1385       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1386            gsi_next (&si))
1387         {
1388           gimple *stmt = gsi_stmt (si);
1389           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1390           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1391               && STMT_VINFO_RELATED_STMT (stmt_info))
1392             {
1393               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1394               stmt_info = vinfo_for_stmt (stmt);
1395             }
1396           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1397                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1398               && !PURE_SLP_STMT (stmt_info))
1399             /* STMT needs both SLP and loop-based vectorization.  */
1400             only_slp_in_loop = false;
1401         }
1402     }
1403
1404   if (only_slp_in_loop)
1405     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1406   else
1407     vectorization_factor
1408       = least_common_multiple (vectorization_factor,
1409                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1410
1411   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1412   if (dump_enabled_p ())
1413     dump_printf_loc (MSG_NOTE, vect_location,
1414                      "Updating vectorization factor to %d\n",
1415                      vectorization_factor);
1416 }
1417
1418 /* Function vect_analyze_loop_operations.
1419
1420    Scan the loop stmts and make sure they are all vectorizable.  */
1421
1422 static bool
1423 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1424 {
1425   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1426   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1427   int nbbs = loop->num_nodes;
1428   int i;
1429   stmt_vec_info stmt_info;
1430   bool need_to_vectorize = false;
1431   bool ok;
1432
1433   if (dump_enabled_p ())
1434     dump_printf_loc (MSG_NOTE, vect_location,
1435                      "=== vect_analyze_loop_operations ===\n");
1436
1437   for (i = 0; i < nbbs; i++)
1438     {
1439       basic_block bb = bbs[i];
1440
1441       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1442            gsi_next (&si))
1443         {
1444           gphi *phi = si.phi ();
1445           ok = true;
1446
1447           stmt_info = vinfo_for_stmt (phi);
1448           if (dump_enabled_p ())
1449             {
1450               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1451               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1452               dump_printf (MSG_NOTE, "\n");
1453             }
1454           if (virtual_operand_p (gimple_phi_result (phi)))
1455             continue;
1456
1457           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1458              (i.e., a phi in the tail of the outer-loop).  */
1459           if (! is_loop_header_bb_p (bb))
1460             {
1461               /* FORNOW: we currently don't support the case that these phis
1462                  are not used in the outerloop (unless it is double reduction,
1463                  i.e., this phi is vect_reduction_def), cause this case
1464                  requires to actually do something here.  */
1465               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1466                    || STMT_VINFO_LIVE_P (stmt_info))
1467                   && STMT_VINFO_DEF_TYPE (stmt_info)
1468                      != vect_double_reduction_def)
1469                 {
1470                   if (dump_enabled_p ())
1471                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1472                                      "Unsupported loop-closed phi in "
1473                                      "outer-loop.\n");
1474                   return false;
1475                 }
1476
1477               /* If PHI is used in the outer loop, we check that its operand
1478                  is defined in the inner loop.  */
1479               if (STMT_VINFO_RELEVANT_P (stmt_info))
1480                 {
1481                   tree phi_op;
1482                   gimple *op_def_stmt;
1483
1484                   if (gimple_phi_num_args (phi) != 1)
1485                     return false;
1486
1487                   phi_op = PHI_ARG_DEF (phi, 0);
1488                   if (TREE_CODE (phi_op) != SSA_NAME)
1489                     return false;
1490
1491                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1492                   if (gimple_nop_p (op_def_stmt)
1493                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1494                       || !vinfo_for_stmt (op_def_stmt))
1495                     return false;
1496
1497                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1498                         != vect_used_in_outer
1499                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1500                            != vect_used_in_outer_by_reduction)
1501                     return false;
1502                 }
1503
1504               continue;
1505             }
1506
1507           gcc_assert (stmt_info);
1508
1509           if (STMT_VINFO_LIVE_P (stmt_info))
1510             {
1511               /* FORNOW: not yet supported.  */
1512               if (dump_enabled_p ())
1513                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1514                                  "not vectorized: value used after loop.\n");
1515               return false;
1516             }
1517
1518           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1519               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1520             {
1521               /* A scalar-dependence cycle that we don't support.  */
1522               if (dump_enabled_p ())
1523                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1524                                  "not vectorized: scalar dependence cycle.\n");
1525               return false;
1526             }
1527
1528           if (STMT_VINFO_RELEVANT_P (stmt_info))
1529             {
1530               need_to_vectorize = true;
1531               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1532                 ok = vectorizable_induction (phi, NULL, NULL);
1533             }
1534
1535           if (!ok)
1536             {
1537               if (dump_enabled_p ())
1538                 {
1539                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1540                                    "not vectorized: relevant phi not "
1541                                    "supported: ");
1542                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1543                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1544                 }
1545               return false;
1546             }
1547         }
1548
1549       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1550            gsi_next (&si))
1551         {
1552           gimple *stmt = gsi_stmt (si);
1553           if (!gimple_clobber_p (stmt)
1554               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1555             return false;
1556         }
1557     } /* bbs */
1558
1559   /* All operations in the loop are either irrelevant (deal with loop
1560      control, or dead), or only used outside the loop and can be moved
1561      out of the loop (e.g. invariants, inductions).  The loop can be
1562      optimized away by scalar optimizations.  We're better off not
1563      touching this loop.  */
1564   if (!need_to_vectorize)
1565     {
1566       if (dump_enabled_p ())
1567         dump_printf_loc (MSG_NOTE, vect_location,
1568                          "All the computation can be taken out of the loop.\n");
1569       if (dump_enabled_p ())
1570         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1571                          "not vectorized: redundant loop. no profit to "
1572                          "vectorize.\n");
1573       return false;
1574     }
1575
1576   return true;
1577 }
1578
1579
1580 /* Function vect_analyze_loop_2.
1581
1582    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1583    for it.  The different analyses will record information in the
1584    loop_vec_info struct.  */
1585 static bool
1586 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1587 {
1588   bool ok;
1589   int max_vf = MAX_VECTORIZATION_FACTOR;
1590   int min_vf = 2;
1591   unsigned int n_stmts = 0;
1592
1593   /* Find all data references in the loop (which correspond to vdefs/vuses)
1594      and analyze their evolution in the loop.  Also adjust the minimal
1595      vectorization factor according to the loads and stores.
1596
1597      FORNOW: Handle only simple, array references, which
1598      alignment can be forced, and aligned pointer-references.  */
1599
1600   ok = vect_analyze_data_refs (loop_vinfo, &min_vf, &n_stmts);
1601   if (!ok)
1602     {
1603       if (dump_enabled_p ())
1604         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605                          "bad data references.\n");
1606       return false;
1607     }
1608
1609   /* Classify all cross-iteration scalar data-flow cycles.
1610      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1611
1612   vect_analyze_scalar_cycles (loop_vinfo);
1613
1614   vect_pattern_recog (loop_vinfo);
1615
1616   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1617
1618   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1619      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1620
1621   ok = vect_analyze_data_ref_accesses (loop_vinfo);
1622   if (!ok)
1623     {
1624       if (dump_enabled_p ())
1625         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1626                          "bad data access.\n");
1627       return false;
1628     }
1629
1630   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1631
1632   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1633   if (!ok)
1634     {
1635       if (dump_enabled_p ())
1636         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1637                          "unexpected pattern.\n");
1638       return false;
1639     }
1640
1641   /* Analyze data dependences between the data-refs in the loop
1642      and adjust the maximum vectorization factor according to
1643      the dependences.
1644      FORNOW: fail at the first data dependence that we encounter.  */
1645
1646   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1647   if (!ok
1648       || max_vf < min_vf)
1649     {
1650       if (dump_enabled_p ())
1651             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1652                              "bad data dependence.\n");
1653       return false;
1654     }
1655
1656   ok = vect_determine_vectorization_factor (loop_vinfo);
1657   if (!ok)
1658     {
1659       if (dump_enabled_p ())
1660         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1661                          "can't determine vectorization factor.\n");
1662       return false;
1663     }
1664   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1665     {
1666       if (dump_enabled_p ())
1667         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1668                          "bad data dependence.\n");
1669       return false;
1670     }
1671
1672   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1673   ok = vect_analyze_slp (loop_vinfo, n_stmts);
1674   if (!ok)
1675     return false;
1676
1677   /* If there are any SLP instances mark them as pure_slp.  */
1678   bool slp = vect_make_slp_decision (loop_vinfo);
1679   if (slp)
1680     {
1681       /* Find stmts that need to be both vectorized and SLPed.  */
1682       vect_detect_hybrid_slp (loop_vinfo);
1683
1684       /* Update the vectorization factor based on the SLP decision.  */
1685       vect_update_vf_for_slp (loop_vinfo);
1686     }
1687
1688   /* Now the vectorization factor is final.  */
1689   unsigned vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1690   gcc_assert (vectorization_factor != 0);
1691
1692   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1693     dump_printf_loc (MSG_NOTE, vect_location,
1694                      "vectorization_factor = %d, niters = "
1695                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1696                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1697
1698   HOST_WIDE_INT max_niter
1699     = max_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1700   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1701        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1702       || (max_niter != -1
1703           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1704     {
1705       if (dump_enabled_p ())
1706         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1707                          "not vectorized: iteration count too small.\n");
1708       if (dump_enabled_p ())
1709         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1710                          "not vectorized: iteration count smaller than "
1711                          "vectorization factor.\n");
1712       return false;
1713     }
1714
1715   /* Analyze the alignment of the data-refs in the loop.
1716      Fail if a data reference is found that cannot be vectorized.  */
1717
1718   ok = vect_analyze_data_refs_alignment (loop_vinfo);
1719   if (!ok)
1720     {
1721       if (dump_enabled_p ())
1722         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1723                          "bad data alignment.\n");
1724       return false;
1725     }
1726
1727   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1728      It is important to call pruning after vect_analyze_data_ref_accesses,
1729      since we use grouping information gathered by interleaving analysis.  */
1730   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1731   if (!ok)
1732     {
1733       if (dump_enabled_p ())
1734         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1735                          "number of versioning for alias "
1736                          "run-time tests exceeds %d "
1737                          "(--param vect-max-version-for-alias-checks)\n",
1738                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1739       return false;
1740     }
1741
1742   /* Compute the scalar iteration cost.  */
1743   vect_compute_single_scalar_iteration_cost (loop_vinfo);
1744
1745   /* This pass will decide on using loop versioning and/or loop peeling in
1746      order to enhance the alignment of data references in the loop.  */
1747
1748   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1749   if (!ok)
1750     {
1751       if (dump_enabled_p ())
1752         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1753                          "bad data alignment.\n");
1754       return false;
1755     }
1756
1757   if (slp)
1758     {
1759       /* Analyze operations in the SLP instances.  Note this may
1760          remove unsupported SLP instances which makes the above
1761          SLP kind detection invalid.  */
1762       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1763       vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1764                                    LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1765       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1766         return false;
1767     }
1768
1769   /* Scan all the remaining operations in the loop that are not subject
1770      to SLP and make sure they are vectorizable.  */
1771   ok = vect_analyze_loop_operations (loop_vinfo);
1772   if (!ok)
1773     {
1774       if (dump_enabled_p ())
1775         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1776                          "bad operation or unsupported loop bound.\n");
1777       return false;
1778     }
1779
1780   /* Analyze cost.  Decide if worth while to vectorize.  */
1781   int min_profitable_estimate, min_profitable_iters;
1782   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1783                                       &min_profitable_estimate);
1784
1785   if (min_profitable_iters < 0)
1786     {
1787       if (dump_enabled_p ())
1788         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1789                          "not vectorized: vectorization not profitable.\n");
1790       if (dump_enabled_p ())
1791         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1792                          "not vectorized: vector version will never be "
1793                          "profitable.\n");
1794       return false;
1795     }
1796
1797   int min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1798                                 * vectorization_factor) - 1);
1799
1800   /* Use the cost model only if it is more conservative than user specified
1801      threshold.  */
1802   unsigned th = (unsigned) min_scalar_loop_bound;
1803   if (min_profitable_iters
1804       && (!min_scalar_loop_bound
1805           || min_profitable_iters > min_scalar_loop_bound))
1806     th = (unsigned) min_profitable_iters;
1807
1808   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1809
1810   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1811       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1812     {
1813       if (dump_enabled_p ())
1814         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1815                          "not vectorized: vectorization not profitable.\n");
1816       if (dump_enabled_p ())
1817         dump_printf_loc (MSG_NOTE, vect_location,
1818                          "not vectorized: iteration count smaller than user "
1819                          "specified loop bound parameter or minimum profitable "
1820                          "iterations (whichever is more conservative).\n");
1821       return false;
1822     }
1823
1824   HOST_WIDE_INT estimated_niter
1825     = estimated_stmt_executions_int (LOOP_VINFO_LOOP (loop_vinfo));
1826   if (estimated_niter != -1
1827       && ((unsigned HOST_WIDE_INT) estimated_niter
1828           <= MAX (th, (unsigned)min_profitable_estimate)))
1829     {
1830       if (dump_enabled_p ())
1831         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1832                          "not vectorized: estimated iteration count too "
1833                          "small.\n");
1834       if (dump_enabled_p ())
1835         dump_printf_loc (MSG_NOTE, vect_location,
1836                          "not vectorized: estimated iteration count smaller "
1837                          "than specified loop bound parameter or minimum "
1838                          "profitable iterations (whichever is more "
1839                          "conservative).\n");
1840       return false;
1841     }
1842
1843   /* Decide whether we need to create an epilogue loop to handle
1844      remaining scalar iterations.  */
1845   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1846         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1847        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1848
1849   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1850       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1851     {
1852       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1853                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1854           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1855         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1856     }
1857   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1858            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1859                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1860                /* In case of versioning, check if the maximum number of
1861                   iterations is greater than th.  If they are identical,
1862                   the epilogue is unnecessary.  */
1863                && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1864                     && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1865                    || (unsigned HOST_WIDE_INT) max_niter > th)))
1866     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1867
1868   /* If an epilogue loop is required make sure we can create one.  */
1869   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1870       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1871     {
1872       if (dump_enabled_p ())
1873         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1874       if (!vect_can_advance_ivs_p (loop_vinfo)
1875           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1876                                            single_exit (LOOP_VINFO_LOOP
1877                                                          (loop_vinfo))))
1878         {
1879           if (dump_enabled_p ())
1880             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1881                              "not vectorized: can't create required "
1882                              "epilog loop\n");
1883           return false;
1884         }
1885     }
1886
1887   gcc_assert (vectorization_factor
1888               == (unsigned)LOOP_VINFO_VECT_FACTOR (loop_vinfo));
1889
1890   return true;
1891 }
1892
1893 /* Function vect_analyze_loop.
1894
1895    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1896    for it.  The different analyses will record information in the
1897    loop_vec_info struct.  */
1898 loop_vec_info
1899 vect_analyze_loop (struct loop *loop)
1900 {
1901   loop_vec_info loop_vinfo;
1902   unsigned int vector_sizes;
1903
1904   /* Autodetect first vector size we try.  */
1905   current_vector_size = 0;
1906   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1907
1908   if (dump_enabled_p ())
1909     dump_printf_loc (MSG_NOTE, vect_location,
1910                      "===== analyze_loop_nest =====\n");
1911
1912   if (loop_outer (loop)
1913       && loop_vec_info_for_loop (loop_outer (loop))
1914       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
1915     {
1916       if (dump_enabled_p ())
1917         dump_printf_loc (MSG_NOTE, vect_location,
1918                          "outer-loop already vectorized.\n");
1919       return NULL;
1920     }
1921
1922   while (1)
1923     {
1924       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
1925       loop_vinfo = vect_analyze_loop_form (loop);
1926       if (!loop_vinfo)
1927         {
1928           if (dump_enabled_p ())
1929             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1930                              "bad loop form.\n");
1931           return NULL;
1932         }
1933
1934       if (vect_analyze_loop_2 (loop_vinfo))
1935         {
1936           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
1937
1938           return loop_vinfo;
1939         }
1940
1941       destroy_loop_vec_info (loop_vinfo, true);
1942
1943       vector_sizes &= ~current_vector_size;
1944       if (vector_sizes == 0
1945           || current_vector_size == 0)
1946         return NULL;
1947
1948       /* Try the next biggest vector size.  */
1949       current_vector_size = 1 << floor_log2 (vector_sizes);
1950       if (dump_enabled_p ())
1951         dump_printf_loc (MSG_NOTE, vect_location,
1952                          "***** Re-trying analysis with "
1953                          "vector size %d\n", current_vector_size);
1954     }
1955 }
1956
1957
1958 /* Function reduction_code_for_scalar_code
1959
1960    Input:
1961    CODE - tree_code of a reduction operations.
1962
1963    Output:
1964    REDUC_CODE - the corresponding tree-code to be used to reduce the
1965       vector of partial results into a single scalar result, or ERROR_MARK
1966       if the operation is a supported reduction operation, but does not have
1967       such a tree-code.
1968
1969    Return FALSE if CODE currently cannot be vectorized as reduction.  */
1970
1971 static bool
1972 reduction_code_for_scalar_code (enum tree_code code,
1973                                 enum tree_code *reduc_code)
1974 {
1975   switch (code)
1976     {
1977       case MAX_EXPR:
1978         *reduc_code = REDUC_MAX_EXPR;
1979         return true;
1980
1981       case MIN_EXPR:
1982         *reduc_code = REDUC_MIN_EXPR;
1983         return true;
1984
1985       case PLUS_EXPR:
1986         *reduc_code = REDUC_PLUS_EXPR;
1987         return true;
1988
1989       case MULT_EXPR:
1990       case MINUS_EXPR:
1991       case BIT_IOR_EXPR:
1992       case BIT_XOR_EXPR:
1993       case BIT_AND_EXPR:
1994         *reduc_code = ERROR_MARK;
1995         return true;
1996
1997       default:
1998        return false;
1999     }
2000 }
2001
2002
2003 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2004    STMT is printed with a message MSG. */
2005
2006 static void
2007 report_vect_op (int msg_type, gimple *stmt, const char *msg)
2008 {
2009   dump_printf_loc (msg_type, vect_location, "%s", msg);
2010   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2011   dump_printf (msg_type, "\n");
2012 }
2013
2014
2015 /* Detect SLP reduction of the form:
2016
2017    #a1 = phi <a5, a0>
2018    a2 = operation (a1)
2019    a3 = operation (a2)
2020    a4 = operation (a3)
2021    a5 = operation (a4)
2022
2023    #a = phi <a5>
2024
2025    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2026    FIRST_STMT is the first reduction stmt in the chain
2027    (a2 = operation (a1)).
2028
2029    Return TRUE if a reduction chain was detected.  */
2030
2031 static bool
2032 vect_is_slp_reduction (loop_vec_info loop_info, gimple *phi,
2033                        gimple *first_stmt)
2034 {
2035   struct loop *loop = (gimple_bb (phi))->loop_father;
2036   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2037   enum tree_code code;
2038   gimple *current_stmt = NULL, *loop_use_stmt = NULL, *first, *next_stmt;
2039   stmt_vec_info use_stmt_info, current_stmt_info;
2040   tree lhs;
2041   imm_use_iterator imm_iter;
2042   use_operand_p use_p;
2043   int nloop_uses, size = 0, n_out_of_loop_uses;
2044   bool found = false;
2045
2046   if (loop != vect_loop)
2047     return false;
2048
2049   lhs = PHI_RESULT (phi);
2050   code = gimple_assign_rhs_code (first_stmt);
2051   while (1)
2052     {
2053       nloop_uses = 0;
2054       n_out_of_loop_uses = 0;
2055       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2056         {
2057           gimple *use_stmt = USE_STMT (use_p);
2058           if (is_gimple_debug (use_stmt))
2059             continue;
2060
2061           /* Check if we got back to the reduction phi.  */
2062           if (use_stmt == phi)
2063             {
2064               loop_use_stmt = use_stmt;
2065               found = true;
2066               break;
2067             }
2068
2069           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2070             {
2071               loop_use_stmt = use_stmt;
2072               nloop_uses++;
2073             }
2074            else
2075              n_out_of_loop_uses++;
2076
2077            /* There are can be either a single use in the loop or two uses in
2078               phi nodes.  */
2079            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2080              return false;
2081         }
2082
2083       if (found)
2084         break;
2085
2086       /* We reached a statement with no loop uses.  */
2087       if (nloop_uses == 0)
2088         return false;
2089
2090       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2091       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2092         return false;
2093
2094       if (!is_gimple_assign (loop_use_stmt)
2095           || code != gimple_assign_rhs_code (loop_use_stmt)
2096           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2097         return false;
2098
2099       /* Insert USE_STMT into reduction chain.  */
2100       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2101       if (current_stmt)
2102         {
2103           current_stmt_info = vinfo_for_stmt (current_stmt);
2104           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2105           GROUP_FIRST_ELEMENT (use_stmt_info)
2106             = GROUP_FIRST_ELEMENT (current_stmt_info);
2107         }
2108       else
2109         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2110
2111       lhs = gimple_assign_lhs (loop_use_stmt);
2112       current_stmt = loop_use_stmt;
2113       size++;
2114    }
2115
2116   if (!found || loop_use_stmt != phi || size < 2)
2117     return false;
2118
2119   /* Swap the operands, if needed, to make the reduction operand be the second
2120      operand.  */
2121   lhs = PHI_RESULT (phi);
2122   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2123   while (next_stmt)
2124     {
2125       if (gimple_assign_rhs2 (next_stmt) == lhs)
2126         {
2127           tree op = gimple_assign_rhs1 (next_stmt);
2128           gimple *def_stmt = NULL;
2129
2130           if (TREE_CODE (op) == SSA_NAME)
2131             def_stmt = SSA_NAME_DEF_STMT (op);
2132
2133           /* Check that the other def is either defined in the loop
2134              ("vect_internal_def"), or it's an induction (defined by a
2135              loop-header phi-node).  */
2136           if (def_stmt
2137               && gimple_bb (def_stmt)
2138               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2139               && (is_gimple_assign (def_stmt)
2140                   || is_gimple_call (def_stmt)
2141                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2142                            == vect_induction_def
2143                   || (gimple_code (def_stmt) == GIMPLE_PHI
2144                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2145                                   == vect_internal_def
2146                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2147             {
2148               lhs = gimple_assign_lhs (next_stmt);
2149               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2150               continue;
2151             }
2152
2153           return false;
2154         }
2155       else
2156         {
2157           tree op = gimple_assign_rhs2 (next_stmt);
2158           gimple *def_stmt = NULL;
2159
2160           if (TREE_CODE (op) == SSA_NAME)
2161             def_stmt = SSA_NAME_DEF_STMT (op);
2162
2163           /* Check that the other def is either defined in the loop
2164             ("vect_internal_def"), or it's an induction (defined by a
2165             loop-header phi-node).  */
2166           if (def_stmt
2167               && gimple_bb (def_stmt)
2168               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2169               && (is_gimple_assign (def_stmt)
2170                   || is_gimple_call (def_stmt)
2171                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2172                               == vect_induction_def
2173                   || (gimple_code (def_stmt) == GIMPLE_PHI
2174                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2175                                   == vect_internal_def
2176                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2177             {
2178               if (dump_enabled_p ())
2179                 {
2180                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2181                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2182                   dump_printf (MSG_NOTE, "\n");
2183                 }
2184
2185               swap_ssa_operands (next_stmt,
2186                                  gimple_assign_rhs1_ptr (next_stmt),
2187                                  gimple_assign_rhs2_ptr (next_stmt));
2188               update_stmt (next_stmt);
2189
2190               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2191                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2192             }
2193           else
2194             return false;
2195         }
2196
2197       lhs = gimple_assign_lhs (next_stmt);
2198       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2199     }
2200
2201   /* Save the chain for further analysis in SLP detection.  */
2202   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2203   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2204   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2205
2206   return true;
2207 }
2208
2209
2210 /* Function vect_is_simple_reduction_1
2211
2212    (1) Detect a cross-iteration def-use cycle that represents a simple
2213    reduction computation.  We look for the following pattern:
2214
2215    loop_header:
2216      a1 = phi < a0, a2 >
2217      a3 = ...
2218      a2 = operation (a3, a1)
2219
2220    or
2221
2222    a3 = ...
2223    loop_header:
2224      a1 = phi < a0, a2 >
2225      a2 = operation (a3, a1)
2226
2227    such that:
2228    1. operation is commutative and associative and it is safe to
2229       change the order of the computation (if CHECK_REDUCTION is true)
2230    2. no uses for a2 in the loop (a2 is used out of the loop)
2231    3. no uses of a1 in the loop besides the reduction operation
2232    4. no uses of a1 outside the loop.
2233
2234    Conditions 1,4 are tested here.
2235    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2236
2237    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2238    nested cycles, if CHECK_REDUCTION is false.
2239
2240    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2241    reductions:
2242
2243      a1 = phi < a0, a2 >
2244      inner loop (def of a3)
2245      a2 = phi < a3 >
2246
2247    If MODIFY is true it tries also to rework the code in-place to enable
2248    detection of more reduction patterns.  For the time being we rewrite
2249    "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2250 */
2251
2252 static gimple *
2253 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple *phi,
2254                             bool check_reduction, bool *double_reduc,
2255                             bool modify, bool need_wrapping_integral_overflow)
2256 {
2257   struct loop *loop = (gimple_bb (phi))->loop_father;
2258   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2259   edge latch_e = loop_latch_edge (loop);
2260   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2261   gimple *def_stmt, *def1 = NULL, *def2 = NULL;
2262   enum tree_code orig_code, code;
2263   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2264   tree type;
2265   int nloop_uses;
2266   tree name;
2267   imm_use_iterator imm_iter;
2268   use_operand_p use_p;
2269   bool phi_def;
2270
2271   *double_reduc = false;
2272
2273   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2274      otherwise, we assume outer loop vectorization.  */
2275   gcc_assert ((check_reduction && loop == vect_loop)
2276               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2277
2278   name = PHI_RESULT (phi);
2279   /* ???  If there are no uses of the PHI result the inner loop reduction
2280      won't be detected as possibly double-reduction by vectorizable_reduction
2281      because that tries to walk the PHI arg from the preheader edge which
2282      can be constant.  See PR60382.  */
2283   if (has_zero_uses (name))
2284     return NULL;
2285   nloop_uses = 0;
2286   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2287     {
2288       gimple *use_stmt = USE_STMT (use_p);
2289       if (is_gimple_debug (use_stmt))
2290         continue;
2291
2292       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2293         {
2294           if (dump_enabled_p ())
2295             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2296                              "intermediate value used outside loop.\n");
2297
2298           return NULL;
2299         }
2300
2301       nloop_uses++;
2302       if (nloop_uses > 1)
2303         {
2304           if (dump_enabled_p ())
2305             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2306                              "reduction used in loop.\n");
2307           return NULL;
2308         }
2309     }
2310
2311   if (TREE_CODE (loop_arg) != SSA_NAME)
2312     {
2313       if (dump_enabled_p ())
2314         {
2315           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2316                            "reduction: not ssa_name: ");
2317           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2318           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2319         }
2320       return NULL;
2321     }
2322
2323   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2324   if (!def_stmt)
2325     {
2326       if (dump_enabled_p ())
2327         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2328                          "reduction: no def_stmt.\n");
2329       return NULL;
2330     }
2331
2332   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2333     {
2334       if (dump_enabled_p ())
2335         {
2336           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2337           dump_printf (MSG_NOTE, "\n");
2338         }
2339       return NULL;
2340     }
2341
2342   if (is_gimple_assign (def_stmt))
2343     {
2344       name = gimple_assign_lhs (def_stmt);
2345       phi_def = false;
2346     }
2347   else
2348     {
2349       name = PHI_RESULT (def_stmt);
2350       phi_def = true;
2351     }
2352
2353   nloop_uses = 0;
2354   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2355     {
2356       gimple *use_stmt = USE_STMT (use_p);
2357       if (is_gimple_debug (use_stmt))
2358         continue;
2359       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2360         nloop_uses++;
2361       if (nloop_uses > 1)
2362         {
2363           if (dump_enabled_p ())
2364             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2365                              "reduction used in loop.\n");
2366           return NULL;
2367         }
2368     }
2369
2370   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2371      defined in the inner loop.  */
2372   if (phi_def)
2373     {
2374       op1 = PHI_ARG_DEF (def_stmt, 0);
2375
2376       if (gimple_phi_num_args (def_stmt) != 1
2377           || TREE_CODE (op1) != SSA_NAME)
2378         {
2379           if (dump_enabled_p ())
2380             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2381                              "unsupported phi node definition.\n");
2382
2383           return NULL;
2384         }
2385
2386       def1 = SSA_NAME_DEF_STMT (op1);
2387       if (gimple_bb (def1)
2388           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2389           && loop->inner
2390           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2391           && is_gimple_assign (def1))
2392         {
2393           if (dump_enabled_p ())
2394             report_vect_op (MSG_NOTE, def_stmt,
2395                             "detected double reduction: ");
2396
2397           *double_reduc = true;
2398           return def_stmt;
2399         }
2400
2401       return NULL;
2402     }
2403
2404   code = orig_code = gimple_assign_rhs_code (def_stmt);
2405
2406   /* We can handle "res -= x[i]", which is non-associative by
2407      simply rewriting this into "res += -x[i]".  Avoid changing
2408      gimple instruction for the first simple tests and only do this
2409      if we're allowed to change code at all.  */
2410   if (code == MINUS_EXPR
2411       && modify
2412       && (op1 = gimple_assign_rhs1 (def_stmt))
2413       && TREE_CODE (op1) == SSA_NAME
2414       && SSA_NAME_DEF_STMT (op1) == phi)
2415     code = PLUS_EXPR;
2416
2417   if (check_reduction
2418       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2419     {
2420       if (dump_enabled_p ())
2421         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2422                         "reduction: not commutative/associative: ");
2423       return NULL;
2424     }
2425
2426   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2427     {
2428       if (code != COND_EXPR)
2429         {
2430           if (dump_enabled_p ())
2431             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2432                             "reduction: not binary operation: ");
2433
2434           return NULL;
2435         }
2436
2437       op3 = gimple_assign_rhs1 (def_stmt);
2438       if (COMPARISON_CLASS_P (op3))
2439         {
2440           op4 = TREE_OPERAND (op3, 1);
2441           op3 = TREE_OPERAND (op3, 0);
2442         }
2443
2444       op1 = gimple_assign_rhs2 (def_stmt);
2445       op2 = gimple_assign_rhs3 (def_stmt);
2446
2447       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2448         {
2449           if (dump_enabled_p ())
2450             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2451                             "reduction: uses not ssa_names: ");
2452
2453           return NULL;
2454         }
2455     }
2456   else
2457     {
2458       op1 = gimple_assign_rhs1 (def_stmt);
2459       op2 = gimple_assign_rhs2 (def_stmt);
2460
2461       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2462         {
2463           if (dump_enabled_p ())
2464             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2465                             "reduction: uses not ssa_names: ");
2466
2467           return NULL;
2468         }
2469    }
2470
2471   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2472   if ((TREE_CODE (op1) == SSA_NAME
2473        && !types_compatible_p (type,TREE_TYPE (op1)))
2474       || (TREE_CODE (op2) == SSA_NAME
2475           && !types_compatible_p (type, TREE_TYPE (op2)))
2476       || (op3 && TREE_CODE (op3) == SSA_NAME
2477           && !types_compatible_p (type, TREE_TYPE (op3)))
2478       || (op4 && TREE_CODE (op4) == SSA_NAME
2479           && !types_compatible_p (type, TREE_TYPE (op4))))
2480     {
2481       if (dump_enabled_p ())
2482         {
2483           dump_printf_loc (MSG_NOTE, vect_location,
2484                            "reduction: multiple types: operation type: ");
2485           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2486           dump_printf (MSG_NOTE, ", operands types: ");
2487           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2488                              TREE_TYPE (op1));
2489           dump_printf (MSG_NOTE, ",");
2490           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2491                              TREE_TYPE (op2));
2492           if (op3)
2493             {
2494               dump_printf (MSG_NOTE, ",");
2495               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2496                                  TREE_TYPE (op3));
2497             }
2498
2499           if (op4)
2500             {
2501               dump_printf (MSG_NOTE, ",");
2502               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2503                                  TREE_TYPE (op4));
2504             }
2505           dump_printf (MSG_NOTE, "\n");
2506         }
2507
2508       return NULL;
2509     }
2510
2511   /* Check that it's ok to change the order of the computation.
2512      Generally, when vectorizing a reduction we change the order of the
2513      computation.  This may change the behavior of the program in some
2514      cases, so we need to check that this is ok.  One exception is when
2515      vectorizing an outer-loop: the inner-loop is executed sequentially,
2516      and therefore vectorizing reductions in the inner-loop during
2517      outer-loop vectorization is safe.  */
2518
2519   /* CHECKME: check for !flag_finite_math_only too?  */
2520   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2521       && check_reduction)
2522     {
2523       /* Changing the order of operations changes the semantics.  */
2524       if (dump_enabled_p ())
2525         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2526                         "reduction: unsafe fp math optimization: ");
2527       return NULL;
2528     }
2529   else if (INTEGRAL_TYPE_P (type) && check_reduction)
2530     {
2531       if (!operation_no_trapping_overflow (type, code))
2532         {
2533           /* Changing the order of operations changes the semantics.  */
2534           if (dump_enabled_p ())
2535             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2536                             "reduction: unsafe int math optimization"
2537                             " (overflow traps): ");
2538           return NULL;
2539         }
2540       if (need_wrapping_integral_overflow
2541           && !TYPE_OVERFLOW_WRAPS (type)
2542           && operation_can_overflow (code))
2543         {
2544           /* Changing the order of operations changes the semantics.  */
2545           if (dump_enabled_p ())
2546             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2547                             "reduction: unsafe int math optimization"
2548                             " (overflow doesn't wrap): ");
2549           return NULL;
2550         }
2551     }
2552   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2553     {
2554       /* Changing the order of operations changes the semantics.  */
2555       if (dump_enabled_p ())
2556         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2557                         "reduction: unsafe fixed-point math optimization: ");
2558       return NULL;
2559     }
2560
2561   /* If we detected "res -= x[i]" earlier, rewrite it into
2562      "res += -x[i]" now.  If this turns out to be useless reassoc
2563      will clean it up again.  */
2564   if (orig_code == MINUS_EXPR)
2565     {
2566       tree rhs = gimple_assign_rhs2 (def_stmt);
2567       tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2568       gimple *negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2569       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2570       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2571                                                           loop_info));
2572       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2573       gimple_assign_set_rhs2 (def_stmt, negrhs);
2574       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2575       update_stmt (def_stmt);
2576     }
2577
2578   /* Reduction is safe. We're dealing with one of the following:
2579      1) integer arithmetic and no trapv
2580      2) floating point arithmetic, and special flags permit this optimization
2581      3) nested cycle (i.e., outer loop vectorization).  */
2582   if (TREE_CODE (op1) == SSA_NAME)
2583     def1 = SSA_NAME_DEF_STMT (op1);
2584
2585   if (TREE_CODE (op2) == SSA_NAME)
2586     def2 = SSA_NAME_DEF_STMT (op2);
2587
2588   if (code != COND_EXPR
2589       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2590     {
2591       if (dump_enabled_p ())
2592         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2593       return NULL;
2594     }
2595
2596   /* Check that one def is the reduction def, defined by PHI,
2597      the other def is either defined in the loop ("vect_internal_def"),
2598      or it's an induction (defined by a loop-header phi-node).  */
2599
2600   if (def2 && def2 == phi
2601       && (code == COND_EXPR
2602           || !def1 || gimple_nop_p (def1)
2603           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2604           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2605               && (is_gimple_assign (def1)
2606                   || is_gimple_call (def1)
2607                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2608                       == vect_induction_def
2609                   || (gimple_code (def1) == GIMPLE_PHI
2610                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2611                           == vect_internal_def
2612                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2613     {
2614       if (dump_enabled_p ())
2615         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2616       return def_stmt;
2617     }
2618
2619   if (def1 && def1 == phi
2620       && (code == COND_EXPR
2621           || !def2 || gimple_nop_p (def2)
2622           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2623           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2624               && (is_gimple_assign (def2)
2625                   || is_gimple_call (def2)
2626                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2627                       == vect_induction_def
2628                   || (gimple_code (def2) == GIMPLE_PHI
2629                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2630                           == vect_internal_def
2631                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2632     {
2633       if (check_reduction)
2634         {
2635           /* Swap operands (just for simplicity - so that the rest of the code
2636              can assume that the reduction variable is always the last (second)
2637              argument).  */
2638           if (dump_enabled_p ())
2639             report_vect_op (MSG_NOTE, def_stmt,
2640                             "detected reduction: need to swap operands: ");
2641
2642           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2643                              gimple_assign_rhs2_ptr (def_stmt));
2644
2645           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2646             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2647         }
2648       else
2649         {
2650           if (dump_enabled_p ())
2651             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2652         }
2653
2654       return def_stmt;
2655     }
2656
2657   /* Try to find SLP reduction chain.  */
2658   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2659     {
2660       if (dump_enabled_p ())
2661         report_vect_op (MSG_NOTE, def_stmt,
2662                         "reduction: detected reduction chain: ");
2663
2664       return def_stmt;
2665     }
2666
2667   if (dump_enabled_p ())
2668     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2669                     "reduction: unknown pattern: ");
2670
2671   return NULL;
2672 }
2673
2674 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2675    in-place.  Arguments as there.  */
2676
2677 static gimple *
2678 vect_is_simple_reduction (loop_vec_info loop_info, gimple *phi,
2679                           bool check_reduction, bool *double_reduc,
2680                           bool need_wrapping_integral_overflow)
2681 {
2682   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2683                                      double_reduc, false,
2684                                      need_wrapping_integral_overflow);
2685 }
2686
2687 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2688    in-place if it enables detection of more reductions.  Arguments
2689    as there.  */
2690
2691 gimple *
2692 vect_force_simple_reduction (loop_vec_info loop_info, gimple *phi,
2693                              bool check_reduction, bool *double_reduc,
2694                              bool need_wrapping_integral_overflow)
2695 {
2696   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2697                                      double_reduc, true,
2698                                      need_wrapping_integral_overflow);
2699 }
2700
2701 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2702 int
2703 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2704                              int *peel_iters_epilogue,
2705                              stmt_vector_for_cost *scalar_cost_vec,
2706                              stmt_vector_for_cost *prologue_cost_vec,
2707                              stmt_vector_for_cost *epilogue_cost_vec)
2708 {
2709   int retval = 0;
2710   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2711
2712   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2713     {
2714       *peel_iters_epilogue = vf/2;
2715       if (dump_enabled_p ())
2716         dump_printf_loc (MSG_NOTE, vect_location,
2717                          "cost model: epilogue peel iters set to vf/2 "
2718                          "because loop iterations are unknown .\n");
2719
2720       /* If peeled iterations are known but number of scalar loop
2721          iterations are unknown, count a taken branch per peeled loop.  */
2722       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2723                                  NULL, 0, vect_prologue);
2724       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2725                                  NULL, 0, vect_epilogue);
2726     }
2727   else
2728     {
2729       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2730       peel_iters_prologue = niters < peel_iters_prologue ?
2731                             niters : peel_iters_prologue;
2732       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2733       /* If we need to peel for gaps, but no peeling is required, we have to
2734          peel VF iterations.  */
2735       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2736         *peel_iters_epilogue = vf;
2737     }
2738
2739   stmt_info_for_cost *si;
2740   int j;
2741   if (peel_iters_prologue)
2742     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2743       retval += record_stmt_cost (prologue_cost_vec,
2744                                   si->count * peel_iters_prologue,
2745                                   si->kind, NULL, si->misalign,
2746                                   vect_prologue);
2747   if (*peel_iters_epilogue)
2748     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2749       retval += record_stmt_cost (epilogue_cost_vec,
2750                                   si->count * *peel_iters_epilogue,
2751                                   si->kind, NULL, si->misalign,
2752                                   vect_epilogue);
2753
2754   return retval;
2755 }
2756
2757 /* Function vect_estimate_min_profitable_iters
2758
2759    Return the number of iterations required for the vector version of the
2760    loop to be profitable relative to the cost of the scalar version of the
2761    loop.  */
2762
2763 static void
2764 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2765                                     int *ret_min_profitable_niters,
2766                                     int *ret_min_profitable_estimate)
2767 {
2768   int min_profitable_iters;
2769   int min_profitable_estimate;
2770   int peel_iters_prologue;
2771   int peel_iters_epilogue;
2772   unsigned vec_inside_cost = 0;
2773   int vec_outside_cost = 0;
2774   unsigned vec_prologue_cost = 0;
2775   unsigned vec_epilogue_cost = 0;
2776   int scalar_single_iter_cost = 0;
2777   int scalar_outside_cost = 0;
2778   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2779   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2780   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2781
2782   /* Cost model disabled.  */
2783   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2784     {
2785       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2786       *ret_min_profitable_niters = 0;
2787       *ret_min_profitable_estimate = 0;
2788       return;
2789     }
2790
2791   /* Requires loop versioning tests to handle misalignment.  */
2792   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2793     {
2794       /*  FIXME: Make cost depend on complexity of individual check.  */
2795       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2796       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2797                             vect_prologue);
2798       dump_printf (MSG_NOTE,
2799                    "cost model: Adding cost of checks for loop "
2800                    "versioning to treat misalignment.\n");
2801     }
2802
2803   /* Requires loop versioning with alias checks.  */
2804   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2805     {
2806       /*  FIXME: Make cost depend on complexity of individual check.  */
2807       unsigned len = LOOP_VINFO_COMP_ALIAS_DDRS (loop_vinfo).length ();
2808       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2809                             vect_prologue);
2810       dump_printf (MSG_NOTE,
2811                    "cost model: Adding cost of checks for loop "
2812                    "versioning aliasing.\n");
2813     }
2814
2815   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2816       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2817     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2818                           vect_prologue);
2819
2820   /* Count statements in scalar loop.  Using this as scalar cost for a single
2821      iteration for now.
2822
2823      TODO: Add outer loop support.
2824
2825      TODO: Consider assigning different costs to different scalar
2826      statements.  */
2827
2828   scalar_single_iter_cost
2829     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
2830
2831   /* Add additional cost for the peeled instructions in prologue and epilogue
2832      loop.
2833
2834      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2835      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2836
2837      TODO: Build an expression that represents peel_iters for prologue and
2838      epilogue to be used in a run-time test.  */
2839
2840   if (npeel  < 0)
2841     {
2842       peel_iters_prologue = vf/2;
2843       dump_printf (MSG_NOTE, "cost model: "
2844                    "prologue peel iters set to vf/2.\n");
2845
2846       /* If peeling for alignment is unknown, loop bound of main loop becomes
2847          unknown.  */
2848       peel_iters_epilogue = vf/2;
2849       dump_printf (MSG_NOTE, "cost model: "
2850                    "epilogue peel iters set to vf/2 because "
2851                    "peeling for alignment is unknown.\n");
2852
2853       /* If peeled iterations are unknown, count a taken branch and a not taken
2854          branch per peeled loop. Even if scalar loop iterations are known,
2855          vector iterations are not known since peeled prologue iterations are
2856          not known. Hence guards remain the same.  */
2857       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2858                             NULL, 0, vect_prologue);
2859       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2860                             NULL, 0, vect_prologue);
2861       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2862                             NULL, 0, vect_epilogue);
2863       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2864                             NULL, 0, vect_epilogue);
2865       stmt_info_for_cost *si;
2866       int j;
2867       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
2868         {
2869           struct _stmt_vec_info *stmt_info
2870             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2871           (void) add_stmt_cost (target_cost_data,
2872                                 si->count * peel_iters_prologue,
2873                                 si->kind, stmt_info, si->misalign,
2874                                 vect_prologue);
2875           (void) add_stmt_cost (target_cost_data,
2876                                 si->count * peel_iters_epilogue,
2877                                 si->kind, stmt_info, si->misalign,
2878                                 vect_epilogue);
2879         }
2880     }
2881   else
2882     {
2883       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2884       stmt_info_for_cost *si;
2885       int j;
2886       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2887
2888       prologue_cost_vec.create (2);
2889       epilogue_cost_vec.create (2);
2890       peel_iters_prologue = npeel;
2891
2892       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2893                                           &peel_iters_epilogue,
2894                                           &LOOP_VINFO_SCALAR_ITERATION_COST
2895                                             (loop_vinfo),
2896                                           &prologue_cost_vec,
2897                                           &epilogue_cost_vec);
2898
2899       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2900         {
2901           struct _stmt_vec_info *stmt_info
2902             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2903           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2904                                 si->misalign, vect_prologue);
2905         }
2906
2907       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2908         {
2909           struct _stmt_vec_info *stmt_info
2910             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2911           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2912                                 si->misalign, vect_epilogue);
2913         }
2914
2915       prologue_cost_vec.release ();
2916       epilogue_cost_vec.release ();
2917     }
2918
2919   /* FORNOW: The scalar outside cost is incremented in one of the
2920      following ways:
2921
2922      1. The vectorizer checks for alignment and aliasing and generates
2923      a condition that allows dynamic vectorization.  A cost model
2924      check is ANDED with the versioning condition.  Hence scalar code
2925      path now has the added cost of the versioning check.
2926
2927        if (cost > th & versioning_check)
2928          jmp to vector code
2929
2930      Hence run-time scalar is incremented by not-taken branch cost.
2931
2932      2. The vectorizer then checks if a prologue is required.  If the
2933      cost model check was not done before during versioning, it has to
2934      be done before the prologue check.
2935
2936        if (cost <= th)
2937          prologue = scalar_iters
2938        if (prologue == 0)
2939          jmp to vector code
2940        else
2941          execute prologue
2942        if (prologue == num_iters)
2943          go to exit
2944
2945      Hence the run-time scalar cost is incremented by a taken branch,
2946      plus a not-taken branch, plus a taken branch cost.
2947
2948      3. The vectorizer then checks if an epilogue is required.  If the
2949      cost model check was not done before during prologue check, it
2950      has to be done with the epilogue check.
2951
2952        if (prologue == 0)
2953          jmp to vector code
2954        else
2955          execute prologue
2956        if (prologue == num_iters)
2957          go to exit
2958        vector code:
2959          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
2960            jmp to epilogue
2961
2962      Hence the run-time scalar cost should be incremented by 2 taken
2963      branches.
2964
2965      TODO: The back end may reorder the BBS's differently and reverse
2966      conditions/branch directions.  Change the estimates below to
2967      something more reasonable.  */
2968
2969   /* If the number of iterations is known and we do not do versioning, we can
2970      decide whether to vectorize at compile time.  Hence the scalar version
2971      do not carry cost model guard costs.  */
2972   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
2973       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2974       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2975     {
2976       /* Cost model check occurs at versioning.  */
2977       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2978           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2979         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
2980       else
2981         {
2982           /* Cost model check occurs at prologue generation.  */
2983           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
2984             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
2985               + vect_get_stmt_cost (cond_branch_not_taken);
2986           /* Cost model check occurs at epilogue generation.  */
2987           else
2988             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
2989         }
2990     }
2991
2992   /* Complete the target-specific cost calculations.  */
2993   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
2994                &vec_inside_cost, &vec_epilogue_cost);
2995
2996   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
2997
2998   if (dump_enabled_p ())
2999     {
3000       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3001       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3002                    vec_inside_cost);
3003       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3004                    vec_prologue_cost);
3005       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3006                    vec_epilogue_cost);
3007       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3008                    scalar_single_iter_cost);
3009       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3010                    scalar_outside_cost);
3011       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3012                    vec_outside_cost);
3013       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3014                    peel_iters_prologue);
3015       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3016                    peel_iters_epilogue);
3017     }
3018
3019   /* Calculate number of iterations required to make the vector version
3020      profitable, relative to the loop bodies only.  The following condition
3021      must hold true:
3022      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3023      where
3024      SIC = scalar iteration cost, VIC = vector iteration cost,
3025      VOC = vector outside cost, VF = vectorization factor,
3026      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3027      SOC = scalar outside cost for run time cost model check.  */
3028
3029   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3030     {
3031       if (vec_outside_cost <= 0)
3032         min_profitable_iters = 1;
3033       else
3034         {
3035           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3036                                   - vec_inside_cost * peel_iters_prologue
3037                                   - vec_inside_cost * peel_iters_epilogue)
3038                                  / ((scalar_single_iter_cost * vf)
3039                                     - vec_inside_cost);
3040
3041           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3042               <= (((int) vec_inside_cost * min_profitable_iters)
3043                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3044             min_profitable_iters++;
3045         }
3046     }
3047   /* vector version will never be profitable.  */
3048   else
3049     {
3050       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3051         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3052                     "did not happen for a simd loop");
3053
3054       if (dump_enabled_p ())
3055         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3056                          "cost model: the vector iteration cost = %d "
3057                          "divided by the scalar iteration cost = %d "
3058                          "is greater or equal to the vectorization factor = %d"
3059                          ".\n",
3060                          vec_inside_cost, scalar_single_iter_cost, vf);
3061       *ret_min_profitable_niters = -1;
3062       *ret_min_profitable_estimate = -1;
3063       return;
3064     }
3065
3066   dump_printf (MSG_NOTE,
3067                "  Calculated minimum iters for profitability: %d\n",
3068                min_profitable_iters);
3069
3070   min_profitable_iters =
3071         min_profitable_iters < vf ? vf : min_profitable_iters;
3072
3073   /* Because the condition we create is:
3074      if (niters <= min_profitable_iters)
3075        then skip the vectorized loop.  */
3076   min_profitable_iters--;
3077
3078   if (dump_enabled_p ())
3079     dump_printf_loc (MSG_NOTE, vect_location,
3080                      "  Runtime profitability threshold = %d\n",
3081                      min_profitable_iters);
3082
3083   *ret_min_profitable_niters = min_profitable_iters;
3084
3085   /* Calculate number of iterations required to make the vector version
3086      profitable, relative to the loop bodies only.
3087
3088      Non-vectorized variant is SIC * niters and it must win over vector
3089      variant on the expected loop trip count.  The following condition must hold true:
3090      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3091
3092   if (vec_outside_cost <= 0)
3093     min_profitable_estimate = 1;
3094   else
3095     {
3096       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3097                                  - vec_inside_cost * peel_iters_prologue
3098                                  - vec_inside_cost * peel_iters_epilogue)
3099                                  / ((scalar_single_iter_cost * vf)
3100                                    - vec_inside_cost);
3101     }
3102   min_profitable_estimate --;
3103   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3104   if (dump_enabled_p ())
3105     dump_printf_loc (MSG_NOTE, vect_location,
3106                      "  Static estimate profitability threshold = %d\n",
3107                       min_profitable_iters);
3108
3109   *ret_min_profitable_estimate = min_profitable_estimate;
3110 }
3111
3112 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3113    vector elements (not bits) for a vector of mode MODE.  */
3114 static void
3115 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3116                               unsigned char *sel)
3117 {
3118   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3119
3120   for (i = 0; i < nelt; i++)
3121     sel[i] = (i + offset) & (2*nelt - 1);
3122 }
3123
3124 /* Checks whether the target supports whole-vector shifts for vectors of mode
3125    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3126    it supports vec_perm_const with masks for all necessary shift amounts.  */
3127 static bool
3128 have_whole_vector_shift (enum machine_mode mode)
3129 {
3130   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3131     return true;
3132
3133   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3134     return false;
3135
3136   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3137   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3138
3139   for (i = nelt/2; i >= 1; i/=2)
3140     {
3141       calc_vec_perm_mask_for_shift (mode, i, sel);
3142       if (!can_vec_perm_p (mode, false, sel))
3143         return false;
3144     }
3145   return true;
3146 }
3147
3148 /* Return the reduction operand (with index REDUC_INDEX) of STMT.  */
3149
3150 static tree
3151 get_reduction_op (gimple *stmt, int reduc_index)
3152 {
3153   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3154     {
3155     case GIMPLE_SINGLE_RHS:
3156       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3157                   == ternary_op);
3158       return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3159     case GIMPLE_UNARY_RHS:
3160       return gimple_assign_rhs1 (stmt);
3161     case GIMPLE_BINARY_RHS:
3162       return (reduc_index
3163               ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3164     case GIMPLE_TERNARY_RHS:
3165       return gimple_op (stmt, reduc_index + 1);
3166     default:
3167       gcc_unreachable ();
3168     }
3169 }
3170
3171 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3172    functions. Design better to avoid maintenance issues.  */
3173
3174 /* Function vect_model_reduction_cost.
3175
3176    Models cost for a reduction operation, including the vector ops
3177    generated within the strip-mine loop, the initial definition before
3178    the loop, and the epilogue code that must be generated.  */
3179
3180 static bool
3181 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3182                            int ncopies, int reduc_index)
3183 {
3184   int prologue_cost = 0, epilogue_cost = 0;
3185   enum tree_code code;
3186   optab optab;
3187   tree vectype;
3188   gimple *stmt, *orig_stmt;
3189   tree reduction_op;
3190   machine_mode mode;
3191   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3192   struct loop *loop = NULL;
3193   void *target_cost_data;
3194
3195   if (loop_vinfo)
3196     {
3197       loop = LOOP_VINFO_LOOP (loop_vinfo);
3198       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3199     }
3200   else
3201     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3202
3203   /* Cost of reduction op inside loop.  */
3204   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3205                                         stmt_info, 0, vect_body);
3206   stmt = STMT_VINFO_STMT (stmt_info);
3207
3208   reduction_op = get_reduction_op (stmt, reduc_index);
3209
3210   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3211   if (!vectype)
3212     {
3213       if (dump_enabled_p ())
3214         {
3215           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3216                            "unsupported data-type ");
3217           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3218                              TREE_TYPE (reduction_op));
3219           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3220         }
3221       return false;
3222    }
3223
3224   mode = TYPE_MODE (vectype);
3225   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3226
3227   if (!orig_stmt)
3228     orig_stmt = STMT_VINFO_STMT (stmt_info);
3229
3230   code = gimple_assign_rhs_code (orig_stmt);
3231
3232   /* Add in cost for initial definition.  */
3233   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3234                                   stmt_info, 0, vect_prologue);
3235
3236   /* Determine cost of epilogue code.
3237
3238      We have a reduction operator that will reduce the vector in one statement.
3239      Also requires scalar extract.  */
3240
3241   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3242     {
3243       if (reduc_code != ERROR_MARK)
3244         {
3245           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3246                                           stmt_info, 0, vect_epilogue);
3247           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3248                                           stmt_info, 0, vect_epilogue);
3249         }
3250       else
3251         {
3252           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3253           tree bitsize =
3254             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3255           int element_bitsize = tree_to_uhwi (bitsize);
3256           int nelements = vec_size_in_bits / element_bitsize;
3257
3258           optab = optab_for_tree_code (code, vectype, optab_default);
3259
3260           /* We have a whole vector shift available.  */
3261           if (VECTOR_MODE_P (mode)
3262               && optab_handler (optab, mode) != CODE_FOR_nothing
3263               && have_whole_vector_shift (mode))
3264             {
3265               /* Final reduction via vector shifts and the reduction operator.
3266                  Also requires scalar extract.  */
3267               epilogue_cost += add_stmt_cost (target_cost_data,
3268                                               exact_log2 (nelements) * 2,
3269                                               vector_stmt, stmt_info, 0,
3270                                               vect_epilogue);
3271               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3272                                               vec_to_scalar, stmt_info, 0,
3273                                               vect_epilogue);
3274             }
3275           else
3276             /* Use extracts and reduction op for final reduction.  For N
3277                elements, we have N extracts and N-1 reduction ops.  */
3278             epilogue_cost += add_stmt_cost (target_cost_data,
3279                                             nelements + nelements - 1,
3280                                             vector_stmt, stmt_info, 0,
3281                                             vect_epilogue);
3282         }
3283     }
3284
3285   if (dump_enabled_p ())
3286     dump_printf (MSG_NOTE,
3287                  "vect_model_reduction_cost: inside_cost = %d, "
3288                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3289                  prologue_cost, epilogue_cost);
3290
3291   return true;
3292 }
3293
3294
3295 /* Function vect_model_induction_cost.
3296
3297    Models cost for induction operations.  */
3298
3299 static void
3300 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3301 {
3302   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3303   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3304   unsigned inside_cost, prologue_cost;
3305
3306   /* loop cost for vec_loop.  */
3307   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3308                                stmt_info, 0, vect_body);
3309
3310   /* prologue cost for vec_init and vec_step.  */
3311   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3312                                  stmt_info, 0, vect_prologue);
3313
3314   if (dump_enabled_p ())
3315     dump_printf_loc (MSG_NOTE, vect_location,
3316                      "vect_model_induction_cost: inside_cost = %d, "
3317                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3318 }
3319
3320
3321 /* Function get_initial_def_for_induction
3322
3323    Input:
3324    STMT - a stmt that performs an induction operation in the loop.
3325    IV_PHI - the initial value of the induction variable
3326
3327    Output:
3328    Return a vector variable, initialized with the first VF values of
3329    the induction variable.  E.g., for an iv with IV_PHI='X' and
3330    evolution S, for a vector of 4 units, we want to return:
3331    [X, X + S, X + 2*S, X + 3*S].  */
3332
3333 static tree
3334 get_initial_def_for_induction (gimple *iv_phi)
3335 {
3336   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3337   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3338   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3339   tree vectype;
3340   int nunits;
3341   edge pe = loop_preheader_edge (loop);
3342   struct loop *iv_loop;
3343   basic_block new_bb;
3344   tree new_vec, vec_init, vec_step, t;
3345   tree new_name;
3346   gimple *new_stmt;
3347   gphi *induction_phi;
3348   tree induc_def, vec_def, vec_dest;
3349   tree init_expr, step_expr;
3350   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3351   int i;
3352   int ncopies;
3353   tree expr;
3354   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3355   bool nested_in_vect_loop = false;
3356   gimple_seq stmts;
3357   imm_use_iterator imm_iter;
3358   use_operand_p use_p;
3359   gimple *exit_phi;
3360   edge latch_e;
3361   tree loop_arg;
3362   gimple_stmt_iterator si;
3363   basic_block bb = gimple_bb (iv_phi);
3364   tree stepvectype;
3365   tree resvectype;
3366
3367   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3368   if (nested_in_vect_loop_p (loop, iv_phi))
3369     {
3370       nested_in_vect_loop = true;
3371       iv_loop = loop->inner;
3372     }
3373   else
3374     iv_loop = loop;
3375   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3376
3377   latch_e = loop_latch_edge (iv_loop);
3378   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3379
3380   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3381   gcc_assert (step_expr != NULL_TREE);
3382
3383   pe = loop_preheader_edge (iv_loop);
3384   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3385                                      loop_preheader_edge (iv_loop));
3386
3387   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3388   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3389   gcc_assert (vectype);
3390   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3391   ncopies = vf / nunits;
3392
3393   gcc_assert (phi_info);
3394   gcc_assert (ncopies >= 1);
3395
3396   /* Convert the step to the desired type.  */
3397   stmts = NULL;
3398   step_expr = gimple_convert (&stmts, TREE_TYPE (vectype), step_expr);
3399   if (stmts)
3400     {
3401       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3402       gcc_assert (!new_bb);
3403     }
3404
3405   /* Find the first insertion point in the BB.  */
3406   si = gsi_after_labels (bb);
3407
3408   /* Create the vector that holds the initial_value of the induction.  */
3409   if (nested_in_vect_loop)
3410     {
3411       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3412          been created during vectorization of previous stmts.  We obtain it
3413          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3414       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi);
3415       /* If the initial value is not of proper type, convert it.  */
3416       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3417         {
3418           new_stmt
3419             = gimple_build_assign (vect_get_new_ssa_name (vectype,
3420                                                           vect_simple_var,
3421                                                           "vec_iv_"),
3422                                    VIEW_CONVERT_EXPR,
3423                                    build1 (VIEW_CONVERT_EXPR, vectype,
3424                                            vec_init));
3425           vec_init = gimple_assign_lhs (new_stmt);
3426           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3427                                                  new_stmt);
3428           gcc_assert (!new_bb);
3429           set_vinfo_for_stmt (new_stmt,
3430                               new_stmt_vec_info (new_stmt, loop_vinfo));
3431         }
3432     }
3433   else
3434     {
3435       vec<constructor_elt, va_gc> *v;
3436
3437       /* iv_loop is the loop to be vectorized. Create:
3438          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3439       stmts = NULL;
3440       new_name = gimple_convert (&stmts, TREE_TYPE (vectype), init_expr);
3441
3442       vec_alloc (v, nunits);
3443       bool constant_p = is_gimple_min_invariant (new_name);
3444       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3445       for (i = 1; i < nunits; i++)
3446         {
3447           /* Create: new_name_i = new_name + step_expr  */
3448           new_name = gimple_build (&stmts, PLUS_EXPR, TREE_TYPE (new_name),
3449                                    new_name, step_expr);
3450           if (!is_gimple_min_invariant (new_name))
3451             constant_p = false;
3452           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3453         }
3454       if (stmts)
3455         {
3456           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3457           gcc_assert (!new_bb);
3458         }
3459
3460       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3461       if (constant_p)
3462         new_vec = build_vector_from_ctor (vectype, v);
3463       else
3464         new_vec = build_constructor (vectype, v);
3465       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3466     }
3467
3468
3469   /* Create the vector that holds the step of the induction.  */
3470   if (nested_in_vect_loop)
3471     /* iv_loop is nested in the loop to be vectorized. Generate:
3472        vec_step = [S, S, S, S]  */
3473     new_name = step_expr;
3474   else
3475     {
3476       /* iv_loop is the loop to be vectorized. Generate:
3477           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3478       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3479         {
3480           expr = build_int_cst (integer_type_node, vf);
3481           expr = fold_convert (TREE_TYPE (step_expr), expr);
3482         }
3483       else
3484         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3485       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3486                               expr, step_expr);
3487       if (TREE_CODE (step_expr) == SSA_NAME)
3488         new_name = vect_init_vector (iv_phi, new_name,
3489                                      TREE_TYPE (step_expr), NULL);
3490     }
3491
3492   t = unshare_expr (new_name);
3493   gcc_assert (CONSTANT_CLASS_P (new_name)
3494               || TREE_CODE (new_name) == SSA_NAME);
3495   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3496   gcc_assert (stepvectype);
3497   new_vec = build_vector_from_val (stepvectype, t);
3498   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3499
3500
3501   /* Create the following def-use cycle:
3502      loop prolog:
3503          vec_init = ...
3504          vec_step = ...
3505      loop:
3506          vec_iv = PHI <vec_init, vec_loop>
3507          ...
3508          STMT
3509          ...
3510          vec_loop = vec_iv + vec_step;  */
3511
3512   /* Create the induction-phi that defines the induction-operand.  */
3513   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3514   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3515   set_vinfo_for_stmt (induction_phi,
3516                       new_stmt_vec_info (induction_phi, loop_vinfo));
3517   induc_def = PHI_RESULT (induction_phi);
3518
3519   /* Create the iv update inside the loop  */
3520   new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3521   vec_def = make_ssa_name (vec_dest, new_stmt);
3522   gimple_assign_set_lhs (new_stmt, vec_def);
3523   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3524   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo));
3525
3526   /* Set the arguments of the phi node:  */
3527   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3528   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3529                UNKNOWN_LOCATION);
3530
3531
3532   /* In case that vectorization factor (VF) is bigger than the number
3533      of elements that we can fit in a vectype (nunits), we have to generate
3534      more than one vector stmt - i.e - we need to "unroll" the
3535      vector stmt by a factor VF/nunits.  For more details see documentation
3536      in vectorizable_operation.  */
3537
3538   if (ncopies > 1)
3539     {
3540       stmt_vec_info prev_stmt_vinfo;
3541       /* FORNOW. This restriction should be relaxed.  */
3542       gcc_assert (!nested_in_vect_loop);
3543
3544       /* Create the vector that holds the step of the induction.  */
3545       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3546         {
3547           expr = build_int_cst (integer_type_node, nunits);
3548           expr = fold_convert (TREE_TYPE (step_expr), expr);
3549         }
3550       else
3551         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3552       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3553                               expr, step_expr);
3554       if (TREE_CODE (step_expr) == SSA_NAME)
3555         new_name = vect_init_vector (iv_phi, new_name,
3556                                      TREE_TYPE (step_expr), NULL);
3557       t = unshare_expr (new_name);
3558       gcc_assert (CONSTANT_CLASS_P (new_name)
3559                   || TREE_CODE (new_name) == SSA_NAME);
3560       new_vec = build_vector_from_val (stepvectype, t);
3561       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3562
3563       vec_def = induc_def;
3564       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3565       for (i = 1; i < ncopies; i++)
3566         {
3567           /* vec_i = vec_prev + vec_step  */
3568           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3569                                           vec_def, vec_step);
3570           vec_def = make_ssa_name (vec_dest, new_stmt);
3571           gimple_assign_set_lhs (new_stmt, vec_def);
3572
3573           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3574           if (!useless_type_conversion_p (resvectype, vectype))
3575             {
3576               new_stmt
3577                 = gimple_build_assign
3578                         (vect_get_new_vect_var (resvectype, vect_simple_var,
3579                                                 "vec_iv_"),
3580                          VIEW_CONVERT_EXPR,
3581                          build1 (VIEW_CONVERT_EXPR, resvectype,
3582                                  gimple_assign_lhs (new_stmt)));
3583               gimple_assign_set_lhs (new_stmt,
3584                                      make_ssa_name
3585                                        (gimple_assign_lhs (new_stmt), new_stmt));
3586               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3587             }
3588           set_vinfo_for_stmt (new_stmt,
3589                               new_stmt_vec_info (new_stmt, loop_vinfo));
3590           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3591           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3592         }
3593     }
3594
3595   if (nested_in_vect_loop)
3596     {
3597       /* Find the loop-closed exit-phi of the induction, and record
3598          the final vector of induction results:  */
3599       exit_phi = NULL;
3600       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3601         {
3602           gimple *use_stmt = USE_STMT (use_p);
3603           if (is_gimple_debug (use_stmt))
3604             continue;
3605
3606           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3607             {
3608               exit_phi = use_stmt;
3609               break;
3610             }
3611         }
3612       if (exit_phi)
3613         {
3614           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3615           /* FORNOW. Currently not supporting the case that an inner-loop induction
3616              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3617           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3618                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3619
3620           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3621           if (dump_enabled_p ())
3622             {
3623               dump_printf_loc (MSG_NOTE, vect_location,
3624                                "vector of inductions after inner-loop:");
3625               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3626               dump_printf (MSG_NOTE, "\n");
3627             }
3628         }
3629     }
3630
3631
3632   if (dump_enabled_p ())
3633     {
3634       dump_printf_loc (MSG_NOTE, vect_location,
3635                        "transform induction: created def-use cycle: ");
3636       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3637       dump_printf (MSG_NOTE, "\n");
3638       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3639                         SSA_NAME_DEF_STMT (vec_def), 0);
3640       dump_printf (MSG_NOTE, "\n");
3641     }
3642
3643   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3644   if (!useless_type_conversion_p (resvectype, vectype))
3645     {
3646       new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3647                                                              vect_simple_var,
3648                                                              "vec_iv_"),
3649                                       VIEW_CONVERT_EXPR,
3650                                       build1 (VIEW_CONVERT_EXPR, resvectype,
3651                                               induc_def));
3652       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3653       gimple_assign_set_lhs (new_stmt, induc_def);
3654       si = gsi_after_labels (bb);
3655       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3656       set_vinfo_for_stmt (new_stmt,
3657                           new_stmt_vec_info (new_stmt, loop_vinfo));
3658       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3659         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3660     }
3661
3662   return induc_def;
3663 }
3664
3665
3666 /* Function get_initial_def_for_reduction
3667
3668    Input:
3669    STMT - a stmt that performs a reduction operation in the loop.
3670    INIT_VAL - the initial value of the reduction variable
3671
3672    Output:
3673    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3674         of the reduction (used for adjusting the epilog - see below).
3675    Return a vector variable, initialized according to the operation that STMT
3676         performs. This vector will be used as the initial value of the
3677         vector of partial results.
3678
3679    Option1 (adjust in epilog): Initialize the vector as follows:
3680      add/bit or/xor:    [0,0,...,0,0]
3681      mult/bit and:      [1,1,...,1,1]
3682      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3683    and when necessary (e.g. add/mult case) let the caller know
3684    that it needs to adjust the result by init_val.
3685
3686    Option2: Initialize the vector as follows:
3687      add/bit or/xor:    [init_val,0,0,...,0]
3688      mult/bit and:      [init_val,1,1,...,1]
3689      min/max/cond_expr: [init_val,init_val,...,init_val]
3690    and no adjustments are needed.
3691
3692    For example, for the following code:
3693
3694    s = init_val;
3695    for (i=0;i<n;i++)
3696      s = s + a[i];
3697
3698    STMT is 's = s + a[i]', and the reduction variable is 's'.
3699    For a vector of 4 units, we want to return either [0,0,0,init_val],
3700    or [0,0,0,0] and let the caller know that it needs to adjust
3701    the result at the end by 'init_val'.
3702
3703    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3704    initialization vector is simpler (same element in all entries), if
3705    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3706
3707    A cost model should help decide between these two schemes.  */
3708
3709 tree
3710 get_initial_def_for_reduction (gimple *stmt, tree init_val,
3711                                tree *adjustment_def)
3712 {
3713   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3714   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3715   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3716   tree scalar_type = TREE_TYPE (init_val);
3717   tree vectype = get_vectype_for_scalar_type (scalar_type);
3718   int nunits;
3719   enum tree_code code = gimple_assign_rhs_code (stmt);
3720   tree def_for_init;
3721   tree init_def;
3722   tree *elts;
3723   int i;
3724   bool nested_in_vect_loop = false;
3725   tree init_value;
3726   REAL_VALUE_TYPE real_init_val = dconst0;
3727   int int_init_val = 0;
3728   gimple *def_stmt = NULL;
3729
3730   gcc_assert (vectype);
3731   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3732
3733   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3734               || SCALAR_FLOAT_TYPE_P (scalar_type));
3735
3736   if (nested_in_vect_loop_p (loop, stmt))
3737     nested_in_vect_loop = true;
3738   else
3739     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3740
3741   /* In case of double reduction we only create a vector variable to be put
3742      in the reduction phi node.  The actual statement creation is done in
3743      vect_create_epilog_for_reduction.  */
3744   if (adjustment_def && nested_in_vect_loop
3745       && TREE_CODE (init_val) == SSA_NAME
3746       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3747       && gimple_code (def_stmt) == GIMPLE_PHI
3748       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3749       && vinfo_for_stmt (def_stmt)
3750       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3751           == vect_double_reduction_def)
3752     {
3753       *adjustment_def = NULL;
3754       return vect_create_destination_var (init_val, vectype);
3755     }
3756
3757   if (TREE_CONSTANT (init_val))
3758     {
3759       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3760         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3761       else
3762         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3763     }
3764   else
3765     init_value = init_val;
3766
3767   switch (code)
3768     {
3769       case WIDEN_SUM_EXPR:
3770       case DOT_PROD_EXPR:
3771       case SAD_EXPR:
3772       case PLUS_EXPR:
3773       case MINUS_EXPR:
3774       case BIT_IOR_EXPR:
3775       case BIT_XOR_EXPR:
3776       case MULT_EXPR:
3777       case BIT_AND_EXPR:
3778         /* ADJUSMENT_DEF is NULL when called from
3779            vect_create_epilog_for_reduction to vectorize double reduction.  */
3780         if (adjustment_def)
3781           {
3782             if (nested_in_vect_loop)
3783               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt);
3784             else
3785               *adjustment_def = init_val;
3786           }
3787
3788         if (code == MULT_EXPR)
3789           {
3790             real_init_val = dconst1;
3791             int_init_val = 1;
3792           }
3793
3794         if (code == BIT_AND_EXPR)
3795           int_init_val = -1;
3796
3797         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3798           def_for_init = build_real (scalar_type, real_init_val);
3799         else
3800           def_for_init = build_int_cst (scalar_type, int_init_val);
3801
3802         /* Create a vector of '0' or '1' except the first element.  */
3803         elts = XALLOCAVEC (tree, nunits);
3804         for (i = nunits - 2; i >= 0; --i)
3805           elts[i + 1] = def_for_init;
3806
3807         /* Option1: the first element is '0' or '1' as well.  */
3808         if (adjustment_def)
3809           {
3810             elts[0] = def_for_init;
3811             init_def = build_vector (vectype, elts);
3812             break;
3813           }
3814
3815         /* Option2: the first element is INIT_VAL.  */
3816         elts[0] = init_val;
3817         if (TREE_CONSTANT (init_val))
3818           init_def = build_vector (vectype, elts);
3819         else
3820           {
3821             vec<constructor_elt, va_gc> *v;
3822             vec_alloc (v, nunits);
3823             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3824             for (i = 1; i < nunits; ++i)
3825               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3826             init_def = build_constructor (vectype, v);
3827           }
3828
3829         break;
3830
3831       case MIN_EXPR:
3832       case MAX_EXPR:
3833       case COND_EXPR:
3834         if (adjustment_def)
3835           {
3836             *adjustment_def = NULL_TREE;
3837             init_def = vect_get_vec_def_for_operand (init_val, stmt);
3838             break;
3839           }
3840
3841         init_def = build_vector_from_val (vectype, init_value);
3842         break;
3843
3844       default:
3845         gcc_unreachable ();
3846     }
3847
3848   return init_def;
3849 }
3850
3851 /* Function vect_create_epilog_for_reduction
3852
3853    Create code at the loop-epilog to finalize the result of a reduction
3854    computation.
3855
3856    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3857      reduction statements.
3858    STMT is the scalar reduction stmt that is being vectorized.
3859    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3860      number of elements that we can fit in a vectype (nunits).  In this case
3861      we have to generate more than one vector stmt - i.e - we need to "unroll"
3862      the vector stmt by a factor VF/nunits.  For more details see documentation
3863      in vectorizable_operation.
3864    REDUC_CODE is the tree-code for the epilog reduction.
3865    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3866      computation.
3867    REDUC_INDEX is the index of the operand in the right hand side of the
3868      statement that is defined by REDUCTION_PHI.
3869    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3870    SLP_NODE is an SLP node containing a group of reduction statements. The
3871      first one in this group is STMT.
3872
3873    This function:
3874    1. Creates the reduction def-use cycles: sets the arguments for
3875       REDUCTION_PHIS:
3876       The loop-entry argument is the vectorized initial-value of the reduction.
3877       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3878       sums.
3879    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3880       by applying the operation specified by REDUC_CODE if available, or by
3881       other means (whole-vector shifts or a scalar loop).
3882       The function also creates a new phi node at the loop exit to preserve
3883       loop-closed form, as illustrated below.
3884
3885      The flow at the entry to this function:
3886
3887         loop:
3888           vec_def = phi <null, null>            # REDUCTION_PHI
3889           VECT_DEF = vector_stmt                # vectorized form of STMT
3890           s_loop = scalar_stmt                  # (scalar) STMT
3891         loop_exit:
3892           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3893           use <s_out0>
3894           use <s_out0>
3895
3896      The above is transformed by this function into:
3897
3898         loop:
3899           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3900           VECT_DEF = vector_stmt                # vectorized form of STMT
3901           s_loop = scalar_stmt                  # (scalar) STMT
3902         loop_exit:
3903           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3904           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3905           v_out2 = reduce <v_out1>
3906           s_out3 = extract_field <v_out2, 0>
3907           s_out4 = adjust_result <s_out3>
3908           use <s_out4>
3909           use <s_out4>
3910 */
3911
3912 static void
3913 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple *stmt,
3914                                   int ncopies, enum tree_code reduc_code,
3915                                   vec<gimple *> reduction_phis,
3916                                   int reduc_index, bool double_reduc,
3917                                   slp_tree slp_node)
3918 {
3919   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
3920   stmt_vec_info prev_phi_info;
3921   tree vectype;
3922   machine_mode mode;
3923   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3924   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
3925   basic_block exit_bb;
3926   tree scalar_dest;
3927   tree scalar_type;
3928   gimple *new_phi = NULL, *phi;
3929   gimple_stmt_iterator exit_gsi;
3930   tree vec_dest;
3931   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
3932   gimple *epilog_stmt = NULL;
3933   enum tree_code code = gimple_assign_rhs_code (stmt);
3934   gimple *exit_phi;
3935   tree bitsize;
3936   tree adjustment_def = NULL;
3937   tree vec_initial_def = NULL;
3938   tree reduction_op, expr, def;
3939   tree orig_name, scalar_result;
3940   imm_use_iterator imm_iter, phi_imm_iter;
3941   use_operand_p use_p, phi_use_p;
3942   gimple *use_stmt, *orig_stmt, *reduction_phi = NULL;
3943   bool nested_in_vect_loop = false;
3944   auto_vec<gimple *> new_phis;
3945   auto_vec<gimple *> inner_phis;
3946   enum vect_def_type dt = vect_unknown_def_type;
3947   int j, i;
3948   auto_vec<tree> scalar_results;
3949   unsigned int group_size = 1, k, ratio;
3950   auto_vec<tree> vec_initial_defs;
3951   auto_vec<gimple *> phis;
3952   bool slp_reduc = false;
3953   tree new_phi_result;
3954   gimple *inner_phi = NULL;
3955
3956   if (slp_node)
3957     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
3958
3959   if (nested_in_vect_loop_p (loop, stmt))
3960     {
3961       outer_loop = loop;
3962       loop = loop->inner;
3963       nested_in_vect_loop = true;
3964       gcc_assert (!slp_node);
3965     }
3966
3967   reduction_op = get_reduction_op (stmt, reduc_index);
3968
3969   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3970   gcc_assert (vectype);
3971   mode = TYPE_MODE (vectype);
3972
3973   /* 1. Create the reduction def-use cycle:
3974      Set the arguments of REDUCTION_PHIS, i.e., transform
3975
3976         loop:
3977           vec_def = phi <null, null>            # REDUCTION_PHI
3978           VECT_DEF = vector_stmt                # vectorized form of STMT
3979           ...
3980
3981      into:
3982
3983         loop:
3984           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3985           VECT_DEF = vector_stmt                # vectorized form of STMT
3986           ...
3987
3988      (in case of SLP, do it for all the phis). */
3989
3990   /* Get the loop-entry arguments.  */
3991   if (slp_node)
3992     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
3993                        NULL, slp_node, reduc_index);
3994   else
3995     {
3996       /* Get at the scalar def before the loop, that defines the initial value
3997          of the reduction variable.  */
3998       gimple *def_stmt = SSA_NAME_DEF_STMT (reduction_op);
3999       tree op = PHI_ARG_DEF_FROM_EDGE (def_stmt, loop_preheader_edge (loop));
4000       vec_initial_defs.create (1);
4001       vec_initial_def = get_initial_def_for_reduction (stmt, op,
4002                                                        &adjustment_def);
4003       vec_initial_defs.quick_push (vec_initial_def);
4004     }
4005
4006   /* Set phi nodes arguments.  */
4007   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4008     {
4009       tree vec_init_def, def;
4010       gimple_seq stmts;
4011       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4012                                            true, NULL_TREE);
4013       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4014       def = vect_defs[i];
4015       for (j = 0; j < ncopies; j++)
4016         {
4017           /* Set the loop-entry arg of the reduction-phi.  */
4018           add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4019                        loop_preheader_edge (loop), UNKNOWN_LOCATION);
4020
4021           /* Set the loop-latch arg for the reduction-phi.  */
4022           if (j > 0)
4023             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4024
4025           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4026                        UNKNOWN_LOCATION);
4027
4028           if (dump_enabled_p ())
4029             {
4030               dump_printf_loc (MSG_NOTE, vect_location,
4031                                "transform reduction: created def-use cycle: ");
4032               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4033               dump_printf (MSG_NOTE, "\n");
4034               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4035               dump_printf (MSG_NOTE, "\n");
4036             }
4037
4038           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4039         }
4040     }
4041
4042   /* 2. Create epilog code.
4043         The reduction epilog code operates across the elements of the vector
4044         of partial results computed by the vectorized loop.
4045         The reduction epilog code consists of:
4046
4047         step 1: compute the scalar result in a vector (v_out2)
4048         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4049         step 3: adjust the scalar result (s_out3) if needed.
4050
4051         Step 1 can be accomplished using one the following three schemes:
4052           (scheme 1) using reduc_code, if available.
4053           (scheme 2) using whole-vector shifts, if available.
4054           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4055                      combined.
4056
4057           The overall epilog code looks like this:
4058
4059           s_out0 = phi <s_loop>         # original EXIT_PHI
4060           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4061           v_out2 = reduce <v_out1>              # step 1
4062           s_out3 = extract_field <v_out2, 0>    # step 2
4063           s_out4 = adjust_result <s_out3>       # step 3
4064
4065           (step 3 is optional, and steps 1 and 2 may be combined).
4066           Lastly, the uses of s_out0 are replaced by s_out4.  */
4067
4068
4069   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4070          v_out1 = phi <VECT_DEF>
4071          Store them in NEW_PHIS.  */
4072
4073   exit_bb = single_exit (loop)->dest;
4074   prev_phi_info = NULL;
4075   new_phis.create (vect_defs.length ());
4076   FOR_EACH_VEC_ELT (vect_defs, i, def)
4077     {
4078       for (j = 0; j < ncopies; j++)
4079         {
4080           tree new_def = copy_ssa_name (def);
4081           phi = create_phi_node (new_def, exit_bb);
4082           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo));
4083           if (j == 0)
4084             new_phis.quick_push (phi);
4085           else
4086             {
4087               def = vect_get_vec_def_for_stmt_copy (dt, def);
4088               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4089             }
4090
4091           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4092           prev_phi_info = vinfo_for_stmt (phi);
4093         }
4094     }
4095
4096   /* The epilogue is created for the outer-loop, i.e., for the loop being
4097      vectorized.  Create exit phis for the outer loop.  */
4098   if (double_reduc)
4099     {
4100       loop = outer_loop;
4101       exit_bb = single_exit (loop)->dest;
4102       inner_phis.create (vect_defs.length ());
4103       FOR_EACH_VEC_ELT (new_phis, i, phi)
4104         {
4105           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4106           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4107           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4108                            PHI_RESULT (phi));
4109           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4110                                                             loop_vinfo));
4111           inner_phis.quick_push (phi);
4112           new_phis[i] = outer_phi;
4113           prev_phi_info = vinfo_for_stmt (outer_phi);
4114           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4115             {
4116               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4117               new_result = copy_ssa_name (PHI_RESULT (phi));
4118               outer_phi = create_phi_node (new_result, exit_bb);
4119               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4120                                PHI_RESULT (phi));
4121               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4122                                                                 loop_vinfo));
4123               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4124               prev_phi_info = vinfo_for_stmt (outer_phi);
4125             }
4126         }
4127     }
4128
4129   exit_gsi = gsi_after_labels (exit_bb);
4130
4131   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4132          (i.e. when reduc_code is not available) and in the final adjustment
4133          code (if needed).  Also get the original scalar reduction variable as
4134          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4135          represents a reduction pattern), the tree-code and scalar-def are
4136          taken from the original stmt that the pattern-stmt (STMT) replaces.
4137          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4138          are taken from STMT.  */
4139
4140   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4141   if (!orig_stmt)
4142     {
4143       /* Regular reduction  */
4144       orig_stmt = stmt;
4145     }
4146   else
4147     {
4148       /* Reduction pattern  */
4149       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4150       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4151       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4152     }
4153
4154   code = gimple_assign_rhs_code (orig_stmt);
4155   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4156      partial results are added and not subtracted.  */
4157   if (code == MINUS_EXPR)
4158     code = PLUS_EXPR;
4159
4160   scalar_dest = gimple_assign_lhs (orig_stmt);
4161   scalar_type = TREE_TYPE (scalar_dest);
4162   scalar_results.create (group_size);
4163   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4164   bitsize = TYPE_SIZE (scalar_type);
4165
4166   /* In case this is a reduction in an inner-loop while vectorizing an outer
4167      loop - we don't need to extract a single scalar result at the end of the
4168      inner-loop (unless it is double reduction, i.e., the use of reduction is
4169      outside the outer-loop).  The final vector of partial results will be used
4170      in the vectorized outer-loop, or reduced to a scalar result at the end of
4171      the outer-loop.  */
4172   if (nested_in_vect_loop && !double_reduc)
4173     goto vect_finalize_reduction;
4174
4175   /* SLP reduction without reduction chain, e.g.,
4176      # a1 = phi <a2, a0>
4177      # b1 = phi <b2, b0>
4178      a2 = operation (a1)
4179      b2 = operation (b1)  */
4180   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4181
4182   /* In case of reduction chain, e.g.,
4183      # a1 = phi <a3, a0>
4184      a2 = operation (a1)
4185      a3 = operation (a2),
4186
4187      we may end up with more than one vector result.  Here we reduce them to
4188      one vector.  */
4189   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4190     {
4191       tree first_vect = PHI_RESULT (new_phis[0]);
4192       tree tmp;
4193       gassign *new_vec_stmt = NULL;
4194
4195       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4196       for (k = 1; k < new_phis.length (); k++)
4197         {
4198           gimple *next_phi = new_phis[k];
4199           tree second_vect = PHI_RESULT (next_phi);
4200
4201           tmp = build2 (code, vectype,  first_vect, second_vect);
4202           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4203           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4204           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4205           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4206         }
4207
4208       new_phi_result = first_vect;
4209       if (new_vec_stmt)
4210         {
4211           new_phis.truncate (0);
4212           new_phis.safe_push (new_vec_stmt);
4213         }
4214     }
4215   else
4216     new_phi_result = PHI_RESULT (new_phis[0]);
4217
4218   /* 2.3 Create the reduction code, using one of the three schemes described
4219          above. In SLP we simply need to extract all the elements from the
4220          vector (without reducing them), so we use scalar shifts.  */
4221   if (reduc_code != ERROR_MARK && !slp_reduc)
4222     {
4223       tree tmp;
4224       tree vec_elem_type;
4225
4226       /*** Case 1:  Create:
4227            v_out2 = reduc_expr <v_out1>  */
4228
4229       if (dump_enabled_p ())
4230         dump_printf_loc (MSG_NOTE, vect_location,
4231                          "Reduce using direct vector reduction.\n");
4232
4233       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4234       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4235         {
4236           tree tmp_dest =
4237               vect_create_destination_var (scalar_dest, vec_elem_type);
4238           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4239           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4240           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4241           gimple_assign_set_lhs (epilog_stmt, new_temp);
4242           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4243
4244           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4245         }
4246       else
4247         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4248       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4249       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4250       gimple_assign_set_lhs (epilog_stmt, new_temp);
4251       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4252       scalar_results.safe_push (new_temp);
4253     }
4254   else
4255     {
4256       bool reduce_with_shift = have_whole_vector_shift (mode);
4257       int element_bitsize = tree_to_uhwi (bitsize);
4258       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4259       tree vec_temp;
4260
4261       /* Regardless of whether we have a whole vector shift, if we're
4262          emulating the operation via tree-vect-generic, we don't want
4263          to use it.  Only the first round of the reduction is likely
4264          to still be profitable via emulation.  */
4265       /* ??? It might be better to emit a reduction tree code here, so that
4266          tree-vect-generic can expand the first round via bit tricks.  */
4267       if (!VECTOR_MODE_P (mode))
4268         reduce_with_shift = false;
4269       else
4270         {
4271           optab optab = optab_for_tree_code (code, vectype, optab_default);
4272           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4273             reduce_with_shift = false;
4274         }
4275
4276       if (reduce_with_shift && !slp_reduc)
4277         {
4278           int nelements = vec_size_in_bits / element_bitsize;
4279           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4280
4281           int elt_offset;
4282
4283           tree zero_vec = build_zero_cst (vectype);
4284           /*** Case 2: Create:
4285              for (offset = nelements/2; offset >= 1; offset/=2)
4286                 {
4287                   Create:  va' = vec_shift <va, offset>
4288                   Create:  va = vop <va, va'>
4289                 }  */
4290
4291           tree rhs;
4292
4293           if (dump_enabled_p ())
4294             dump_printf_loc (MSG_NOTE, vect_location,
4295                              "Reduce using vector shifts\n");
4296
4297           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4298           new_temp = new_phi_result;
4299           for (elt_offset = nelements / 2;
4300                elt_offset >= 1;
4301                elt_offset /= 2)
4302             {
4303               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4304               tree mask = vect_gen_perm_mask_any (vectype, sel);
4305               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4306                                                  new_temp, zero_vec, mask);
4307               new_name = make_ssa_name (vec_dest, epilog_stmt);
4308               gimple_assign_set_lhs (epilog_stmt, new_name);
4309               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4310
4311               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4312                                                  new_temp);
4313               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4314               gimple_assign_set_lhs (epilog_stmt, new_temp);
4315               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4316             }
4317
4318           /* 2.4  Extract the final scalar result.  Create:
4319              s_out3 = extract_field <v_out2, bitpos>  */
4320
4321           if (dump_enabled_p ())
4322             dump_printf_loc (MSG_NOTE, vect_location,
4323                              "extract scalar result\n");
4324
4325           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4326                         bitsize, bitsize_zero_node);
4327           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4328           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4329           gimple_assign_set_lhs (epilog_stmt, new_temp);
4330           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4331           scalar_results.safe_push (new_temp);
4332         }
4333       else
4334         {
4335           /*** Case 3: Create:
4336              s = extract_field <v_out2, 0>
4337              for (offset = element_size;
4338                   offset < vector_size;
4339                   offset += element_size;)
4340                {
4341                  Create:  s' = extract_field <v_out2, offset>
4342                  Create:  s = op <s, s'>  // For non SLP cases
4343                }  */
4344
4345           if (dump_enabled_p ())
4346             dump_printf_loc (MSG_NOTE, vect_location,
4347                              "Reduce using scalar code.\n");
4348
4349           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4350           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4351             {
4352               int bit_offset;
4353               if (gimple_code (new_phi) == GIMPLE_PHI)
4354                 vec_temp = PHI_RESULT (new_phi);
4355               else
4356                 vec_temp = gimple_assign_lhs (new_phi);
4357               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4358                             bitsize_zero_node);
4359               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4360               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4361               gimple_assign_set_lhs (epilog_stmt, new_temp);
4362               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4363
4364               /* In SLP we don't need to apply reduction operation, so we just
4365                  collect s' values in SCALAR_RESULTS.  */
4366               if (slp_reduc)
4367                 scalar_results.safe_push (new_temp);
4368
4369               for (bit_offset = element_bitsize;
4370                    bit_offset < vec_size_in_bits;
4371                    bit_offset += element_bitsize)
4372                 {
4373                   tree bitpos = bitsize_int (bit_offset);
4374                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4375                                      bitsize, bitpos);
4376
4377                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4378                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4379                   gimple_assign_set_lhs (epilog_stmt, new_name);
4380                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4381
4382                   if (slp_reduc)
4383                     {
4384                       /* In SLP we don't need to apply reduction operation, so
4385                          we just collect s' values in SCALAR_RESULTS.  */
4386                       new_temp = new_name;
4387                       scalar_results.safe_push (new_name);
4388                     }
4389                   else
4390                     {
4391                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4392                                                          new_name, new_temp);
4393                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4394                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4395                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4396                     }
4397                 }
4398             }
4399
4400           /* The only case where we need to reduce scalar results in SLP, is
4401              unrolling.  If the size of SCALAR_RESULTS is greater than
4402              GROUP_SIZE, we reduce them combining elements modulo
4403              GROUP_SIZE.  */
4404           if (slp_reduc)
4405             {
4406               tree res, first_res, new_res;
4407               gimple *new_stmt;
4408
4409               /* Reduce multiple scalar results in case of SLP unrolling.  */
4410               for (j = group_size; scalar_results.iterate (j, &res);
4411                    j++)
4412                 {
4413                   first_res = scalar_results[j % group_size];
4414                   new_stmt = gimple_build_assign (new_scalar_dest, code,
4415                                                   first_res, res);
4416                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4417                   gimple_assign_set_lhs (new_stmt, new_res);
4418                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4419                   scalar_results[j % group_size] = new_res;
4420                 }
4421             }
4422           else
4423             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4424             scalar_results.safe_push (new_temp);
4425         }
4426     }
4427
4428 vect_finalize_reduction:
4429
4430   if (double_reduc)
4431     loop = loop->inner;
4432
4433   /* 2.5 Adjust the final result by the initial value of the reduction
4434          variable. (When such adjustment is not needed, then
4435          'adjustment_def' is zero).  For example, if code is PLUS we create:
4436          new_temp = loop_exit_def + adjustment_def  */
4437
4438   if (adjustment_def)
4439     {
4440       gcc_assert (!slp_reduc);
4441       if (nested_in_vect_loop)
4442         {
4443           new_phi = new_phis[0];
4444           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4445           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4446           new_dest = vect_create_destination_var (scalar_dest, vectype);
4447         }
4448       else
4449         {
4450           new_temp = scalar_results[0];
4451           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4452           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4453           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4454         }
4455
4456       epilog_stmt = gimple_build_assign (new_dest, expr);
4457       new_temp = make_ssa_name (new_dest, epilog_stmt);
4458       gimple_assign_set_lhs (epilog_stmt, new_temp);
4459       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4460       if (nested_in_vect_loop)
4461         {
4462           set_vinfo_for_stmt (epilog_stmt,
4463                               new_stmt_vec_info (epilog_stmt, loop_vinfo));
4464           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4465                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4466
4467           if (!double_reduc)
4468             scalar_results.quick_push (new_temp);
4469           else
4470             scalar_results[0] = new_temp;
4471         }
4472       else
4473         scalar_results[0] = new_temp;
4474
4475       new_phis[0] = epilog_stmt;
4476     }
4477
4478   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4479           phis with new adjusted scalar results, i.e., replace use <s_out0>
4480           with use <s_out4>.
4481
4482      Transform:
4483         loop_exit:
4484           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4485           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4486           v_out2 = reduce <v_out1>
4487           s_out3 = extract_field <v_out2, 0>
4488           s_out4 = adjust_result <s_out3>
4489           use <s_out0>
4490           use <s_out0>
4491
4492      into:
4493
4494         loop_exit:
4495           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4496           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4497           v_out2 = reduce <v_out1>
4498           s_out3 = extract_field <v_out2, 0>
4499           s_out4 = adjust_result <s_out3>
4500           use <s_out4>
4501           use <s_out4> */
4502
4503
4504   /* In SLP reduction chain we reduce vector results into one vector if
4505      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4506      the last stmt in the reduction chain, since we are looking for the loop
4507      exit phi node.  */
4508   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4509     {
4510       gimple *dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
4511       /* Handle reduction patterns.  */
4512       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
4513         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
4514
4515       scalar_dest = gimple_assign_lhs (dest_stmt);
4516       group_size = 1;
4517     }
4518
4519   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4520      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4521      need to match SCALAR_RESULTS with corresponding statements.  The first
4522      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4523      the first vector stmt, etc.
4524      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4525   if (group_size > new_phis.length ())
4526     {
4527       ratio = group_size / new_phis.length ();
4528       gcc_assert (!(group_size % new_phis.length ()));
4529     }
4530   else
4531     ratio = 1;
4532
4533   for (k = 0; k < group_size; k++)
4534     {
4535       if (k % ratio == 0)
4536         {
4537           epilog_stmt = new_phis[k / ratio];
4538           reduction_phi = reduction_phis[k / ratio];
4539           if (double_reduc)
4540             inner_phi = inner_phis[k / ratio];
4541         }
4542
4543       if (slp_reduc)
4544         {
4545           gimple *current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4546
4547           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4548           /* SLP statements can't participate in patterns.  */
4549           gcc_assert (!orig_stmt);
4550           scalar_dest = gimple_assign_lhs (current_stmt);
4551         }
4552
4553       phis.create (3);
4554       /* Find the loop-closed-use at the loop exit of the original scalar
4555          result.  (The reduction result is expected to have two immediate uses -
4556          one at the latch block, and one at the loop exit).  */
4557       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4558         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4559             && !is_gimple_debug (USE_STMT (use_p)))
4560           phis.safe_push (USE_STMT (use_p));
4561
4562       /* While we expect to have found an exit_phi because of loop-closed-ssa
4563          form we can end up without one if the scalar cycle is dead.  */
4564
4565       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4566         {
4567           if (outer_loop)
4568             {
4569               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4570               gphi *vect_phi;
4571
4572               /* FORNOW. Currently not supporting the case that an inner-loop
4573                  reduction is not used in the outer-loop (but only outside the
4574                  outer-loop), unless it is double reduction.  */
4575               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4576                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4577                           || double_reduc);
4578
4579               if (double_reduc)
4580                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4581               else
4582                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4583               if (!double_reduc
4584                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4585                       != vect_double_reduction_def)
4586                 continue;
4587
4588               /* Handle double reduction:
4589
4590                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4591                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4592                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4593                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4594
4595                  At that point the regular reduction (stmt2 and stmt3) is
4596                  already vectorized, as well as the exit phi node, stmt4.
4597                  Here we vectorize the phi node of double reduction, stmt1, and
4598                  update all relevant statements.  */
4599
4600               /* Go through all the uses of s2 to find double reduction phi
4601                  node, i.e., stmt1 above.  */
4602               orig_name = PHI_RESULT (exit_phi);
4603               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4604                 {
4605                   stmt_vec_info use_stmt_vinfo;
4606                   stmt_vec_info new_phi_vinfo;
4607                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4608                   basic_block bb = gimple_bb (use_stmt);
4609                   gimple *use;
4610
4611                   /* Check that USE_STMT is really double reduction phi
4612                      node.  */
4613                   if (gimple_code (use_stmt) != GIMPLE_PHI
4614                       || gimple_phi_num_args (use_stmt) != 2
4615                       || bb->loop_father != outer_loop)
4616                     continue;
4617                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4618                   if (!use_stmt_vinfo
4619                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4620                           != vect_double_reduction_def)
4621                     continue;
4622
4623                   /* Create vector phi node for double reduction:
4624                      vs1 = phi <vs0, vs2>
4625                      vs1 was created previously in this function by a call to
4626                        vect_get_vec_def_for_operand and is stored in
4627                        vec_initial_def;
4628                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4629                      vs0 is created here.  */
4630
4631                   /* Create vector phi node.  */
4632                   vect_phi = create_phi_node (vec_initial_def, bb);
4633                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4634                                     loop_vec_info_for_loop (outer_loop));
4635                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4636
4637                   /* Create vs0 - initial def of the double reduction phi.  */
4638                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4639                                              loop_preheader_edge (outer_loop));
4640                   init_def = get_initial_def_for_reduction (stmt,
4641                                                           preheader_arg, NULL);
4642                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4643                                                     vectype, NULL);
4644
4645                   /* Update phi node arguments with vs0 and vs2.  */
4646                   add_phi_arg (vect_phi, vect_phi_init,
4647                                loop_preheader_edge (outer_loop),
4648                                UNKNOWN_LOCATION);
4649                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4650                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4651                   if (dump_enabled_p ())
4652                     {
4653                       dump_printf_loc (MSG_NOTE, vect_location,
4654                                        "created double reduction phi node: ");
4655                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4656                       dump_printf (MSG_NOTE, "\n");
4657                     }
4658
4659                   vect_phi_res = PHI_RESULT (vect_phi);
4660
4661                   /* Replace the use, i.e., set the correct vs1 in the regular
4662                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4663                      loop is redundant.  */
4664                   use = reduction_phi;
4665                   for (j = 0; j < ncopies; j++)
4666                     {
4667                       edge pr_edge = loop_preheader_edge (loop);
4668                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4669                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4670                     }
4671                 }
4672             }
4673         }
4674
4675       phis.release ();
4676       if (nested_in_vect_loop)
4677         {
4678           if (double_reduc)
4679             loop = outer_loop;
4680           else
4681             continue;
4682         }
4683
4684       phis.create (3);
4685       /* Find the loop-closed-use at the loop exit of the original scalar
4686          result.  (The reduction result is expected to have two immediate uses,
4687          one at the latch block, and one at the loop exit).  For double
4688          reductions we are looking for exit phis of the outer loop.  */
4689       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4690         {
4691           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4692             {
4693               if (!is_gimple_debug (USE_STMT (use_p)))
4694                 phis.safe_push (USE_STMT (use_p));
4695             }
4696           else
4697             {
4698               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4699                 {
4700                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4701
4702                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4703                     {
4704                       if (!flow_bb_inside_loop_p (loop,
4705                                              gimple_bb (USE_STMT (phi_use_p)))
4706                           && !is_gimple_debug (USE_STMT (phi_use_p)))
4707                         phis.safe_push (USE_STMT (phi_use_p));
4708                     }
4709                 }
4710             }
4711         }
4712
4713       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4714         {
4715           /* Replace the uses:  */
4716           orig_name = PHI_RESULT (exit_phi);
4717           scalar_result = scalar_results[k];
4718           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4719             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4720               SET_USE (use_p, scalar_result);
4721         }
4722
4723       phis.release ();
4724     }
4725 }
4726
4727
4728 /* Function vectorizable_reduction.
4729
4730    Check if STMT performs a reduction operation that can be vectorized.
4731    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4732    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4733    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4734
4735    This function also handles reduction idioms (patterns) that have been
4736    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4737    of this form:
4738      X = pattern_expr (arg0, arg1, ..., X)
4739    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4740    sequence that had been detected and replaced by the pattern-stmt (STMT).
4741
4742    In some cases of reduction patterns, the type of the reduction variable X is
4743    different than the type of the other arguments of STMT.
4744    In such cases, the vectype that is used when transforming STMT into a vector
4745    stmt is different than the vectype that is used to determine the
4746    vectorization factor, because it consists of a different number of elements
4747    than the actual number of elements that are being operated upon in parallel.
4748
4749    For example, consider an accumulation of shorts into an int accumulator.
4750    On some targets it's possible to vectorize this pattern operating on 8
4751    shorts at a time (hence, the vectype for purposes of determining the
4752    vectorization factor should be V8HI); on the other hand, the vectype that
4753    is used to create the vector form is actually V4SI (the type of the result).
4754
4755    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4756    indicates what is the actual level of parallelism (V8HI in the example), so
4757    that the right vectorization factor would be derived.  This vectype
4758    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4759    be used to create the vectorized stmt.  The right vectype for the vectorized
4760    stmt is obtained from the type of the result X:
4761         get_vectype_for_scalar_type (TREE_TYPE (X))
4762
4763    This means that, contrary to "regular" reductions (or "regular" stmts in
4764    general), the following equation:
4765       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4766    does *NOT* necessarily hold for reduction patterns.  */
4767
4768 bool
4769 vectorizable_reduction (gimple *stmt, gimple_stmt_iterator *gsi,
4770                         gimple **vec_stmt, slp_tree slp_node)
4771 {
4772   tree vec_dest;
4773   tree scalar_dest;
4774   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4775   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4776   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4777   tree vectype_in = NULL_TREE;
4778   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4779   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4780   enum tree_code code, orig_code, epilog_reduc_code;
4781   machine_mode vec_mode;
4782   int op_type;
4783   optab optab, reduc_optab;
4784   tree new_temp = NULL_TREE;
4785   gimple *def_stmt;
4786   enum vect_def_type dt;
4787   gphi *new_phi = NULL;
4788   tree scalar_type;
4789   bool is_simple_use;
4790   gimple *orig_stmt;
4791   stmt_vec_info orig_stmt_info;
4792   tree expr = NULL_TREE;
4793   int i;
4794   int ncopies;
4795   int epilog_copies;
4796   stmt_vec_info prev_stmt_info, prev_phi_info;
4797   bool single_defuse_cycle = false;
4798   tree reduc_def = NULL_TREE;
4799   gimple *new_stmt = NULL;
4800   int j;
4801   tree ops[3];
4802   bool nested_cycle = false, found_nested_cycle_def = false;
4803   gimple *reduc_def_stmt = NULL;
4804   bool double_reduc = false, dummy;
4805   basic_block def_bb;
4806   struct loop * def_stmt_loop, *outer_loop = NULL;
4807   tree def_arg;
4808   gimple *def_arg_stmt;
4809   auto_vec<tree> vec_oprnds0;
4810   auto_vec<tree> vec_oprnds1;
4811   auto_vec<tree> vect_defs;
4812   auto_vec<gimple *> phis;
4813   int vec_num;
4814   tree def0, def1, tem, op0, op1 = NULL_TREE;
4815   bool first_p = true;
4816
4817   /* In case of reduction chain we switch to the first stmt in the chain, but
4818      we don't update STMT_INFO, since only the last stmt is marked as reduction
4819      and has reduction properties.  */
4820   if (GROUP_FIRST_ELEMENT (stmt_info)
4821       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
4822     {
4823       stmt = GROUP_FIRST_ELEMENT (stmt_info);
4824       first_p = false;
4825     }
4826
4827   if (nested_in_vect_loop_p (loop, stmt))
4828     {
4829       outer_loop = loop;
4830       loop = loop->inner;
4831       nested_cycle = true;
4832     }
4833
4834   /* 1. Is vectorizable reduction?  */
4835   /* Not supportable if the reduction variable is used in the loop, unless
4836      it's a reduction chain.  */
4837   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4838       && !GROUP_FIRST_ELEMENT (stmt_info))
4839     return false;
4840
4841   /* Reductions that are not used even in an enclosing outer-loop,
4842      are expected to be "live" (used out of the loop).  */
4843   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4844       && !STMT_VINFO_LIVE_P (stmt_info))
4845     return false;
4846
4847   /* Make sure it was already recognized as a reduction computation.  */
4848   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
4849       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
4850     return false;
4851
4852   /* 2. Has this been recognized as a reduction pattern?
4853
4854      Check if STMT represents a pattern that has been recognized
4855      in earlier analysis stages.  For stmts that represent a pattern,
4856      the STMT_VINFO_RELATED_STMT field records the last stmt in
4857      the original sequence that constitutes the pattern.  */
4858
4859   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
4860   if (orig_stmt)
4861     {
4862       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4863       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4864       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4865     }
4866
4867   /* 3. Check the operands of the operation.  The first operands are defined
4868         inside the loop body. The last operand is the reduction variable,
4869         which is defined by the loop-header-phi.  */
4870
4871   gcc_assert (is_gimple_assign (stmt));
4872
4873   /* Flatten RHS.  */
4874   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4875     {
4876     case GIMPLE_SINGLE_RHS:
4877       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4878       if (op_type == ternary_op)
4879         {
4880           tree rhs = gimple_assign_rhs1 (stmt);
4881           ops[0] = TREE_OPERAND (rhs, 0);
4882           ops[1] = TREE_OPERAND (rhs, 1);
4883           ops[2] = TREE_OPERAND (rhs, 2);
4884           code = TREE_CODE (rhs);
4885         }
4886       else
4887         return false;
4888       break;
4889
4890     case GIMPLE_BINARY_RHS:
4891       code = gimple_assign_rhs_code (stmt);
4892       op_type = TREE_CODE_LENGTH (code);
4893       gcc_assert (op_type == binary_op);
4894       ops[0] = gimple_assign_rhs1 (stmt);
4895       ops[1] = gimple_assign_rhs2 (stmt);
4896       break;
4897
4898     case GIMPLE_TERNARY_RHS:
4899       code = gimple_assign_rhs_code (stmt);
4900       op_type = TREE_CODE_LENGTH (code);
4901       gcc_assert (op_type == ternary_op);
4902       ops[0] = gimple_assign_rhs1 (stmt);
4903       ops[1] = gimple_assign_rhs2 (stmt);
4904       ops[2] = gimple_assign_rhs3 (stmt);
4905       break;
4906
4907     case GIMPLE_UNARY_RHS:
4908       return false;
4909
4910     default:
4911       gcc_unreachable ();
4912     }
4913   /* The default is that the reduction variable is the last in statement.  */
4914   int reduc_index = op_type - 1;
4915
4916   if (code == COND_EXPR && slp_node)
4917     return false;
4918
4919   scalar_dest = gimple_assign_lhs (stmt);
4920   scalar_type = TREE_TYPE (scalar_dest);
4921   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
4922       && !SCALAR_FLOAT_TYPE_P (scalar_type))
4923     return false;
4924
4925   /* Do not try to vectorize bit-precision reductions.  */
4926   if ((TYPE_PRECISION (scalar_type)
4927        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
4928     return false;
4929
4930   /* All uses but the last are expected to be defined in the loop.
4931      The last use is the reduction variable.  In case of nested cycle this
4932      assumption is not true: we use reduc_index to record the index of the
4933      reduction variable.  */
4934   for (i = 0; i < op_type - 1; i++)
4935     {
4936       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
4937       if (i == 0 && code == COND_EXPR)
4938         continue;
4939
4940       is_simple_use = vect_is_simple_use (ops[i], loop_vinfo,
4941                                           &def_stmt, &dt, &tem);
4942       if (!vectype_in)
4943         vectype_in = tem;
4944       gcc_assert (is_simple_use);
4945
4946       if (dt != vect_internal_def
4947           && dt != vect_external_def
4948           && dt != vect_constant_def
4949           && dt != vect_induction_def
4950           && !(dt == vect_nested_cycle && nested_cycle))
4951         return false;
4952
4953       if (dt == vect_nested_cycle)
4954         {
4955           found_nested_cycle_def = true;
4956           reduc_def_stmt = def_stmt;
4957           reduc_index = i;
4958         }
4959     }
4960
4961   is_simple_use = vect_is_simple_use (ops[i], loop_vinfo, &def_stmt, &dt, &tem);
4962   if (!vectype_in)
4963     vectype_in = tem;
4964   gcc_assert (is_simple_use);
4965   if (!found_nested_cycle_def)
4966     reduc_def_stmt = def_stmt;
4967
4968   if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
4969     return false;
4970
4971   if (!(dt == vect_reduction_def
4972         || dt == vect_nested_cycle
4973         || ((dt == vect_internal_def || dt == vect_external_def
4974              || dt == vect_constant_def || dt == vect_induction_def)
4975             && nested_cycle && found_nested_cycle_def)))
4976     {
4977       /* For pattern recognized stmts, orig_stmt might be a reduction,
4978          but some helper statements for the pattern might not, or
4979          might be COND_EXPRs with reduction uses in the condition.  */
4980       gcc_assert (orig_stmt);
4981       return false;
4982     }
4983
4984   gimple *tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
4985                                           !nested_cycle, &dummy, false);
4986   if (orig_stmt)
4987     gcc_assert (tmp == orig_stmt
4988                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
4989   else
4990     /* We changed STMT to be the first stmt in reduction chain, hence we
4991        check that in this case the first element in the chain is STMT.  */
4992     gcc_assert (stmt == tmp
4993                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
4994
4995   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
4996     return false;
4997
4998   if (slp_node || PURE_SLP_STMT (stmt_info))
4999     ncopies = 1;
5000   else
5001     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5002                / TYPE_VECTOR_SUBPARTS (vectype_in));
5003
5004   gcc_assert (ncopies >= 1);
5005
5006   vec_mode = TYPE_MODE (vectype_in);
5007
5008   if (code == COND_EXPR)
5009     {
5010       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5011         {
5012           if (dump_enabled_p ())
5013             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5014                              "unsupported condition in reduction\n");
5015
5016           return false;
5017         }
5018     }
5019   else
5020     {
5021       /* 4. Supportable by target?  */
5022
5023       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5024           || code == LROTATE_EXPR || code == RROTATE_EXPR)
5025         {
5026           /* Shifts and rotates are only supported by vectorizable_shifts,
5027              not vectorizable_reduction.  */
5028           if (dump_enabled_p ())
5029             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5030                              "unsupported shift or rotation.\n");
5031           return false;
5032         }
5033
5034       /* 4.1. check support for the operation in the loop  */
5035       optab = optab_for_tree_code (code, vectype_in, optab_default);
5036       if (!optab)
5037         {
5038           if (dump_enabled_p ())
5039             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5040                              "no optab.\n");
5041
5042           return false;
5043         }
5044
5045       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5046         {
5047           if (dump_enabled_p ())
5048             dump_printf (MSG_NOTE, "op not supported by target.\n");
5049
5050           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5051               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5052                   < vect_min_worthwhile_factor (code))
5053             return false;
5054
5055           if (dump_enabled_p ())
5056             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5057         }
5058
5059       /* Worthwhile without SIMD support?  */
5060       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5061           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5062              < vect_min_worthwhile_factor (code))
5063         {
5064           if (dump_enabled_p ())
5065             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5066                              "not worthwhile without SIMD support.\n");
5067
5068           return false;
5069         }
5070     }
5071
5072   /* 4.2. Check support for the epilog operation.
5073
5074           If STMT represents a reduction pattern, then the type of the
5075           reduction variable may be different than the type of the rest
5076           of the arguments.  For example, consider the case of accumulation
5077           of shorts into an int accumulator; The original code:
5078                         S1: int_a = (int) short_a;
5079           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5080
5081           was replaced with:
5082                         STMT: int_acc = widen_sum <short_a, int_acc>
5083
5084           This means that:
5085           1. The tree-code that is used to create the vector operation in the
5086              epilog code (that reduces the partial results) is not the
5087              tree-code of STMT, but is rather the tree-code of the original
5088              stmt from the pattern that STMT is replacing.  I.e, in the example
5089              above we want to use 'widen_sum' in the loop, but 'plus' in the
5090              epilog.
5091           2. The type (mode) we use to check available target support
5092              for the vector operation to be created in the *epilog*, is
5093              determined by the type of the reduction variable (in the example
5094              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5095              However the type (mode) we use to check available target support
5096              for the vector operation to be created *inside the loop*, is
5097              determined by the type of the other arguments to STMT (in the
5098              example we'd check this: optab_handler (widen_sum_optab,
5099              vect_short_mode)).
5100
5101           This is contrary to "regular" reductions, in which the types of all
5102           the arguments are the same as the type of the reduction variable.
5103           For "regular" reductions we can therefore use the same vector type
5104           (and also the same tree-code) when generating the epilog code and
5105           when generating the code inside the loop.  */
5106
5107   if (orig_stmt)
5108     {
5109       /* This is a reduction pattern: get the vectype from the type of the
5110          reduction variable, and get the tree-code from orig_stmt.  */
5111       orig_code = gimple_assign_rhs_code (orig_stmt);
5112       gcc_assert (vectype_out);
5113       vec_mode = TYPE_MODE (vectype_out);
5114     }
5115   else
5116     {
5117       /* Regular reduction: use the same vectype and tree-code as used for
5118          the vector code inside the loop can be used for the epilog code. */
5119       orig_code = code;
5120     }
5121
5122   if (nested_cycle)
5123     {
5124       def_bb = gimple_bb (reduc_def_stmt);
5125       def_stmt_loop = def_bb->loop_father;
5126       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5127                                        loop_preheader_edge (def_stmt_loop));
5128       if (TREE_CODE (def_arg) == SSA_NAME
5129           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5130           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5131           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5132           && vinfo_for_stmt (def_arg_stmt)
5133           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5134               == vect_double_reduction_def)
5135         double_reduc = true;
5136     }
5137
5138   epilog_reduc_code = ERROR_MARK;
5139   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5140     {
5141       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5142                                          optab_default);
5143       if (!reduc_optab)
5144         {
5145           if (dump_enabled_p ())
5146             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5147                              "no optab for reduction.\n");
5148
5149           epilog_reduc_code = ERROR_MARK;
5150         }
5151       else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5152         {
5153           optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5154           if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5155             {
5156               if (dump_enabled_p ())
5157                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5158                                  "reduc op not supported by target.\n");
5159
5160               epilog_reduc_code = ERROR_MARK;
5161             }
5162         }
5163     }
5164   else
5165     {
5166       if (!nested_cycle || double_reduc)
5167         {
5168           if (dump_enabled_p ())
5169             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5170                              "no reduc code for scalar code.\n");
5171
5172           return false;
5173         }
5174     }
5175
5176   if (double_reduc && ncopies > 1)
5177     {
5178       if (dump_enabled_p ())
5179         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5180                          "multiple types in double reduction\n");
5181
5182       return false;
5183     }
5184
5185   /* In case of widenning multiplication by a constant, we update the type
5186      of the constant to be the type of the other operand.  We check that the
5187      constant fits the type in the pattern recognition pass.  */
5188   if (code == DOT_PROD_EXPR
5189       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5190     {
5191       if (TREE_CODE (ops[0]) == INTEGER_CST)
5192         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5193       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5194         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5195       else
5196         {
5197           if (dump_enabled_p ())
5198             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5199                              "invalid types in dot-prod\n");
5200
5201           return false;
5202         }
5203     }
5204
5205   if (!vec_stmt) /* transformation not required.  */
5206     {
5207       if (first_p
5208           && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5209                                          reduc_index))
5210         return false;
5211       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5212       return true;
5213     }
5214
5215   /** Transform.  **/
5216
5217   if (dump_enabled_p ())
5218     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5219
5220   /* FORNOW: Multiple types are not supported for condition.  */
5221   if (code == COND_EXPR)
5222     gcc_assert (ncopies == 1);
5223
5224   /* Create the destination vector  */
5225   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5226
5227   /* In case the vectorization factor (VF) is bigger than the number
5228      of elements that we can fit in a vectype (nunits), we have to generate
5229      more than one vector stmt - i.e - we need to "unroll" the
5230      vector stmt by a factor VF/nunits.  For more details see documentation
5231      in vectorizable_operation.  */
5232
5233   /* If the reduction is used in an outer loop we need to generate
5234      VF intermediate results, like so (e.g. for ncopies=2):
5235         r0 = phi (init, r0)
5236         r1 = phi (init, r1)
5237         r0 = x0 + r0;
5238         r1 = x1 + r1;
5239     (i.e. we generate VF results in 2 registers).
5240     In this case we have a separate def-use cycle for each copy, and therefore
5241     for each copy we get the vector def for the reduction variable from the
5242     respective phi node created for this copy.
5243
5244     Otherwise (the reduction is unused in the loop nest), we can combine
5245     together intermediate results, like so (e.g. for ncopies=2):
5246         r = phi (init, r)
5247         r = x0 + r;
5248         r = x1 + r;
5249    (i.e. we generate VF/2 results in a single register).
5250    In this case for each copy we get the vector def for the reduction variable
5251    from the vectorized reduction operation generated in the previous iteration.
5252   */
5253
5254   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5255     {
5256       single_defuse_cycle = true;
5257       epilog_copies = 1;
5258     }
5259   else
5260     epilog_copies = ncopies;
5261
5262   prev_stmt_info = NULL;
5263   prev_phi_info = NULL;
5264   if (slp_node)
5265     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5266   else
5267     {
5268       vec_num = 1;
5269       vec_oprnds0.create (1);
5270       if (op_type == ternary_op)
5271         vec_oprnds1.create (1);
5272     }
5273
5274   phis.create (vec_num);
5275   vect_defs.create (vec_num);
5276   if (!slp_node)
5277     vect_defs.quick_push (NULL_TREE);
5278
5279   for (j = 0; j < ncopies; j++)
5280     {
5281       if (j == 0 || !single_defuse_cycle)
5282         {
5283           for (i = 0; i < vec_num; i++)
5284             {
5285               /* Create the reduction-phi that defines the reduction
5286                  operand.  */
5287               new_phi = create_phi_node (vec_dest, loop->header);
5288               set_vinfo_for_stmt (new_phi,
5289                                   new_stmt_vec_info (new_phi, loop_vinfo));
5290                if (j == 0 || slp_node)
5291                  phis.quick_push (new_phi);
5292             }
5293         }
5294
5295       if (code == COND_EXPR)
5296         {
5297           gcc_assert (!slp_node);
5298           vectorizable_condition (stmt, gsi, vec_stmt,
5299                                   PHI_RESULT (phis[0]),
5300                                   reduc_index, NULL);
5301           /* Multiple types are not supported for condition.  */
5302           break;
5303         }
5304
5305       /* Handle uses.  */
5306       if (j == 0)
5307         {
5308           op0 = ops[!reduc_index];
5309           if (op_type == ternary_op)
5310             {
5311               if (reduc_index == 0)
5312                 op1 = ops[2];
5313               else
5314                 op1 = ops[1];
5315             }
5316
5317           if (slp_node)
5318             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5319                                slp_node, -1);
5320           else
5321             {
5322               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5323                                                             stmt);
5324               vec_oprnds0.quick_push (loop_vec_def0);
5325               if (op_type == ternary_op)
5326                {
5327                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt);
5328                  vec_oprnds1.quick_push (loop_vec_def1);
5329                }
5330             }
5331         }
5332       else
5333         {
5334           if (!slp_node)
5335             {
5336               enum vect_def_type dt;
5337               gimple *dummy_stmt;
5338
5339               vect_is_simple_use (ops[!reduc_index], loop_vinfo,
5340                                   &dummy_stmt, &dt);
5341               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5342                                                               loop_vec_def0);
5343               vec_oprnds0[0] = loop_vec_def0;
5344               if (op_type == ternary_op)
5345                 {
5346                   vect_is_simple_use (op1, loop_vinfo, &dummy_stmt, &dt);
5347                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5348                                                                 loop_vec_def1);
5349                   vec_oprnds1[0] = loop_vec_def1;
5350                 }
5351             }
5352
5353           if (single_defuse_cycle)
5354             reduc_def = gimple_assign_lhs (new_stmt);
5355
5356           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5357         }
5358
5359       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5360         {
5361           if (slp_node)
5362             reduc_def = PHI_RESULT (phis[i]);
5363           else
5364             {
5365               if (!single_defuse_cycle || j == 0)
5366                 reduc_def = PHI_RESULT (new_phi);
5367             }
5368
5369           def1 = ((op_type == ternary_op)
5370                   ? vec_oprnds1[i] : NULL);
5371           if (op_type == binary_op)
5372             {
5373               if (reduc_index == 0)
5374                 expr = build2 (code, vectype_out, reduc_def, def0);
5375               else
5376                 expr = build2 (code, vectype_out, def0, reduc_def);
5377             }
5378           else
5379             {
5380               if (reduc_index == 0)
5381                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5382               else
5383                 {
5384                   if (reduc_index == 1)
5385                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5386                   else
5387                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5388                 }
5389             }
5390
5391           new_stmt = gimple_build_assign (vec_dest, expr);
5392           new_temp = make_ssa_name (vec_dest, new_stmt);
5393           gimple_assign_set_lhs (new_stmt, new_temp);
5394           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5395
5396           if (slp_node)
5397             {
5398               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5399               vect_defs.quick_push (new_temp);
5400             }
5401           else
5402             vect_defs[0] = new_temp;
5403         }
5404
5405       if (slp_node)
5406         continue;
5407
5408       if (j == 0)
5409         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5410       else
5411         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5412
5413       prev_stmt_info = vinfo_for_stmt (new_stmt);
5414       prev_phi_info = vinfo_for_stmt (new_phi);
5415     }
5416
5417   /* Finalize the reduction-phi (set its arguments) and create the
5418      epilog reduction code.  */
5419   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5420     {
5421       new_temp = gimple_assign_lhs (*vec_stmt);
5422       vect_defs[0] = new_temp;
5423     }
5424
5425   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5426                                     epilog_reduc_code, phis, reduc_index,
5427                                     double_reduc, slp_node);
5428
5429   return true;
5430 }
5431
5432 /* Function vect_min_worthwhile_factor.
5433
5434    For a loop where we could vectorize the operation indicated by CODE,
5435    return the minimum vectorization factor that makes it worthwhile
5436    to use generic vectors.  */
5437 int
5438 vect_min_worthwhile_factor (enum tree_code code)
5439 {
5440   switch (code)
5441     {
5442     case PLUS_EXPR:
5443     case MINUS_EXPR:
5444     case NEGATE_EXPR:
5445       return 4;
5446
5447     case BIT_AND_EXPR:
5448     case BIT_IOR_EXPR:
5449     case BIT_XOR_EXPR:
5450     case BIT_NOT_EXPR:
5451       return 2;
5452
5453     default:
5454       return INT_MAX;
5455     }
5456 }
5457
5458
5459 /* Function vectorizable_induction
5460
5461    Check if PHI performs an induction computation that can be vectorized.
5462    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5463    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5464    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5465
5466 bool
5467 vectorizable_induction (gimple *phi,
5468                         gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5469                         gimple **vec_stmt)
5470 {
5471   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5472   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5473   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5474   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5475   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5476   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5477   tree vec_def;
5478
5479   gcc_assert (ncopies >= 1);
5480   /* FORNOW. These restrictions should be relaxed.  */
5481   if (nested_in_vect_loop_p (loop, phi))
5482     {
5483       imm_use_iterator imm_iter;
5484       use_operand_p use_p;
5485       gimple *exit_phi;
5486       edge latch_e;
5487       tree loop_arg;
5488
5489       if (ncopies > 1)
5490         {
5491           if (dump_enabled_p ())
5492             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5493                              "multiple types in nested loop.\n");
5494           return false;
5495         }
5496
5497       exit_phi = NULL;
5498       latch_e = loop_latch_edge (loop->inner);
5499       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5500       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5501         {
5502           gimple *use_stmt = USE_STMT (use_p);
5503           if (is_gimple_debug (use_stmt))
5504             continue;
5505
5506           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5507             {
5508               exit_phi = use_stmt;
5509               break;
5510             }
5511         }
5512       if (exit_phi)
5513         {
5514           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5515           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5516                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5517             {
5518               if (dump_enabled_p ())
5519                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5520                                  "inner-loop induction only used outside "
5521                                  "of the outer vectorized loop.\n");
5522               return false;
5523             }
5524         }
5525     }
5526
5527   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5528     return false;
5529
5530   /* FORNOW: SLP not supported.  */
5531   if (STMT_SLP_TYPE (stmt_info))
5532     return false;
5533
5534   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5535
5536   if (gimple_code (phi) != GIMPLE_PHI)
5537     return false;
5538
5539   if (!vec_stmt) /* transformation not required.  */
5540     {
5541       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5542       if (dump_enabled_p ())
5543         dump_printf_loc (MSG_NOTE, vect_location,
5544                          "=== vectorizable_induction ===\n");
5545       vect_model_induction_cost (stmt_info, ncopies);
5546       return true;
5547     }
5548
5549   /** Transform.  **/
5550
5551   if (dump_enabled_p ())
5552     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5553
5554   vec_def = get_initial_def_for_induction (phi);
5555   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5556   return true;
5557 }
5558
5559 /* Function vectorizable_live_operation.
5560
5561    STMT computes a value that is used outside the loop.  Check if
5562    it can be supported.  */
5563
5564 bool
5565 vectorizable_live_operation (gimple *stmt,
5566                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5567                              gimple **vec_stmt)
5568 {
5569   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5570   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5571   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5572   int i;
5573   int op_type;
5574   tree op;
5575   gimple *def_stmt;
5576   enum vect_def_type dt;
5577   enum tree_code code;
5578   enum gimple_rhs_class rhs_class;
5579
5580   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5581
5582   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5583     return false;
5584
5585   if (!is_gimple_assign (stmt))
5586     {
5587       if (gimple_call_internal_p (stmt)
5588           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5589           && gimple_call_lhs (stmt)
5590           && loop->simduid
5591           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5592           && loop->simduid
5593              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5594         {
5595           edge e = single_exit (loop);
5596           basic_block merge_bb = e->dest;
5597           imm_use_iterator imm_iter;
5598           use_operand_p use_p;
5599           tree lhs = gimple_call_lhs (stmt);
5600
5601           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5602             {
5603               gimple *use_stmt = USE_STMT (use_p);
5604               if (gimple_code (use_stmt) == GIMPLE_PHI
5605                   && gimple_bb (use_stmt) == merge_bb)
5606                 {
5607                   if (vec_stmt)
5608                     {
5609                       tree vfm1
5610                         = build_int_cst (unsigned_type_node,
5611                                          loop_vinfo->vectorization_factor - 1);
5612                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5613                     }
5614                   return true;
5615                 }
5616             }
5617         }
5618
5619       return false;
5620     }
5621
5622   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5623     return false;
5624
5625   /* FORNOW. CHECKME. */
5626   if (nested_in_vect_loop_p (loop, stmt))
5627     return false;
5628
5629   code = gimple_assign_rhs_code (stmt);
5630   op_type = TREE_CODE_LENGTH (code);
5631   rhs_class = get_gimple_rhs_class (code);
5632   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5633   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5634
5635   /* FORNOW: support only if all uses are invariant.  This means
5636      that the scalar operations can remain in place, unvectorized.
5637      The original last scalar value that they compute will be used.  */
5638
5639   for (i = 0; i < op_type; i++)
5640     {
5641       if (rhs_class == GIMPLE_SINGLE_RHS)
5642         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5643       else
5644         op = gimple_op (stmt, i + 1);
5645       if (op
5646           && !vect_is_simple_use (op, loop_vinfo, &def_stmt, &dt))
5647         {
5648           if (dump_enabled_p ())
5649             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5650                              "use not simple.\n");
5651           return false;
5652         }
5653
5654       if (dt != vect_external_def && dt != vect_constant_def)
5655         return false;
5656     }
5657
5658   /* No transformation is required for the cases we currently support.  */
5659   return true;
5660 }
5661
5662 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5663
5664 static void
5665 vect_loop_kill_debug_uses (struct loop *loop, gimple *stmt)
5666 {
5667   ssa_op_iter op_iter;
5668   imm_use_iterator imm_iter;
5669   def_operand_p def_p;
5670   gimple *ustmt;
5671
5672   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5673     {
5674       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5675         {
5676           basic_block bb;
5677
5678           if (!is_gimple_debug (ustmt))
5679             continue;
5680
5681           bb = gimple_bb (ustmt);
5682
5683           if (!flow_bb_inside_loop_p (loop, bb))
5684             {
5685               if (gimple_debug_bind_p (ustmt))
5686                 {
5687                   if (dump_enabled_p ())
5688                     dump_printf_loc (MSG_NOTE, vect_location,
5689                                      "killing debug use\n");
5690
5691                   gimple_debug_bind_reset_value (ustmt);
5692                   update_stmt (ustmt);
5693                 }
5694               else
5695                 gcc_unreachable ();
5696             }
5697         }
5698     }
5699 }
5700
5701
5702 /* This function builds ni_name = number of iterations.  Statements
5703    are emitted on the loop preheader edge.  */
5704
5705 static tree
5706 vect_build_loop_niters (loop_vec_info loop_vinfo)
5707 {
5708   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5709   if (TREE_CODE (ni) == INTEGER_CST)
5710     return ni;
5711   else
5712     {
5713       tree ni_name, var;
5714       gimple_seq stmts = NULL;
5715       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5716
5717       var = create_tmp_var (TREE_TYPE (ni), "niters");
5718       ni_name = force_gimple_operand (ni, &stmts, false, var);
5719       if (stmts)
5720         gsi_insert_seq_on_edge_immediate (pe, stmts);
5721
5722       return ni_name;
5723     }
5724 }
5725
5726
5727 /* This function generates the following statements:
5728
5729    ni_name = number of iterations loop executes
5730    ratio = ni_name / vf
5731    ratio_mult_vf_name = ratio * vf
5732
5733    and places them on the loop preheader edge.  */
5734
5735 static void
5736 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5737                                  tree ni_name,
5738                                  tree *ratio_mult_vf_name_ptr,
5739                                  tree *ratio_name_ptr)
5740 {
5741   tree ni_minus_gap_name;
5742   tree var;
5743   tree ratio_name;
5744   tree ratio_mult_vf_name;
5745   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5746   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5747   tree log_vf;
5748
5749   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5750
5751   /* If epilogue loop is required because of data accesses with gaps, we
5752      subtract one iteration from the total number of iterations here for
5753      correct calculation of RATIO.  */
5754   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5755     {
5756       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5757                                        ni_name,
5758                                        build_one_cst (TREE_TYPE (ni_name)));
5759       if (!is_gimple_val (ni_minus_gap_name))
5760         {
5761           var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5762           gimple *stmts = NULL;
5763           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5764                                                     true, var);
5765           gsi_insert_seq_on_edge_immediate (pe, stmts);
5766         }
5767     }
5768   else
5769     ni_minus_gap_name = ni_name;
5770
5771   /* Create: ratio = ni >> log2(vf) */
5772   /* ???  As we have ni == number of latch executions + 1, ni could
5773      have overflown to zero.  So avoid computing ratio based on ni
5774      but compute it using the fact that we know ratio will be at least
5775      one, thus via (ni - vf) >> log2(vf) + 1.  */
5776   ratio_name
5777     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5778                    fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5779                                 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5780                                              ni_minus_gap_name,
5781                                              build_int_cst
5782                                                (TREE_TYPE (ni_name), vf)),
5783                                 log_vf),
5784                    build_int_cst (TREE_TYPE (ni_name), 1));
5785   if (!is_gimple_val (ratio_name))
5786     {
5787       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5788       gimple *stmts = NULL;
5789       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5790       gsi_insert_seq_on_edge_immediate (pe, stmts);
5791     }
5792   *ratio_name_ptr = ratio_name;
5793
5794   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
5795
5796   if (ratio_mult_vf_name_ptr)
5797     {
5798       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5799                                         ratio_name, log_vf);
5800       if (!is_gimple_val (ratio_mult_vf_name))
5801         {
5802           var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5803           gimple *stmts = NULL;
5804           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5805                                                      true, var);
5806           gsi_insert_seq_on_edge_immediate (pe, stmts);
5807         }
5808       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5809     }
5810
5811   return;
5812 }
5813
5814
5815 /* Function vect_transform_loop.
5816
5817    The analysis phase has determined that the loop is vectorizable.
5818    Vectorize the loop - created vectorized stmts to replace the scalar
5819    stmts in the loop, and update the loop exit condition.  */
5820
5821 void
5822 vect_transform_loop (loop_vec_info loop_vinfo)
5823 {
5824   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5825   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5826   int nbbs = loop->num_nodes;
5827   int i;
5828   tree ratio = NULL;
5829   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5830   bool grouped_store;
5831   bool slp_scheduled = false;
5832   gimple *stmt, *pattern_stmt;
5833   gimple_seq pattern_def_seq = NULL;
5834   gimple_stmt_iterator pattern_def_si = gsi_none ();
5835   bool transform_pattern_stmt = false;
5836   bool check_profitability = false;
5837   int th;
5838   /* Record number of iterations before we started tampering with the profile. */
5839   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5840
5841   if (dump_enabled_p ())
5842     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5843
5844   /* If profile is inprecise, we have chance to fix it up.  */
5845   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5846     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5847
5848   /* Use the more conservative vectorization threshold.  If the number
5849      of iterations is constant assume the cost check has been performed
5850      by our caller.  If the threshold makes all loops profitable that
5851      run at least the vectorization factor number of times checking
5852      is pointless, too.  */
5853   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5854   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5855       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5856     {
5857       if (dump_enabled_p ())
5858         dump_printf_loc (MSG_NOTE, vect_location,
5859                          "Profitability threshold is %d loop iterations.\n",
5860                          th);
5861       check_profitability = true;
5862     }
5863
5864   /* Version the loop first, if required, so the profitability check
5865      comes first.  */
5866
5867   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5868       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5869     {
5870       vect_loop_versioning (loop_vinfo, th, check_profitability);
5871       check_profitability = false;
5872     }
5873
5874   tree ni_name = vect_build_loop_niters (loop_vinfo);
5875   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5876
5877   /* Peel the loop if there are data refs with unknown alignment.
5878      Only one data ref with unknown store is allowed.  */
5879
5880   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5881     {
5882       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5883                                      th, check_profitability);
5884       check_profitability = false;
5885       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5886          be re-computed.  */
5887       ni_name = NULL_TREE;
5888     }
5889
5890   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5891      compile time constant), or it is a constant that doesn't divide by the
5892      vectorization factor, then an epilog loop needs to be created.
5893      We therefore duplicate the loop: the original loop will be vectorized,
5894      and will compute the first (n/VF) iterations.  The second copy of the loop
5895      will remain scalar and will compute the remaining (n%VF) iterations.
5896      (VF is the vectorization factor).  */
5897
5898   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5899       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5900     {
5901       tree ratio_mult_vf;
5902       if (!ni_name)
5903         ni_name = vect_build_loop_niters (loop_vinfo);
5904       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
5905                                        &ratio);
5906       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
5907                                       th, check_profitability);
5908     }
5909   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5910     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
5911                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
5912   else
5913     {
5914       if (!ni_name)
5915         ni_name = vect_build_loop_niters (loop_vinfo);
5916       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
5917     }
5918
5919   /* 1) Make sure the loop header has exactly two entries
5920      2) Make sure we have a preheader basic block.  */
5921
5922   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
5923
5924   split_edge (loop_preheader_edge (loop));
5925
5926   /* FORNOW: the vectorizer supports only loops which body consist
5927      of one basic block (header + empty latch). When the vectorizer will
5928      support more involved loop forms, the order by which the BBs are
5929      traversed need to be reconsidered.  */
5930
5931   for (i = 0; i < nbbs; i++)
5932     {
5933       basic_block bb = bbs[i];
5934       stmt_vec_info stmt_info;
5935
5936       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
5937            gsi_next (&si))
5938         {
5939           gphi *phi = si.phi ();
5940           if (dump_enabled_p ())
5941             {
5942               dump_printf_loc (MSG_NOTE, vect_location,
5943                                "------>vectorizing phi: ");
5944               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
5945               dump_printf (MSG_NOTE, "\n");
5946             }
5947           stmt_info = vinfo_for_stmt (phi);
5948           if (!stmt_info)
5949             continue;
5950
5951           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
5952             vect_loop_kill_debug_uses (loop, phi);
5953
5954           if (!STMT_VINFO_RELEVANT_P (stmt_info)
5955               && !STMT_VINFO_LIVE_P (stmt_info))
5956             continue;
5957
5958           if (STMT_VINFO_VECTYPE (stmt_info)
5959               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
5960                   != (unsigned HOST_WIDE_INT) vectorization_factor)
5961               && dump_enabled_p ())
5962             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
5963
5964           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
5965             {
5966               if (dump_enabled_p ())
5967                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
5968               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
5969             }
5970         }
5971
5972       pattern_stmt = NULL;
5973       for (gimple_stmt_iterator si = gsi_start_bb (bb);
5974            !gsi_end_p (si) || transform_pattern_stmt;)
5975         {
5976           bool is_store;
5977
5978           if (transform_pattern_stmt)
5979             stmt = pattern_stmt;
5980           else
5981             {
5982               stmt = gsi_stmt (si);
5983               /* During vectorization remove existing clobber stmts.  */
5984               if (gimple_clobber_p (stmt))
5985                 {
5986                   unlink_stmt_vdef (stmt);
5987                   gsi_remove (&si, true);
5988                   release_defs (stmt);
5989                   continue;
5990                 }
5991             }
5992
5993           if (dump_enabled_p ())
5994             {
5995               dump_printf_loc (MSG_NOTE, vect_location,
5996                                "------>vectorizing statement: ");
5997               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
5998               dump_printf (MSG_NOTE, "\n");
5999             }
6000
6001           stmt_info = vinfo_for_stmt (stmt);
6002
6003           /* vector stmts created in the outer-loop during vectorization of
6004              stmts in an inner-loop may not have a stmt_info, and do not
6005              need to be vectorized.  */
6006           if (!stmt_info)
6007             {
6008               gsi_next (&si);
6009               continue;
6010             }
6011
6012           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6013             vect_loop_kill_debug_uses (loop, stmt);
6014
6015           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6016               && !STMT_VINFO_LIVE_P (stmt_info))
6017             {
6018               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6019                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6020                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6021                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6022                 {
6023                   stmt = pattern_stmt;
6024                   stmt_info = vinfo_for_stmt (stmt);
6025                 }
6026               else
6027                 {
6028                   gsi_next (&si);
6029                   continue;
6030                 }
6031             }
6032           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6033                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6034                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6035                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6036             transform_pattern_stmt = true;
6037
6038           /* If pattern statement has def stmts, vectorize them too.  */
6039           if (is_pattern_stmt_p (stmt_info))
6040             {
6041               if (pattern_def_seq == NULL)
6042                 {
6043                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6044                   pattern_def_si = gsi_start (pattern_def_seq);
6045                 }
6046               else if (!gsi_end_p (pattern_def_si))
6047                 gsi_next (&pattern_def_si);
6048               if (pattern_def_seq != NULL)
6049                 {
6050                   gimple *pattern_def_stmt = NULL;
6051                   stmt_vec_info pattern_def_stmt_info = NULL;
6052
6053                   while (!gsi_end_p (pattern_def_si))
6054                     {
6055                       pattern_def_stmt = gsi_stmt (pattern_def_si);
6056                       pattern_def_stmt_info
6057                         = vinfo_for_stmt (pattern_def_stmt);
6058                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6059                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6060                         break;
6061                       gsi_next (&pattern_def_si);
6062                     }
6063
6064                   if (!gsi_end_p (pattern_def_si))
6065                     {
6066                       if (dump_enabled_p ())
6067                         {
6068                           dump_printf_loc (MSG_NOTE, vect_location,
6069                                            "==> vectorizing pattern def "
6070                                            "stmt: ");
6071                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6072                                             pattern_def_stmt, 0);
6073                           dump_printf (MSG_NOTE, "\n");
6074                         }
6075
6076                       stmt = pattern_def_stmt;
6077                       stmt_info = pattern_def_stmt_info;
6078                     }
6079                   else
6080                     {
6081                       pattern_def_si = gsi_none ();
6082                       transform_pattern_stmt = false;
6083                     }
6084                 }
6085               else
6086                 transform_pattern_stmt = false;
6087             }
6088
6089           if (STMT_VINFO_VECTYPE (stmt_info))
6090             {
6091               unsigned int nunits
6092                 = (unsigned int)
6093                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6094               if (!STMT_SLP_TYPE (stmt_info)
6095                   && nunits != (unsigned int) vectorization_factor
6096                   && dump_enabled_p ())
6097                   /* For SLP VF is set according to unrolling factor, and not
6098                      to vector size, hence for SLP this print is not valid.  */
6099                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6100             }
6101
6102           /* SLP. Schedule all the SLP instances when the first SLP stmt is
6103              reached.  */
6104           if (STMT_SLP_TYPE (stmt_info))
6105             {
6106               if (!slp_scheduled)
6107                 {
6108                   slp_scheduled = true;
6109
6110                   if (dump_enabled_p ())
6111                     dump_printf_loc (MSG_NOTE, vect_location,
6112                                      "=== scheduling SLP instances ===\n");
6113
6114                   vect_schedule_slp (loop_vinfo);
6115                 }
6116
6117               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6118               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6119                 {
6120                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6121                     {
6122                       pattern_def_seq = NULL;
6123                       gsi_next (&si);
6124                     }
6125                   continue;
6126                 }
6127             }
6128
6129           /* -------- vectorize statement ------------ */
6130           if (dump_enabled_p ())
6131             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6132
6133           grouped_store = false;
6134           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6135           if (is_store)
6136             {
6137               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6138                 {
6139                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6140                      interleaving chain was completed - free all the stores in
6141                      the chain.  */
6142                   gsi_next (&si);
6143                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6144                 }
6145               else
6146                 {
6147                   /* Free the attached stmt_vec_info and remove the stmt.  */
6148                   gimple *store = gsi_stmt (si);
6149                   free_stmt_vec_info (store);
6150                   unlink_stmt_vdef (store);
6151                   gsi_remove (&si, true);
6152                   release_defs (store);
6153                 }
6154
6155               /* Stores can only appear at the end of pattern statements.  */
6156               gcc_assert (!transform_pattern_stmt);
6157               pattern_def_seq = NULL;
6158             }
6159           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6160             {
6161               pattern_def_seq = NULL;
6162               gsi_next (&si);
6163             }
6164         }                       /* stmts in BB */
6165     }                           /* BBs in loop */
6166
6167   slpeel_make_loop_iterate_ntimes (loop, ratio);
6168
6169   /* Reduce loop iterations by the vectorization factor.  */
6170   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6171                       expected_iterations / vectorization_factor);
6172   loop->nb_iterations_upper_bound
6173     = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6174   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6175       && loop->nb_iterations_upper_bound != 0)
6176     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6177   if (loop->any_estimate)
6178     {
6179       loop->nb_iterations_estimate
6180         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6181        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6182            && loop->nb_iterations_estimate != 0)
6183          loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6184     }
6185
6186   if (dump_enabled_p ())
6187     {
6188       dump_printf_loc (MSG_NOTE, vect_location,
6189                        "LOOP VECTORIZED\n");
6190       if (loop->inner)
6191         dump_printf_loc (MSG_NOTE, vect_location,
6192                          "OUTER LOOP VECTORIZED\n");
6193       dump_printf (MSG_NOTE, "\n");
6194     }
6195 }