gcc/tree-vect-loop.c

   1 /* Loop Vectorization
   2    Copyright (C) 2003-2015 Free Software Foundation, Inc.
   3    Contributed by Dorit Naishlos <dorit@il.ibm.com> and
   4    Ira Rosen <irar@il.ibm.com>
   5
   6 This file is part of GCC.
   7
   8 GCC is free software; you can redistribute it and/or modify it under
   9 the terms of the GNU General Public License as published by the Free
  10 Software Foundation; either version 3, or (at your option) any later
  11 version.
  12
  13 GCC is distributed in the hope that it will be useful, but WITHOUT ANY
  14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
  15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  16 for more details.
  17
  18 You should have received a copy of the GNU General Public License
  19 along with GCC; see the file COPYING3.  If not see
  20 <http://www.gnu.org/licenses/>.  */
  21
  22 #include "config.h"
  23 #include "system.h"
  24 #include "coretypes.h"
  25 #include "dumpfile.h"
  26 #include "backend.h"
  27 #include "cfghooks.h"
  28 #include "tree.h"
  29 #include "gimple.h"
  30 #include "rtl.h"
  31 #include "ssa.h"
  32 #include "alias.h"
  33 #include "fold-const.h"
  34 #include "stor-layout.h"
  35 #include "cfganal.h"
  36 #include "gimple-pretty-print.h"
  37 #include "internal-fn.h"
  38 #include "gimplify.h"
  39 #include "gimple-iterator.h"
  40 #include "gimplify-me.h"
  41 #include "tree-ssa-loop-ivopts.h"
  42 #include "tree-ssa-loop-manip.h"
  43 #include "tree-ssa-loop-niter.h"
  44 #include "tree-pass.h"
  45 #include "cfgloop.h"
  46 #include "flags.h"
  47 #include "insn-config.h"
  48 #include "expmed.h"
  49 #include "dojump.h"
  50 #include "explow.h"
  51 #include "calls.h"
  52 #include "emit-rtl.h"
  53 #include "varasm.h"
  54 #include "stmt.h"
  55 #include "expr.h"
  56 #include "recog.h"
  57 #include "insn-codes.h"
  58 #include "optabs.h"
  59 #include "params.h"
  60 #include "diagnostic-core.h"
  61 #include "tree-chrec.h"
  62 #include "tree-scalar-evolution.h"
  63 #include "tree-vectorizer.h"
  64 #include "target.h"
  65
  66 /* Loop Vectorization Pass.
  67
  68    This pass tries to vectorize loops.
  69
  70    For example, the vectorizer transforms the following simple loop:
  71
  72         short a[N]; short b[N]; short c[N]; int i;
  73
  74         for (i=0; i<N; i++){
  75           a[i] = b[i] + c[i];
  76         }
  77
  78    as if it was manually vectorized by rewriting the source code into:
  79
  80         typedef int __attribute__((mode(V8HI))) v8hi;
  81         short a[N];  short b[N]; short c[N];   int i;
  82         v8hi *pa = (v8hi*)a, *pb = (v8hi*)b, *pc = (v8hi*)c;
  83         v8hi va, vb, vc;
  84
  85         for (i=0; i<N/8; i++){
  86           vb = pb[i];
  87           vc = pc[i];
  88           va = vb + vc;
  89           pa[i] = va;
  90         }
  91
  92         The main entry to this pass is vectorize_loops(), in which
  93    the vectorizer applies a set of analyses on a given set of loops,
  94    followed by the actual vectorization transformation for the loops that
  95    had successfully passed the analysis phase.
  96         Throughout this pass we make a distinction between two types of
  97    data: scalars (which are represented by SSA_NAMES), and memory references
  98    ("data-refs").  These two types of data require different handling both
  99    during analysis and transformation. The types of data-refs that the
 100    vectorizer currently supports are ARRAY_REFS which base is an array DECL
 101    (not a pointer), and INDIRECT_REFS through pointers; both array and pointer
 102    accesses are required to have a simple (consecutive) access pattern.
 103
 104    Analysis phase:
 105    ===============
 106         The driver for the analysis phase is vect_analyze_loop().
 107    It applies a set of analyses, some of which rely on the scalar evolution
 108    analyzer (scev) developed by Sebastian Pop.
 109
 110         During the analysis phase the vectorizer records some information
 111    per stmt in a "stmt_vec_info" struct which is attached to each stmt in the
 112    loop, as well as general information about the loop as a whole, which is
 113    recorded in a "loop_vec_info" struct attached to each loop.
 114
 115    Transformation phase:
 116    =====================
 117         The loop transformation phase scans all the stmts in the loop, and
 118    creates a vector stmt (or a sequence of stmts) for each scalar stmt S in
 119    the loop that needs to be vectorized.  It inserts the vector code sequence
 120    just before the scalar stmt S, and records a pointer to the vector code
 121    in STMT_VINFO_VEC_STMT (stmt_info) (stmt_info is the stmt_vec_info struct
 122    attached to S).  This pointer will be used for the vectorization of following
 123    stmts which use the def of stmt S. Stmt S is removed if it writes to memory;
 124    otherwise, we rely on dead code elimination for removing it.
 125
 126         For example, say stmt S1 was vectorized into stmt VS1:
 127
 128    VS1: vb = px[i];
 129    S1:  b = x[i];    STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 130    S2:  a = b;
 131
 132    To vectorize stmt S2, the vectorizer first finds the stmt that defines
 133    the operand 'b' (S1), and gets the relevant vector def 'vb' from the
 134    vector stmt VS1 pointed to by STMT_VINFO_VEC_STMT (stmt_info (S1)).  The
 135    resulting sequence would be:
 136
 137    VS1: vb = px[i];
 138    S1:  b = x[i];       STMT_VINFO_VEC_STMT (stmt_info (S1)) = VS1
 139    VS2: va = vb;
 140    S2:  a = b;          STMT_VINFO_VEC_STMT (stmt_info (S2)) = VS2
 141
 142         Operands that are not SSA_NAMEs, are data-refs that appear in
 143    load/store operations (like 'x[i]' in S1), and are handled differently.
 144
 145    Target modeling:
 146    =================
 147         Currently the only target specific information that is used is the
 148    size of the vector (in bytes) - "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".
 149    Targets that can support different sizes of vectors, for now will need
 150    to specify one value for "TARGET_VECTORIZE_UNITS_PER_SIMD_WORD".  More
 151    flexibility will be added in the future.
 152
 153         Since we only vectorize operations which vector form can be
 154    expressed using existing tree codes, to verify that an operation is
 155    supported, the vectorizer checks the relevant optab at the relevant
 156    machine_mode (e.g, optab_handler (add_optab, V8HImode)).  If
 157    the value found is CODE_FOR_nothing, then there's no target support, and
 158    we can't vectorize the stmt.
 159
 160    For additional information on this project see:
 161    http://gcc.gnu.org/projects/tree-ssa/vectorization.html
 162 */
 163
 164 static void vect_estimate_min_profitable_iters (loop_vec_info, int *, int *);
 165
 166 /* Function vect_determine_vectorization_factor
 167
 168    Determine the vectorization factor (VF).  VF is the number of data elements
 169    that are operated upon in parallel in a single iteration of the vectorized
 170    loop.  For example, when vectorizing a loop that operates on 4byte elements,
 171    on a target with vector size (VS) 16byte, the VF is set to 4, since 4
 172    elements can fit in a single vector register.
 173
 174    We currently support vectorization of loops in which all types operated upon
 175    are of the same size.  Therefore this function currently sets VF according to
 176    the size of the types operated upon, and fails if there are multiple sizes
 177    in the loop.
 178
 179    VF is also the factor by which the loop iterations are strip-mined, e.g.:
 180    original loop:
 181         for (i=0; i<N; i++){
 182           a[i] = b[i] + c[i];
 183         }
 184
 185    vectorized loop:
 186         for (i=0; i<N; i+=VF){
 187           a[i:VF] = b[i:VF] + c[i:VF];
 188         }
 189 */
 190
 191 static bool
 192 vect_determine_vectorization_factor (loop_vec_info loop_vinfo)
 193 {
 194   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 195   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
 196   int nbbs = loop->num_nodes;
 197   unsigned int vectorization_factor = 0;
 198   tree scalar_type;
 199   gphi *phi;
 200   tree vectype;
 201   unsigned int nunits;
 202   stmt_vec_info stmt_info;
 203   int i;
 204   HOST_WIDE_INT dummy;
 205   gimple stmt, pattern_stmt = NULL;
 206   gimple_seq pattern_def_seq = NULL;
 207   gimple_stmt_iterator pattern_def_si = gsi_none ();
 208   bool analyze_pattern_stmt = false;
 209
 210   if (dump_enabled_p ())
 211     dump_printf_loc (MSG_NOTE, vect_location,
 212                      "=== vect_determine_vectorization_factor ===\n");
 213
 214   for (i = 0; i < nbbs; i++)
 215     {
 216       basic_block bb = bbs[i];
 217
 218       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
 219            gsi_next (&si))
 220         {
 221           phi = si.phi ();
 222           stmt_info = vinfo_for_stmt (phi);
 223           if (dump_enabled_p ())
 224             {
 225               dump_printf_loc (MSG_NOTE, vect_location, "==> examining phi: ");
 226               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 227               dump_printf (MSG_NOTE, "\n");
 228             }
 229
 230           gcc_assert (stmt_info);
 231
 232           if (STMT_VINFO_RELEVANT_P (stmt_info))
 233             {
 234               gcc_assert (!STMT_VINFO_VECTYPE (stmt_info));
 235               scalar_type = TREE_TYPE (PHI_RESULT (phi));
 236
 237               if (dump_enabled_p ())
 238                 {
 239                   dump_printf_loc (MSG_NOTE, vect_location,
 240                                    "get vectype for scalar type:  ");
 241                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 242                   dump_printf (MSG_NOTE, "\n");
 243                 }
 244
 245               vectype = get_vectype_for_scalar_type (scalar_type);
 246               if (!vectype)
 247                 {
 248                   if (dump_enabled_p ())
 249                     {
 250                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 251                                        "not vectorized: unsupported "
 252                                        "data-type ");
 253                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 254                                          scalar_type);
 255                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 256                     }
 257                   return false;
 258                 }
 259               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 260
 261               if (dump_enabled_p ())
 262                 {
 263                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 264                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 265                   dump_printf (MSG_NOTE, "\n");
 266                 }
 267
 268               nunits = TYPE_VECTOR_SUBPARTS (vectype);
 269               if (dump_enabled_p ())
 270                 dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n",
 271                                  nunits);
 272
 273               if (!vectorization_factor
 274                   || (nunits > vectorization_factor))
 275                 vectorization_factor = nunits;
 276             }
 277         }
 278
 279       for (gimple_stmt_iterator si = gsi_start_bb (bb);
 280            !gsi_end_p (si) || analyze_pattern_stmt;)
 281         {
 282           tree vf_vectype;
 283
 284           if (analyze_pattern_stmt)
 285             stmt = pattern_stmt;
 286           else
 287             stmt = gsi_stmt (si);
 288
 289           stmt_info = vinfo_for_stmt (stmt);
 290
 291           if (dump_enabled_p ())
 292             {
 293               dump_printf_loc (MSG_NOTE, vect_location,
 294                                "==> examining statement: ");
 295               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 296               dump_printf (MSG_NOTE, "\n");
 297             }
 298
 299           gcc_assert (stmt_info);
 300
 301           /* Skip stmts which do not need to be vectorized.  */
 302           if ((!STMT_VINFO_RELEVANT_P (stmt_info)
 303                && !STMT_VINFO_LIVE_P (stmt_info))
 304               || gimple_clobber_p (stmt))
 305             {
 306               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 307                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 308                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 309                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 310                 {
 311                   stmt = pattern_stmt;
 312                   stmt_info = vinfo_for_stmt (pattern_stmt);
 313                   if (dump_enabled_p ())
 314                     {
 315                       dump_printf_loc (MSG_NOTE, vect_location,
 316                                        "==> examining pattern statement: ");
 317                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
 318                       dump_printf (MSG_NOTE, "\n");
 319                     }
 320                 }
 321               else
 322                 {
 323                   if (dump_enabled_p ())
 324                     dump_printf_loc (MSG_NOTE, vect_location, "skip.\n");
 325                   gsi_next (&si);
 326                   continue;
 327                 }
 328             }
 329           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
 330                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
 331                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
 332                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
 333             analyze_pattern_stmt = true;
 334
 335           /* If a pattern statement has def stmts, analyze them too.  */
 336           if (is_pattern_stmt_p (stmt_info))
 337             {
 338               if (pattern_def_seq == NULL)
 339                 {
 340                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
 341                   pattern_def_si = gsi_start (pattern_def_seq);
 342                 }
 343               else if (!gsi_end_p (pattern_def_si))
 344                 gsi_next (&pattern_def_si);
 345               if (pattern_def_seq != NULL)
 346                 {
 347                   gimple pattern_def_stmt = NULL;
 348                   stmt_vec_info pattern_def_stmt_info = NULL;
 349
 350                   while (!gsi_end_p (pattern_def_si))
 351                     {
 352                       pattern_def_stmt = gsi_stmt (pattern_def_si);
 353                       pattern_def_stmt_info
 354                         = vinfo_for_stmt (pattern_def_stmt);
 355                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
 356                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
 357                         break;
 358                       gsi_next (&pattern_def_si);
 359                     }
 360
 361                   if (!gsi_end_p (pattern_def_si))
 362                     {
 363                       if (dump_enabled_p ())
 364                         {
 365                           dump_printf_loc (MSG_NOTE, vect_location,
 366                                            "==> examining pattern def stmt: ");
 367                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
 368                                             pattern_def_stmt, 0);
 369                           dump_printf (MSG_NOTE, "\n");
 370                         }
 371
 372                       stmt = pattern_def_stmt;
 373                       stmt_info = pattern_def_stmt_info;
 374                     }
 375                   else
 376                     {
 377                       pattern_def_si = gsi_none ();
 378                       analyze_pattern_stmt = false;
 379                     }
 380                 }
 381               else
 382                 analyze_pattern_stmt = false;
 383             }
 384
 385           if (gimple_get_lhs (stmt) == NULL_TREE
 386               /* MASK_STORE has no lhs, but is ok.  */
 387               && (!is_gimple_call (stmt)
 388                   || !gimple_call_internal_p (stmt)
 389                   || gimple_call_internal_fn (stmt) != IFN_MASK_STORE))
 390             {
 391               if (is_gimple_call (stmt))
 392                 {
 393                   /* Ignore calls with no lhs.  These must be calls to
 394                      #pragma omp simd functions, and what vectorization factor
 395                      it really needs can't be determined until
 396                      vectorizable_simd_clone_call.  */
 397                   if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 398                     {
 399                       pattern_def_seq = NULL;
 400                       gsi_next (&si);
 401                     }
 402                   continue;
 403                 }
 404               if (dump_enabled_p ())
 405                 {
 406                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 407                                    "not vectorized: irregular stmt.");
 408                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION,  TDF_SLIM, stmt,
 409                                     0);
 410                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 411                 }
 412               return false;
 413             }
 414
 415           if (VECTOR_MODE_P (TYPE_MODE (gimple_expr_type (stmt))))
 416             {
 417               if (dump_enabled_p ())
 418                 {
 419                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 420                                    "not vectorized: vector stmt in loop:");
 421                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, stmt, 0);
 422                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 423                 }
 424               return false;
 425             }
 426
 427           if (STMT_VINFO_VECTYPE (stmt_info))
 428             {
 429               /* The only case when a vectype had been already set is for stmts
 430                  that contain a dataref, or for "pattern-stmts" (stmts
 431                  generated by the vectorizer to represent/replace a certain
 432                  idiom).  */
 433               gcc_assert (STMT_VINFO_DATA_REF (stmt_info)
 434                           || is_pattern_stmt_p (stmt_info)
 435                           || !gsi_end_p (pattern_def_si));
 436               vectype = STMT_VINFO_VECTYPE (stmt_info);
 437             }
 438           else
 439             {
 440               gcc_assert (!STMT_VINFO_DATA_REF (stmt_info));
 441               if (is_gimple_call (stmt)
 442                   && gimple_call_internal_p (stmt)
 443                   && gimple_call_internal_fn (stmt) == IFN_MASK_STORE)
 444                 scalar_type = TREE_TYPE (gimple_call_arg (stmt, 3));
 445               else
 446                 scalar_type = TREE_TYPE (gimple_get_lhs (stmt));
 447               if (dump_enabled_p ())
 448                 {
 449                   dump_printf_loc (MSG_NOTE, vect_location,
 450                                    "get vectype for scalar type:  ");
 451                   dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 452                   dump_printf (MSG_NOTE, "\n");
 453                 }
 454               vectype = get_vectype_for_scalar_type (scalar_type);
 455               if (!vectype)
 456                 {
 457                   if (dump_enabled_p ())
 458                     {
 459                       dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 460                                        "not vectorized: unsupported "
 461                                        "data-type ");
 462                       dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 463                                          scalar_type);
 464                       dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 465                     }
 466                   return false;
 467                 }
 468
 469               STMT_VINFO_VECTYPE (stmt_info) = vectype;
 470
 471               if (dump_enabled_p ())
 472                 {
 473                   dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 474                   dump_generic_expr (MSG_NOTE, TDF_SLIM, vectype);
 475                   dump_printf (MSG_NOTE, "\n");
 476                 }
 477             }
 478
 479           /* The vectorization factor is according to the smallest
 480              scalar type (or the largest vector size, but we only
 481              support one vector size per loop).  */
 482           scalar_type = vect_get_smallest_scalar_type (stmt, &dummy,
 483                                                        &dummy);
 484           if (dump_enabled_p ())
 485             {
 486               dump_printf_loc (MSG_NOTE, vect_location,
 487                                "get vectype for scalar type:  ");
 488               dump_generic_expr (MSG_NOTE, TDF_SLIM, scalar_type);
 489               dump_printf (MSG_NOTE, "\n");
 490             }
 491           vf_vectype = get_vectype_for_scalar_type (scalar_type);
 492           if (!vf_vectype)
 493             {
 494               if (dump_enabled_p ())
 495                 {
 496                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 497                                    "not vectorized: unsupported data-type ");
 498                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 499                                      scalar_type);
 500                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 501                 }
 502               return false;
 503             }
 504
 505           if ((GET_MODE_SIZE (TYPE_MODE (vectype))
 506                != GET_MODE_SIZE (TYPE_MODE (vf_vectype))))
 507             {
 508               if (dump_enabled_p ())
 509                 {
 510                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 511                                    "not vectorized: different sized vector "
 512                                    "types in statement, ");
 513                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 514                                      vectype);
 515                   dump_printf (MSG_MISSED_OPTIMIZATION, " and ");
 516                   dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
 517                                      vf_vectype);
 518                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
 519                 }
 520               return false;
 521             }
 522
 523           if (dump_enabled_p ())
 524             {
 525               dump_printf_loc (MSG_NOTE, vect_location, "vectype: ");
 526               dump_generic_expr (MSG_NOTE, TDF_SLIM, vf_vectype);
 527               dump_printf (MSG_NOTE, "\n");
 528             }
 529
 530           nunits = TYPE_VECTOR_SUBPARTS (vf_vectype);
 531           if (dump_enabled_p ())
 532             dump_printf_loc (MSG_NOTE, vect_location, "nunits = %d\n", nunits);
 533           if (!vectorization_factor
 534               || (nunits > vectorization_factor))
 535             vectorization_factor = nunits;
 536
 537           if (!analyze_pattern_stmt && gsi_end_p (pattern_def_si))
 538             {
 539               pattern_def_seq = NULL;
 540               gsi_next (&si);
 541             }
 542         }
 543     }
 544
 545   /* TODO: Analyze cost. Decide if worth while to vectorize.  */
 546   if (dump_enabled_p ())
 547     dump_printf_loc (MSG_NOTE, vect_location, "vectorization factor = %d\n",
 548                      vectorization_factor);
 549   if (vectorization_factor <= 1)
 550     {
 551       if (dump_enabled_p ())
 552         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 553                          "not vectorized: unsupported data-type\n");
 554       return false;
 555     }
 556   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
 557
 558   return true;
 559 }
 560
 561
 562 /* Function vect_is_simple_iv_evolution.
 563
 564    FORNOW: A simple evolution of an induction variables in the loop is
 565    considered a polynomial evolution.  */
 566
 567 static bool
 568 vect_is_simple_iv_evolution (unsigned loop_nb, tree access_fn, tree * init,
 569                              tree * step)
 570 {
 571   tree init_expr;
 572   tree step_expr;
 573   tree evolution_part = evolution_part_in_loop_num (access_fn, loop_nb);
 574   basic_block bb;
 575
 576   /* When there is no evolution in this loop, the evolution function
 577      is not "simple".  */
 578   if (evolution_part == NULL_TREE)
 579     return false;
 580
 581   /* When the evolution is a polynomial of degree >= 2
 582      the evolution function is not "simple".  */
 583   if (tree_is_chrec (evolution_part))
 584     return false;
 585
 586   step_expr = evolution_part;
 587   init_expr = unshare_expr (initial_condition_in_loop_num (access_fn, loop_nb));
 588
 589   if (dump_enabled_p ())
 590     {
 591       dump_printf_loc (MSG_NOTE, vect_location, "step: ");
 592       dump_generic_expr (MSG_NOTE, TDF_SLIM, step_expr);
 593       dump_printf (MSG_NOTE, ",  init: ");
 594       dump_generic_expr (MSG_NOTE, TDF_SLIM, init_expr);
 595       dump_printf (MSG_NOTE, "\n");
 596     }
 597
 598   *init = init_expr;
 599   *step = step_expr;
 600
 601   if (TREE_CODE (step_expr) != INTEGER_CST
 602       && (TREE_CODE (step_expr) != SSA_NAME
 603           || ((bb = gimple_bb (SSA_NAME_DEF_STMT (step_expr)))
 604               && flow_bb_inside_loop_p (get_loop (cfun, loop_nb), bb))
 605           || (!INTEGRAL_TYPE_P (TREE_TYPE (step_expr))
 606               && (!SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr))
 607                   || !flag_associative_math)))
 608       && (TREE_CODE (step_expr) != REAL_CST
 609           || !flag_associative_math))
 610     {
 611       if (dump_enabled_p ())
 612         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 613                          "step unknown.\n");
 614       return false;
 615     }
 616
 617   return true;
 618 }
 619
 620 /* Function vect_analyze_scalar_cycles_1.
 621
 622    Examine the cross iteration def-use cycles of scalar variables
 623    in LOOP.  LOOP_VINFO represents the loop that is now being
 624    considered for vectorization (can be LOOP, or an outer-loop
 625    enclosing LOOP).  */
 626
 627 static void
 628 vect_analyze_scalar_cycles_1 (loop_vec_info loop_vinfo, struct loop *loop)
 629 {
 630   basic_block bb = loop->header;
 631   tree init, step;
 632   auto_vec<gimple, 64> worklist;
 633   gphi_iterator gsi;
 634   bool double_reduc;
 635
 636   if (dump_enabled_p ())
 637     dump_printf_loc (MSG_NOTE, vect_location,
 638                      "=== vect_analyze_scalar_cycles ===\n");
 639
 640   /* First - identify all inductions.  Reduction detection assumes that all the
 641      inductions have been identified, therefore, this order must not be
 642      changed.  */
 643   for (gsi = gsi_start_phis  (bb); !gsi_end_p (gsi); gsi_next (&gsi))
 644     {
 645       gphi *phi = gsi.phi ();
 646       tree access_fn = NULL;
 647       tree def = PHI_RESULT (phi);
 648       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 649
 650       if (dump_enabled_p ())
 651         {
 652           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 653           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 654           dump_printf (MSG_NOTE, "\n");
 655         }
 656
 657       /* Skip virtual phi's.  The data dependences that are associated with
 658          virtual defs/uses (i.e., memory accesses) are analyzed elsewhere.  */
 659       if (virtual_operand_p (def))
 660         continue;
 661
 662       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_unknown_def_type;
 663
 664       /* Analyze the evolution function.  */
 665       access_fn = analyze_scalar_evolution (loop, def);
 666       if (access_fn)
 667         {
 668           STRIP_NOPS (access_fn);
 669           if (dump_enabled_p ())
 670             {
 671               dump_printf_loc (MSG_NOTE, vect_location,
 672                                "Access function of PHI: ");
 673               dump_generic_expr (MSG_NOTE, TDF_SLIM, access_fn);
 674               dump_printf (MSG_NOTE, "\n");
 675             }
 676           STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo)
 677             = evolution_part_in_loop_num (access_fn, loop->num);
 678         }
 679
 680       if (!access_fn
 681           || !vect_is_simple_iv_evolution (loop->num, access_fn, &init, &step)
 682           || (LOOP_VINFO_LOOP (loop_vinfo) != loop
 683               && TREE_CODE (step) != INTEGER_CST))
 684         {
 685           worklist.safe_push (phi);
 686           continue;
 687         }
 688
 689       gcc_assert (STMT_VINFO_LOOP_PHI_EVOLUTION_PART (stmt_vinfo) != NULL_TREE);
 690
 691       if (dump_enabled_p ())
 692         dump_printf_loc (MSG_NOTE, vect_location, "Detected induction.\n");
 693       STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_induction_def;
 694     }
 695
 696
 697   /* Second - identify all reductions and nested cycles.  */
 698   while (worklist.length () > 0)
 699     {
 700       gimple phi = worklist.pop ();
 701       tree def = PHI_RESULT (phi);
 702       stmt_vec_info stmt_vinfo = vinfo_for_stmt (phi);
 703       gimple reduc_stmt;
 704       bool nested_cycle;
 705
 706       if (dump_enabled_p ())
 707         {
 708           dump_printf_loc (MSG_NOTE, vect_location, "Analyze phi: ");
 709           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
 710           dump_printf (MSG_NOTE, "\n");
 711         }
 712
 713       gcc_assert (!virtual_operand_p (def)
 714                   && STMT_VINFO_DEF_TYPE (stmt_vinfo) == vect_unknown_def_type);
 715
 716       nested_cycle = (loop != LOOP_VINFO_LOOP (loop_vinfo));
 717       reduc_stmt = vect_force_simple_reduction (loop_vinfo, phi, !nested_cycle,
 718                                                 &double_reduc);
 719       if (reduc_stmt)
 720         {
 721           if (double_reduc)
 722             {
 723               if (dump_enabled_p ())
 724                 dump_printf_loc (MSG_NOTE, vect_location,
 725                                  "Detected double reduction.\n");
 726
 727               STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_double_reduction_def;
 728               STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 729                                                     vect_double_reduction_def;
 730             }
 731           else
 732             {
 733               if (nested_cycle)
 734                 {
 735                   if (dump_enabled_p ())
 736                     dump_printf_loc (MSG_NOTE, vect_location,
 737                                      "Detected vectorizable nested cycle.\n");
 738
 739                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_nested_cycle;
 740                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 741                                                              vect_nested_cycle;
 742                 }
 743               else
 744                 {
 745                   if (dump_enabled_p ())
 746                     dump_printf_loc (MSG_NOTE, vect_location,
 747                                      "Detected reduction.\n");
 748
 749                   STMT_VINFO_DEF_TYPE (stmt_vinfo) = vect_reduction_def;
 750                   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (reduc_stmt)) =
 751                                                            vect_reduction_def;
 752                   /* Store the reduction cycles for possible vectorization in
 753                      loop-aware SLP.  */
 754                   LOOP_VINFO_REDUCTIONS (loop_vinfo).safe_push (reduc_stmt);
 755                 }
 756             }
 757         }
 758       else
 759         if (dump_enabled_p ())
 760           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
 761                            "Unknown def-use cycle pattern.\n");
 762     }
 763 }
 764
 765
 766 /* Function vect_analyze_scalar_cycles.
 767
 768    Examine the cross iteration def-use cycles of scalar variables, by
 769    analyzing the loop-header PHIs of scalar variables.  Classify each
 770    cycle as one of the following: invariant, induction, reduction, unknown.
 771    We do that for the loop represented by LOOP_VINFO, and also to its
 772    inner-loop, if exists.
 773    Examples for scalar cycles:
 774
 775    Example1: reduction:
 776
 777               loop1:
 778               for (i=0; i<N; i++)
 779                  sum += a[i];
 780
 781    Example2: induction:
 782
 783               loop2:
 784               for (i=0; i<N; i++)
 785                  a[i] = i;  */
 786
 787 static void
 788 vect_analyze_scalar_cycles (loop_vec_info loop_vinfo)
 789 {
 790   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
 791
 792   vect_analyze_scalar_cycles_1 (loop_vinfo, loop);
 793
 794   /* When vectorizing an outer-loop, the inner-loop is executed sequentially.
 795      Reductions in such inner-loop therefore have different properties than
 796      the reductions in the nest that gets vectorized:
 797      1. When vectorized, they are executed in the same order as in the original
 798         scalar loop, so we can't change the order of computation when
 799         vectorizing them.
 800      2. FIXME: Inner-loop reductions can be used in the inner-loop, so the
 801         current checks are too strict.  */
 802
 803   if (loop->inner)
 804     vect_analyze_scalar_cycles_1 (loop_vinfo, loop->inner);
 805 }
 806
 807 /* Transfer group and reduction information from STMT to its pattern stmt.  */
 808
 809 static void
 810 vect_fixup_reduc_chain (gimple stmt)
 811 {
 812   gimple firstp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 813   gimple stmtp;
 814   gcc_assert (!GROUP_FIRST_ELEMENT (vinfo_for_stmt (firstp))
 815               && GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
 816   GROUP_SIZE (vinfo_for_stmt (firstp)) = GROUP_SIZE (vinfo_for_stmt (stmt));
 817   do
 818     {
 819       stmtp = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 820       GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmtp)) = firstp;
 821       stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmt));
 822       if (stmt)
 823         GROUP_NEXT_ELEMENT (vinfo_for_stmt (stmtp))
 824           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
 825     }
 826   while (stmt);
 827   STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmtp)) = vect_reduction_def;
 828 }
 829
 830 /* Fixup scalar cycles that now have their stmts detected as patterns.  */
 831
 832 static void
 833 vect_fixup_scalar_cycles_with_patterns (loop_vec_info loop_vinfo)
 834 {
 835   gimple first;
 836   unsigned i;
 837
 838   FOR_EACH_VEC_ELT (LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo), i, first)
 839     if (STMT_VINFO_IN_PATTERN_P (vinfo_for_stmt (first)))
 840       {
 841         vect_fixup_reduc_chain (first);
 842         LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo)[i]
 843           = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (first));
 844       }
 845 }
 846
 847 /* Function vect_get_loop_niters.
 848
 849    Determine how many iterations the loop is executed and place it
 850    in NUMBER_OF_ITERATIONS.  Place the number of latch iterations
 851    in NUMBER_OF_ITERATIONSM1.
 852
 853    Return the loop exit condition.  */
 854
 855
 856 static gcond *
 857 vect_get_loop_niters (struct loop *loop, tree *number_of_iterations,
 858                       tree *number_of_iterationsm1)
 859 {
 860   tree niters;
 861
 862   if (dump_enabled_p ())
 863     dump_printf_loc (MSG_NOTE, vect_location,
 864                      "=== get_loop_niters ===\n");
 865
 866   niters = number_of_latch_executions (loop);
 867   *number_of_iterationsm1 = niters;
 868
 869   /* We want the number of loop header executions which is the number
 870      of latch executions plus one.
 871      ???  For UINT_MAX latch executions this number overflows to zero
 872      for loops like do { n++; } while (n != 0);  */
 873   if (niters && !chrec_contains_undetermined (niters))
 874     niters = fold_build2 (PLUS_EXPR, TREE_TYPE (niters), unshare_expr (niters),
 875                           build_int_cst (TREE_TYPE (niters), 1));
 876   *number_of_iterations = niters;
 877
 878   return get_loop_exit_condition (loop);
 879 }
 880
 881
 882 /* Function bb_in_loop_p
 883
 884    Used as predicate for dfs order traversal of the loop bbs.  */
 885
 886 static bool
 887 bb_in_loop_p (const_basic_block bb, const void *data)
 888 {
 889   const struct loop *const loop = (const struct loop *)data;
 890   if (flow_bb_inside_loop_p (loop, bb))
 891     return true;
 892   return false;
 893 }
 894
 895
 896 /* Function new_loop_vec_info.
 897
 898    Create and initialize a new loop_vec_info struct for LOOP, as well as
 899    stmt_vec_info structs for all the stmts in LOOP.  */
 900
 901 static loop_vec_info
 902 new_loop_vec_info (struct loop *loop)
 903 {
 904   loop_vec_info res;
 905   basic_block *bbs;
 906   gimple_stmt_iterator si;
 907   unsigned int i, nbbs;
 908
 909   res = (loop_vec_info) xcalloc (1, sizeof (struct _loop_vec_info));
 910   LOOP_VINFO_LOOP (res) = loop;
 911
 912   bbs = get_loop_body (loop);
 913
 914   /* Create/Update stmt_info for all stmts in the loop.  */
 915   for (i = 0; i < loop->num_nodes; i++)
 916     {
 917       basic_block bb = bbs[i];
 918
 919       /* BBs in a nested inner-loop will have been already processed (because
 920          we will have called vect_analyze_loop_form for any nested inner-loop).
 921          Therefore, for stmts in an inner-loop we just want to update the
 922          STMT_VINFO_LOOP_VINFO field of their stmt_info to point to the new
 923          loop_info of the outer-loop we are currently considering to vectorize
 924          (instead of the loop_info of the inner-loop).
 925          For stmts in other BBs we need to create a stmt_info from scratch.  */
 926       if (bb->loop_father != loop)
 927         {
 928           /* Inner-loop bb.  */
 929           gcc_assert (loop->inner && bb->loop_father == loop->inner);
 930           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 931             {
 932               gimple phi = gsi_stmt (si);
 933               stmt_vec_info stmt_info = vinfo_for_stmt (phi);
 934               loop_vec_info inner_loop_vinfo =
 935                 STMT_VINFO_LOOP_VINFO (stmt_info);
 936               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 937               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 938             }
 939           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 940            {
 941               gimple stmt = gsi_stmt (si);
 942               stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
 943               loop_vec_info inner_loop_vinfo =
 944                  STMT_VINFO_LOOP_VINFO (stmt_info);
 945               gcc_assert (loop->inner == LOOP_VINFO_LOOP (inner_loop_vinfo));
 946               STMT_VINFO_LOOP_VINFO (stmt_info) = res;
 947            }
 948         }
 949       else
 950         {
 951           /* bb in current nest.  */
 952           for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
 953             {
 954               gimple phi = gsi_stmt (si);
 955               gimple_set_uid (phi, 0);
 956               set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, res, NULL));
 957             }
 958
 959           for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
 960             {
 961               gimple stmt = gsi_stmt (si);
 962               gimple_set_uid (stmt, 0);
 963               set_vinfo_for_stmt (stmt, new_stmt_vec_info (stmt, res, NULL));
 964             }
 965         }
 966     }
 967
 968   /* CHECKME: We want to visit all BBs before their successors (except for
 969      latch blocks, for which this assertion wouldn't hold).  In the simple
 970      case of the loop forms we allow, a dfs order of the BBs would the same
 971      as reversed postorder traversal, so we are safe.  */
 972
 973    free (bbs);
 974    bbs = XCNEWVEC (basic_block, loop->num_nodes);
 975    nbbs = dfs_enumerate_from (loop->header, 0, bb_in_loop_p,
 976                               bbs, loop->num_nodes, loop);
 977    gcc_assert (nbbs == loop->num_nodes);
 978
 979   LOOP_VINFO_BBS (res) = bbs;
 980   LOOP_VINFO_NITERSM1 (res) = NULL;
 981   LOOP_VINFO_NITERS (res) = NULL;
 982   LOOP_VINFO_NITERS_UNCHANGED (res) = NULL;
 983   LOOP_VINFO_COST_MODEL_MIN_ITERS (res) = 0;
 984   LOOP_VINFO_COST_MODEL_THRESHOLD (res) = 0;
 985   LOOP_VINFO_VECTORIZABLE_P (res) = 0;
 986   LOOP_VINFO_PEELING_FOR_ALIGNMENT (res) = 0;
 987   LOOP_VINFO_VECT_FACTOR (res) = 0;
 988   LOOP_VINFO_LOOP_NEST (res).create (3);
 989   LOOP_VINFO_DATAREFS (res).create (10);
 990   LOOP_VINFO_DDRS (res).create (10 * 10);
 991   LOOP_VINFO_UNALIGNED_DR (res) = NULL;
 992   LOOP_VINFO_MAY_MISALIGN_STMTS (res).create (
 993              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIGNMENT_CHECKS));
 994   LOOP_VINFO_MAY_ALIAS_DDRS (res).create (
 995              PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
 996   LOOP_VINFO_GROUPED_STORES (res).create (10);
 997   LOOP_VINFO_REDUCTIONS (res).create (10);
 998   LOOP_VINFO_REDUCTION_CHAINS (res).create (10);
 999   LOOP_VINFO_SLP_INSTANCES (res).create (10);
1000   LOOP_VINFO_SLP_UNROLLING_FACTOR (res) = 1;
1001   LOOP_VINFO_TARGET_COST_DATA (res) = init_cost (loop);
1002   LOOP_VINFO_PEELING_FOR_GAPS (res) = false;
1003   LOOP_VINFO_PEELING_FOR_NITER (res) = false;
1004   LOOP_VINFO_OPERANDS_SWAPPED (res) = false;
1005
1006   return res;
1007 }
1008
1009
1010 /* Function destroy_loop_vec_info.
1011
1012    Free LOOP_VINFO struct, as well as all the stmt_vec_info structs of all the
1013    stmts in the loop.  */
1014
1015 void
1016 destroy_loop_vec_info (loop_vec_info loop_vinfo, bool clean_stmts)
1017 {
1018   struct loop *loop;
1019   basic_block *bbs;
1020   int nbbs;
1021   gimple_stmt_iterator si;
1022   int j;
1023   vec<slp_instance> slp_instances;
1024   slp_instance instance;
1025   bool swapped;
1026
1027   if (!loop_vinfo)
1028     return;
1029
1030   loop = LOOP_VINFO_LOOP (loop_vinfo);
1031
1032   bbs = LOOP_VINFO_BBS (loop_vinfo);
1033   nbbs = clean_stmts ? loop->num_nodes : 0;
1034   swapped = LOOP_VINFO_OPERANDS_SWAPPED (loop_vinfo);
1035
1036   for (j = 0; j < nbbs; j++)
1037     {
1038       basic_block bb = bbs[j];
1039       for (si = gsi_start_phis (bb); !gsi_end_p (si); gsi_next (&si))
1040         free_stmt_vec_info (gsi_stmt (si));
1041
1042       for (si = gsi_start_bb (bb); !gsi_end_p (si); )
1043         {
1044           gimple stmt = gsi_stmt (si);
1045
1046           /* We may have broken canonical form by moving a constant
1047              into RHS1 of a commutative op.  Fix such occurrences.  */
1048           if (swapped && is_gimple_assign (stmt))
1049             {
1050               enum tree_code code = gimple_assign_rhs_code (stmt);
1051
1052               if ((code == PLUS_EXPR
1053                    || code == POINTER_PLUS_EXPR
1054                    || code == MULT_EXPR)
1055                   && CONSTANT_CLASS_P (gimple_assign_rhs1 (stmt)))
1056                 swap_ssa_operands (stmt,
1057                                    gimple_assign_rhs1_ptr (stmt),
1058                                    gimple_assign_rhs2_ptr (stmt));
1059             }
1060
1061           /* Free stmt_vec_info.  */
1062           free_stmt_vec_info (stmt);
1063           gsi_next (&si);
1064         }
1065     }
1066
1067   free (LOOP_VINFO_BBS (loop_vinfo));
1068   vect_destroy_datarefs (loop_vinfo, NULL);
1069   free_dependence_relations (LOOP_VINFO_DDRS (loop_vinfo));
1070   LOOP_VINFO_LOOP_NEST (loop_vinfo).release ();
1071   LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).release ();
1072   LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).release ();
1073   slp_instances = LOOP_VINFO_SLP_INSTANCES (loop_vinfo);
1074   FOR_EACH_VEC_ELT (slp_instances, j, instance)
1075     vect_free_slp_instance (instance);
1076
1077   LOOP_VINFO_SLP_INSTANCES (loop_vinfo).release ();
1078   LOOP_VINFO_GROUPED_STORES (loop_vinfo).release ();
1079   LOOP_VINFO_REDUCTIONS (loop_vinfo).release ();
1080   LOOP_VINFO_REDUCTION_CHAINS (loop_vinfo).release ();
1081
1082   delete LOOP_VINFO_PEELING_HTAB (loop_vinfo);
1083   LOOP_VINFO_PEELING_HTAB (loop_vinfo) = NULL;
1084
1085   destroy_cost_data (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1086   loop_vinfo->scalar_cost_vec.release ();
1087
1088   free (loop_vinfo);
1089   loop->aux = NULL;
1090 }
1091
1092
1093 /* Calculate the cost of one scalar iteration of the loop.  */
1094 static void
1095 vect_get_single_scalar_iteration_cost (loop_vec_info loop_vinfo)
1096 {
1097   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1098   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1099   int nbbs = loop->num_nodes, factor, scalar_single_iter_cost = 0;
1100   int innerloop_iters, i;
1101
1102   /* Count statements in scalar loop.  Using this as scalar cost for a single
1103      iteration for now.
1104
1105      TODO: Add outer loop support.
1106
1107      TODO: Consider assigning different costs to different scalar
1108      statements.  */
1109
1110   /* FORNOW.  */
1111   innerloop_iters = 1;
1112   if (loop->inner)
1113     innerloop_iters = 50; /* FIXME */
1114
1115   for (i = 0; i < nbbs; i++)
1116     {
1117       gimple_stmt_iterator si;
1118       basic_block bb = bbs[i];
1119
1120       if (bb->loop_father == loop->inner)
1121         factor = innerloop_iters;
1122       else
1123         factor = 1;
1124
1125       for (si = gsi_start_bb (bb); !gsi_end_p (si); gsi_next (&si))
1126         {
1127           gimple stmt = gsi_stmt (si);
1128           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1129
1130           if (!is_gimple_assign (stmt) && !is_gimple_call (stmt))
1131             continue;
1132
1133           /* Skip stmts that are not vectorized inside the loop.  */
1134           if (stmt_info
1135               && !STMT_VINFO_RELEVANT_P (stmt_info)
1136               && (!STMT_VINFO_LIVE_P (stmt_info)
1137                   || !VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1138               && !STMT_VINFO_IN_PATTERN_P (stmt_info))
1139             continue;
1140
1141           vect_cost_for_stmt kind;
1142           if (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt)))
1143             {
1144               if (DR_IS_READ (STMT_VINFO_DATA_REF (vinfo_for_stmt (stmt))))
1145                kind = scalar_load;
1146              else
1147                kind = scalar_store;
1148             }
1149           else
1150             kind = scalar_stmt;
1151
1152           scalar_single_iter_cost
1153             += record_stmt_cost (&LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo),
1154                                  factor, kind, NULL, 0, vect_prologue);
1155         }
1156     }
1157   LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo)
1158     = scalar_single_iter_cost;
1159 }
1160
1161
1162 /* Function vect_analyze_loop_1.
1163
1164    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1165    for it. The different analyses will record information in the
1166    loop_vec_info struct.  This is a subset of the analyses applied in
1167    vect_analyze_loop, to be applied on an inner-loop nested in the loop
1168    that is now considered for (outer-loop) vectorization.  */
1169
1170 static loop_vec_info
1171 vect_analyze_loop_1 (struct loop *loop)
1172 {
1173   loop_vec_info loop_vinfo;
1174
1175   if (dump_enabled_p ())
1176     dump_printf_loc (MSG_NOTE, vect_location,
1177                      "===== analyze_loop_nest_1 =====\n");
1178
1179   /* Check the CFG characteristics of the loop (nesting, entry/exit, etc.  */
1180
1181   loop_vinfo = vect_analyze_loop_form (loop);
1182   if (!loop_vinfo)
1183     {
1184       if (dump_enabled_p ())
1185         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1186                          "bad inner-loop form.\n");
1187       return NULL;
1188     }
1189
1190   return loop_vinfo;
1191 }
1192
1193
1194 /* Function vect_analyze_loop_form.
1195
1196    Verify that certain CFG restrictions hold, including:
1197    - the loop has a pre-header
1198    - the loop has a single entry and exit
1199    - the loop exit condition is simple enough, and the number of iterations
1200      can be analyzed (a countable loop).  */
1201
1202 loop_vec_info
1203 vect_analyze_loop_form (struct loop *loop)
1204 {
1205   loop_vec_info loop_vinfo;
1206   gcond *loop_cond;
1207   tree number_of_iterations = NULL, number_of_iterationsm1 = NULL;
1208   loop_vec_info inner_loop_vinfo = NULL;
1209
1210   if (dump_enabled_p ())
1211     dump_printf_loc (MSG_NOTE, vect_location,
1212                      "=== vect_analyze_loop_form ===\n");
1213
1214   /* Different restrictions apply when we are considering an inner-most loop,
1215      vs. an outer (nested) loop.
1216      (FORNOW. May want to relax some of these restrictions in the future).  */
1217
1218   if (!loop->inner)
1219     {
1220       /* Inner-most loop.  We currently require that the number of BBs is
1221          exactly 2 (the header and latch).  Vectorizable inner-most loops
1222          look like this:
1223
1224                         (pre-header)
1225                            |
1226                           header <--------+
1227                            | |            |
1228                            | +--> latch --+
1229                            |
1230                         (exit-bb)  */
1231
1232       if (loop->num_nodes != 2)
1233         {
1234           if (dump_enabled_p ())
1235             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1236                              "not vectorized: control flow in loop.\n");
1237           return NULL;
1238         }
1239
1240       if (empty_block_p (loop->header))
1241         {
1242           if (dump_enabled_p ())
1243             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1244                              "not vectorized: empty loop.\n");
1245           return NULL;
1246         }
1247     }
1248   else
1249     {
1250       struct loop *innerloop = loop->inner;
1251       edge entryedge;
1252
1253       /* Nested loop. We currently require that the loop is doubly-nested,
1254          contains a single inner loop, and the number of BBs is exactly 5.
1255          Vectorizable outer-loops look like this:
1256
1257                         (pre-header)
1258                            |
1259                           header <---+
1260                            |         |
1261                           inner-loop |
1262                            |         |
1263                           tail ------+
1264                            |
1265                         (exit-bb)
1266
1267          The inner-loop has the properties expected of inner-most loops
1268          as described above.  */
1269
1270       if ((loop->inner)->inner || (loop->inner)->next)
1271         {
1272           if (dump_enabled_p ())
1273             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1274                              "not vectorized: multiple nested loops.\n");
1275           return NULL;
1276         }
1277
1278       /* Analyze the inner-loop.  */
1279       inner_loop_vinfo = vect_analyze_loop_1 (loop->inner);
1280       if (!inner_loop_vinfo)
1281         {
1282           if (dump_enabled_p ())
1283             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1284                              "not vectorized: Bad inner loop.\n");
1285           return NULL;
1286         }
1287
1288       if (!expr_invariant_in_loop_p (loop,
1289                                         LOOP_VINFO_NITERS (inner_loop_vinfo)))
1290         {
1291           if (dump_enabled_p ())
1292             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1293                              "not vectorized: inner-loop count not"
1294                              " invariant.\n");
1295           destroy_loop_vec_info (inner_loop_vinfo, true);
1296           return NULL;
1297         }
1298
1299       if (loop->num_nodes != 5)
1300         {
1301           if (dump_enabled_p ())
1302             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1303                              "not vectorized: control flow in loop.\n");
1304           destroy_loop_vec_info (inner_loop_vinfo, true);
1305           return NULL;
1306         }
1307
1308       gcc_assert (EDGE_COUNT (innerloop->header->preds) == 2);
1309       entryedge = EDGE_PRED (innerloop->header, 0);
1310       if (EDGE_PRED (innerloop->header, 0)->src == innerloop->latch)
1311         entryedge = EDGE_PRED (innerloop->header, 1);
1312
1313       if (entryedge->src != loop->header
1314           || !single_exit (innerloop)
1315           || single_exit (innerloop)->dest !=  EDGE_PRED (loop->latch, 0)->src)
1316         {
1317           if (dump_enabled_p ())
1318             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1319                              "not vectorized: unsupported outerloop form.\n");
1320           destroy_loop_vec_info (inner_loop_vinfo, true);
1321           return NULL;
1322         }
1323
1324       if (dump_enabled_p ())
1325         dump_printf_loc (MSG_NOTE, vect_location,
1326                          "Considering outer-loop vectorization.\n");
1327     }
1328
1329   if (!single_exit (loop)
1330       || EDGE_COUNT (loop->header->preds) != 2)
1331     {
1332       if (dump_enabled_p ())
1333         {
1334           if (!single_exit (loop))
1335             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1336                              "not vectorized: multiple exits.\n");
1337           else if (EDGE_COUNT (loop->header->preds) != 2)
1338             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1339                              "not vectorized: too many incoming edges.\n");
1340         }
1341       if (inner_loop_vinfo)
1342         destroy_loop_vec_info (inner_loop_vinfo, true);
1343       return NULL;
1344     }
1345
1346   /* We assume that the loop exit condition is at the end of the loop. i.e,
1347      that the loop is represented as a do-while (with a proper if-guard
1348      before the loop if needed), where the loop header contains all the
1349      executable statements, and the latch is empty.  */
1350   if (!empty_block_p (loop->latch)
1351       || !gimple_seq_empty_p (phi_nodes (loop->latch)))
1352     {
1353       if (dump_enabled_p ())
1354         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1355                          "not vectorized: latch block not empty.\n");
1356       if (inner_loop_vinfo)
1357         destroy_loop_vec_info (inner_loop_vinfo, true);
1358       return NULL;
1359     }
1360
1361   /* Make sure there exists a single-predecessor exit bb:  */
1362   if (!single_pred_p (single_exit (loop)->dest))
1363     {
1364       edge e = single_exit (loop);
1365       if (!(e->flags & EDGE_ABNORMAL))
1366         {
1367           split_loop_exit_edge (e);
1368           if (dump_enabled_p ())
1369             dump_printf (MSG_NOTE, "split exit edge.\n");
1370         }
1371       else
1372         {
1373           if (dump_enabled_p ())
1374             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1375                              "not vectorized: abnormal loop exit edge.\n");
1376           if (inner_loop_vinfo)
1377             destroy_loop_vec_info (inner_loop_vinfo, true);
1378           return NULL;
1379         }
1380     }
1381
1382   loop_cond = vect_get_loop_niters (loop, &number_of_iterations,
1383                                     &number_of_iterationsm1);
1384   if (!loop_cond)
1385     {
1386       if (dump_enabled_p ())
1387         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1388                          "not vectorized: complicated exit condition.\n");
1389       if (inner_loop_vinfo)
1390         destroy_loop_vec_info (inner_loop_vinfo, true);
1391       return NULL;
1392     }
1393
1394   if (!number_of_iterations
1395       || chrec_contains_undetermined (number_of_iterations))
1396     {
1397       if (dump_enabled_p ())
1398         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1399                          "not vectorized: number of iterations cannot be "
1400                          "computed.\n");
1401       if (inner_loop_vinfo)
1402         destroy_loop_vec_info (inner_loop_vinfo, true);
1403       return NULL;
1404     }
1405
1406   if (integer_zerop (number_of_iterations))
1407     {
1408       if (dump_enabled_p ())
1409         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1410                          "not vectorized: number of iterations = 0.\n");
1411       if (inner_loop_vinfo)
1412         destroy_loop_vec_info (inner_loop_vinfo, true);
1413       return NULL;
1414     }
1415
1416   loop_vinfo = new_loop_vec_info (loop);
1417   LOOP_VINFO_NITERSM1 (loop_vinfo) = number_of_iterationsm1;
1418   LOOP_VINFO_NITERS (loop_vinfo) = number_of_iterations;
1419   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = number_of_iterations;
1420
1421   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
1422     {
1423       if (dump_enabled_p ())
1424         {
1425           dump_printf_loc (MSG_NOTE, vect_location,
1426                            "Symbolic number of iterations is ");
1427           dump_generic_expr (MSG_NOTE, TDF_DETAILS, number_of_iterations);
1428           dump_printf (MSG_NOTE, "\n");
1429         }
1430     }
1431
1432   STMT_VINFO_TYPE (vinfo_for_stmt (loop_cond)) = loop_exit_ctrl_vec_info_type;
1433
1434   /* CHECKME: May want to keep it around it in the future.  */
1435   if (inner_loop_vinfo)
1436     destroy_loop_vec_info (inner_loop_vinfo, false);
1437
1438   gcc_assert (!loop->aux);
1439   loop->aux = loop_vinfo;
1440   return loop_vinfo;
1441 }
1442
1443 /* Scan the loop stmts and dependent on whether there are any (non-)SLP
1444    statements update the vectorization factor.  */
1445
1446 static void
1447 vect_update_vf_for_slp (loop_vec_info loop_vinfo)
1448 {
1449   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1450   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1451   int nbbs = loop->num_nodes;
1452   unsigned int vectorization_factor;
1453   int i;
1454
1455   if (dump_enabled_p ())
1456     dump_printf_loc (MSG_NOTE, vect_location,
1457                      "=== vect_update_vf_for_slp ===\n");
1458
1459   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1460   gcc_assert (vectorization_factor != 0);
1461
1462   /* If all the stmts in the loop can be SLPed, we perform only SLP, and
1463      vectorization factor of the loop is the unrolling factor required by
1464      the SLP instances.  If that unrolling factor is 1, we say, that we
1465      perform pure SLP on loop - cross iteration parallelism is not
1466      exploited.  */
1467   bool only_slp_in_loop = true;
1468   for (i = 0; i < nbbs; i++)
1469     {
1470       basic_block bb = bbs[i];
1471       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1472            gsi_next (&si))
1473         {
1474           gimple stmt = gsi_stmt (si);
1475           stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
1476           if (STMT_VINFO_IN_PATTERN_P (stmt_info)
1477               && STMT_VINFO_RELATED_STMT (stmt_info))
1478             {
1479               stmt = STMT_VINFO_RELATED_STMT (stmt_info);
1480               stmt_info = vinfo_for_stmt (stmt);
1481             }
1482           if ((STMT_VINFO_RELEVANT_P (stmt_info)
1483                || VECTORIZABLE_CYCLE_DEF (STMT_VINFO_DEF_TYPE (stmt_info)))
1484               && !PURE_SLP_STMT (stmt_info))
1485             /* STMT needs both SLP and loop-based vectorization.  */
1486             only_slp_in_loop = false;
1487         }
1488     }
1489
1490   if (only_slp_in_loop)
1491     vectorization_factor = LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo);
1492   else
1493     vectorization_factor
1494       = least_common_multiple (vectorization_factor,
1495                                LOOP_VINFO_SLP_UNROLLING_FACTOR (loop_vinfo));
1496
1497   LOOP_VINFO_VECT_FACTOR (loop_vinfo) = vectorization_factor;
1498   if (dump_enabled_p ())
1499     dump_printf_loc (MSG_NOTE, vect_location,
1500                      "Updating vectorization factor to %d\n",
1501                      vectorization_factor);
1502 }
1503
1504 /* Function vect_analyze_loop_operations.
1505
1506    Scan the loop stmts and make sure they are all vectorizable.  */
1507
1508 static bool
1509 vect_analyze_loop_operations (loop_vec_info loop_vinfo)
1510 {
1511   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
1512   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
1513   int nbbs = loop->num_nodes;
1514   unsigned int vectorization_factor;
1515   int i;
1516   stmt_vec_info stmt_info;
1517   bool need_to_vectorize = false;
1518   int min_profitable_iters;
1519   int min_scalar_loop_bound;
1520   unsigned int th;
1521   bool ok;
1522   HOST_WIDE_INT max_niter;
1523   HOST_WIDE_INT estimated_niter;
1524   int min_profitable_estimate;
1525
1526   if (dump_enabled_p ())
1527     dump_printf_loc (MSG_NOTE, vect_location,
1528                      "=== vect_analyze_loop_operations ===\n");
1529
1530   for (i = 0; i < nbbs; i++)
1531     {
1532       basic_block bb = bbs[i];
1533
1534       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
1535            gsi_next (&si))
1536         {
1537           gphi *phi = si.phi ();
1538           ok = true;
1539
1540           stmt_info = vinfo_for_stmt (phi);
1541           if (dump_enabled_p ())
1542             {
1543               dump_printf_loc (MSG_NOTE, vect_location, "examining phi: ");
1544               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
1545               dump_printf (MSG_NOTE, "\n");
1546             }
1547
1548           /* Inner-loop loop-closed exit phi in outer-loop vectorization
1549              (i.e., a phi in the tail of the outer-loop).  */
1550           if (! is_loop_header_bb_p (bb))
1551             {
1552               /* FORNOW: we currently don't support the case that these phis
1553                  are not used in the outerloop (unless it is double reduction,
1554                  i.e., this phi is vect_reduction_def), cause this case
1555                  requires to actually do something here.  */
1556               if ((!STMT_VINFO_RELEVANT_P (stmt_info)
1557                    || STMT_VINFO_LIVE_P (stmt_info))
1558                   && STMT_VINFO_DEF_TYPE (stmt_info)
1559                      != vect_double_reduction_def)
1560                 {
1561                   if (dump_enabled_p ())
1562                     dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1563                                      "Unsupported loop-closed phi in "
1564                                      "outer-loop.\n");
1565                   return false;
1566                 }
1567
1568               /* If PHI is used in the outer loop, we check that its operand
1569                  is defined in the inner loop.  */
1570               if (STMT_VINFO_RELEVANT_P (stmt_info))
1571                 {
1572                   tree phi_op;
1573                   gimple op_def_stmt;
1574
1575                   if (gimple_phi_num_args (phi) != 1)
1576                     return false;
1577
1578                   phi_op = PHI_ARG_DEF (phi, 0);
1579                   if (TREE_CODE (phi_op) != SSA_NAME)
1580                     return false;
1581
1582                   op_def_stmt = SSA_NAME_DEF_STMT (phi_op);
1583                   if (gimple_nop_p (op_def_stmt)
1584                       || !flow_bb_inside_loop_p (loop, gimple_bb (op_def_stmt))
1585                       || !vinfo_for_stmt (op_def_stmt))
1586                     return false;
1587
1588                   if (STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1589                         != vect_used_in_outer
1590                       && STMT_VINFO_RELEVANT (vinfo_for_stmt (op_def_stmt))
1591                            != vect_used_in_outer_by_reduction)
1592                     return false;
1593                 }
1594
1595               continue;
1596             }
1597
1598           gcc_assert (stmt_info);
1599
1600           if (STMT_VINFO_LIVE_P (stmt_info))
1601             {
1602               /* FORNOW: not yet supported.  */
1603               if (dump_enabled_p ())
1604                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1605                                  "not vectorized: value used after loop.\n");
1606               return false;
1607             }
1608
1609           if (STMT_VINFO_RELEVANT (stmt_info) == vect_used_in_scope
1610               && STMT_VINFO_DEF_TYPE (stmt_info) != vect_induction_def)
1611             {
1612               /* A scalar-dependence cycle that we don't support.  */
1613               if (dump_enabled_p ())
1614                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1615                                  "not vectorized: scalar dependence cycle.\n");
1616               return false;
1617             }
1618
1619           if (STMT_VINFO_RELEVANT_P (stmt_info))
1620             {
1621               need_to_vectorize = true;
1622               if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
1623                 ok = vectorizable_induction (phi, NULL, NULL);
1624             }
1625
1626           if (!ok)
1627             {
1628               if (dump_enabled_p ())
1629                 {
1630                   dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1631                                    "not vectorized: relevant phi not "
1632                                    "supported: ");
1633                   dump_gimple_stmt (MSG_MISSED_OPTIMIZATION, TDF_SLIM, phi, 0);
1634                   dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
1635                 }
1636               return false;
1637             }
1638         }
1639
1640       for (gimple_stmt_iterator si = gsi_start_bb (bb); !gsi_end_p (si);
1641            gsi_next (&si))
1642         {
1643           gimple stmt = gsi_stmt (si);
1644           if (!gimple_clobber_p (stmt)
1645               && !vect_analyze_stmt (stmt, &need_to_vectorize, NULL))
1646             return false;
1647         }
1648     } /* bbs */
1649
1650   /* All operations in the loop are either irrelevant (deal with loop
1651      control, or dead), or only used outside the loop and can be moved
1652      out of the loop (e.g. invariants, inductions).  The loop can be
1653      optimized away by scalar optimizations.  We're better off not
1654      touching this loop.  */
1655   if (!need_to_vectorize)
1656     {
1657       if (dump_enabled_p ())
1658         dump_printf_loc (MSG_NOTE, vect_location,
1659                          "All the computation can be taken out of the loop.\n");
1660       if (dump_enabled_p ())
1661         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1662                          "not vectorized: redundant loop. no profit to "
1663                          "vectorize.\n");
1664       return false;
1665     }
1666
1667   vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1668   gcc_assert (vectorization_factor != 0);
1669
1670   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo) && dump_enabled_p ())
1671     dump_printf_loc (MSG_NOTE, vect_location,
1672                      "vectorization_factor = %d, niters = "
1673                      HOST_WIDE_INT_PRINT_DEC "\n", vectorization_factor,
1674                      LOOP_VINFO_INT_NITERS (loop_vinfo));
1675
1676   if ((LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1677        && (LOOP_VINFO_INT_NITERS (loop_vinfo) < vectorization_factor))
1678       || ((max_niter = max_stmt_executions_int (loop)) != -1
1679           && (unsigned HOST_WIDE_INT) max_niter < vectorization_factor))
1680     {
1681       if (dump_enabled_p ())
1682         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1683                          "not vectorized: iteration count too small.\n");
1684       if (dump_enabled_p ())
1685         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1686                          "not vectorized: iteration count smaller than "
1687                          "vectorization factor.\n");
1688       return false;
1689     }
1690
1691   /* Analyze cost.  Decide if worth while to vectorize.  */
1692
1693   vect_estimate_min_profitable_iters (loop_vinfo, &min_profitable_iters,
1694                                       &min_profitable_estimate);
1695   LOOP_VINFO_COST_MODEL_MIN_ITERS (loop_vinfo) = min_profitable_iters;
1696
1697   if (min_profitable_iters < 0)
1698     {
1699       if (dump_enabled_p ())
1700         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1701                          "not vectorized: vectorization not profitable.\n");
1702       if (dump_enabled_p ())
1703         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1704                          "not vectorized: vector version will never be "
1705                          "profitable.\n");
1706       return false;
1707     }
1708
1709   min_scalar_loop_bound = ((PARAM_VALUE (PARAM_MIN_VECT_LOOP_BOUND)
1710                             * vectorization_factor) - 1);
1711
1712
1713   /* Use the cost model only if it is more conservative than user specified
1714      threshold.  */
1715
1716   th = (unsigned) min_scalar_loop_bound;
1717   if (min_profitable_iters
1718       && (!min_scalar_loop_bound
1719           || min_profitable_iters > min_scalar_loop_bound))
1720     th = (unsigned) min_profitable_iters;
1721
1722   LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) = th;
1723
1724   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1725       && LOOP_VINFO_INT_NITERS (loop_vinfo) <= th)
1726     {
1727       if (dump_enabled_p ())
1728         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1729                          "not vectorized: vectorization not profitable.\n");
1730       if (dump_enabled_p ())
1731         dump_printf_loc (MSG_NOTE, vect_location,
1732                          "not vectorized: iteration count smaller than user "
1733                          "specified loop bound parameter or minimum profitable "
1734                          "iterations (whichever is more conservative).\n");
1735       return false;
1736     }
1737
1738   if ((estimated_niter = estimated_stmt_executions_int (loop)) != -1
1739       && ((unsigned HOST_WIDE_INT) estimated_niter
1740           <= MAX (th, (unsigned)min_profitable_estimate)))
1741     {
1742       if (dump_enabled_p ())
1743         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1744                          "not vectorized: estimated iteration count too "
1745                          "small.\n");
1746       if (dump_enabled_p ())
1747         dump_printf_loc (MSG_NOTE, vect_location,
1748                          "not vectorized: estimated iteration count smaller "
1749                          "than specified loop bound parameter or minimum "
1750                          "profitable iterations (whichever is more "
1751                          "conservative).\n");
1752       return false;
1753     }
1754
1755   return true;
1756 }
1757
1758
1759 /* Function vect_analyze_loop_2.
1760
1761    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1762    for it.  The different analyses will record information in the
1763    loop_vec_info struct.  */
1764 static bool
1765 vect_analyze_loop_2 (loop_vec_info loop_vinfo)
1766 {
1767   bool ok;
1768   int max_vf = MAX_VECTORIZATION_FACTOR;
1769   int min_vf = 2;
1770   unsigned int th;
1771   unsigned int n_stmts = 0;
1772
1773   /* Find all data references in the loop (which correspond to vdefs/vuses)
1774      and analyze their evolution in the loop.  Also adjust the minimal
1775      vectorization factor according to the loads and stores.
1776
1777      FORNOW: Handle only simple, array references, which
1778      alignment can be forced, and aligned pointer-references.  */
1779
1780   ok = vect_analyze_data_refs (loop_vinfo, NULL, &min_vf, &n_stmts);
1781   if (!ok)
1782     {
1783       if (dump_enabled_p ())
1784         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1785                          "bad data references.\n");
1786       return false;
1787     }
1788
1789   /* Classify all cross-iteration scalar data-flow cycles.
1790      Cross-iteration cycles caused by virtual phis are analyzed separately.  */
1791
1792   vect_analyze_scalar_cycles (loop_vinfo);
1793
1794   vect_pattern_recog (loop_vinfo, NULL);
1795
1796   vect_fixup_scalar_cycles_with_patterns (loop_vinfo);
1797
1798   /* Analyze the access patterns of the data-refs in the loop (consecutive,
1799      complex, etc.). FORNOW: Only handle consecutive access pattern.  */
1800
1801   ok = vect_analyze_data_ref_accesses (loop_vinfo, NULL);
1802   if (!ok)
1803     {
1804       if (dump_enabled_p ())
1805         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1806                          "bad data access.\n");
1807       return false;
1808     }
1809
1810   /* Data-flow analysis to detect stmts that do not need to be vectorized.  */
1811
1812   ok = vect_mark_stmts_to_be_vectorized (loop_vinfo);
1813   if (!ok)
1814     {
1815       if (dump_enabled_p ())
1816         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1817                          "unexpected pattern.\n");
1818       return false;
1819     }
1820
1821   /* Analyze data dependences between the data-refs in the loop
1822      and adjust the maximum vectorization factor according to
1823      the dependences.
1824      FORNOW: fail at the first data dependence that we encounter.  */
1825
1826   ok = vect_analyze_data_ref_dependences (loop_vinfo, &max_vf);
1827   if (!ok
1828       || max_vf < min_vf)
1829     {
1830       if (dump_enabled_p ())
1831             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1832                              "bad data dependence.\n");
1833       return false;
1834     }
1835
1836   ok = vect_determine_vectorization_factor (loop_vinfo);
1837   if (!ok)
1838     {
1839       if (dump_enabled_p ())
1840         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1841                          "can't determine vectorization factor.\n");
1842       return false;
1843     }
1844   if (max_vf < LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1845     {
1846       if (dump_enabled_p ())
1847         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1848                          "bad data dependence.\n");
1849       return false;
1850     }
1851
1852   /* Check the SLP opportunities in the loop, analyze and build SLP trees.  */
1853   ok = vect_analyze_slp (loop_vinfo, NULL, n_stmts);
1854   if (!ok)
1855     return false;
1856
1857   /* If there are any SLP instances mark them as pure_slp.  */
1858   bool slp = vect_make_slp_decision (loop_vinfo);
1859   if (slp)
1860     {
1861       /* Find stmts that need to be both vectorized and SLPed.  */
1862       vect_detect_hybrid_slp (loop_vinfo);
1863
1864       /* Update the vectorization factor based on the SLP decision.  */
1865       vect_update_vf_for_slp (loop_vinfo);
1866     }
1867
1868   /* Analyze the alignment of the data-refs in the loop.
1869      Fail if a data reference is found that cannot be vectorized.  */
1870
1871   ok = vect_analyze_data_refs_alignment (loop_vinfo, NULL);
1872   if (!ok)
1873     {
1874       if (dump_enabled_p ())
1875         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1876                          "bad data alignment.\n");
1877       return false;
1878     }
1879
1880   /* Prune the list of ddrs to be tested at run-time by versioning for alias.
1881      It is important to call pruning after vect_analyze_data_ref_accesses,
1882      since we use grouping information gathered by interleaving analysis.  */
1883   ok = vect_prune_runtime_alias_test_list (loop_vinfo);
1884   if (!ok)
1885     {
1886       if (dump_enabled_p ())
1887         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1888                          "number of versioning for alias "
1889                          "run-time tests exceeds %d "
1890                          "(--param vect-max-version-for-alias-checks)\n",
1891                          PARAM_VALUE (PARAM_VECT_MAX_VERSION_FOR_ALIAS_CHECKS));
1892       return false;
1893     }
1894
1895   /* Compute the scalar iteration cost.  */
1896   vect_get_single_scalar_iteration_cost (loop_vinfo);
1897
1898   /* This pass will decide on using loop versioning and/or loop peeling in
1899      order to enhance the alignment of data references in the loop.  */
1900
1901   ok = vect_enhance_data_refs_alignment (loop_vinfo);
1902   if (!ok)
1903     {
1904       if (dump_enabled_p ())
1905         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1906                          "bad data alignment.\n");
1907       return false;
1908     }
1909
1910   if (slp)
1911     {
1912       /* Analyze operations in the SLP instances.  Note this may
1913          remove unsupported SLP instances which makes the above
1914          SLP kind detection invalid.  */
1915       unsigned old_size = LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length ();
1916       vect_slp_analyze_operations (LOOP_VINFO_SLP_INSTANCES (loop_vinfo),
1917                                    LOOP_VINFO_TARGET_COST_DATA (loop_vinfo));
1918       if (LOOP_VINFO_SLP_INSTANCES (loop_vinfo).length () != old_size)
1919         return false;
1920     }
1921
1922   /* Scan all the remaining operations in the loop that are not subject
1923      to SLP and make sure they are vectorizable.  */
1924   ok = vect_analyze_loop_operations (loop_vinfo);
1925   if (!ok)
1926     {
1927       if (dump_enabled_p ())
1928         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1929                          "bad operation or unsupported loop bound.\n");
1930       return false;
1931     }
1932
1933   /* Decide whether we need to create an epilogue loop to handle
1934      remaining scalar iterations.  */
1935   th = ((LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo) + 1)
1936         / LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1937        * LOOP_VINFO_VECT_FACTOR (loop_vinfo);
1938
1939   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
1940       && LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) > 0)
1941     {
1942       if (ctz_hwi (LOOP_VINFO_INT_NITERS (loop_vinfo)
1943                    - LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
1944           < exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo)))
1945         LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1946     }
1947   else if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo)
1948            || (tree_ctz (LOOP_VINFO_NITERS (loop_vinfo))
1949                < (unsigned)exact_log2 (LOOP_VINFO_VECT_FACTOR (loop_vinfo))
1950                /* In case of versioning, check if the maximum number of
1951                   iterations is greater than th.  If they are identical,
1952                   the epilogue is unnecessary.  */
1953                && ((!LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo)
1954                     && !LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
1955                    || (unsigned HOST_WIDE_INT)max_stmt_executions_int
1956                         (LOOP_VINFO_LOOP (loop_vinfo)) > th)))
1957     LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo) = true;
1958
1959   /* If an epilogue loop is required make sure we can create one.  */
1960   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
1961       || LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo))
1962     {
1963       if (dump_enabled_p ())
1964         dump_printf_loc (MSG_NOTE, vect_location, "epilog loop required\n");
1965       if (!vect_can_advance_ivs_p (loop_vinfo)
1966           || !slpeel_can_duplicate_loop_p (LOOP_VINFO_LOOP (loop_vinfo),
1967                                            single_exit (LOOP_VINFO_LOOP
1968                                                          (loop_vinfo))))
1969         {
1970           if (dump_enabled_p ())
1971             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
1972                              "not vectorized: can't create required "
1973                              "epilog loop\n");
1974           return false;
1975         }
1976     }
1977
1978   return true;
1979 }
1980
1981 /* Function vect_analyze_loop.
1982
1983    Apply a set of analyses on LOOP, and create a loop_vec_info struct
1984    for it.  The different analyses will record information in the
1985    loop_vec_info struct.  */
1986 loop_vec_info
1987 vect_analyze_loop (struct loop *loop)
1988 {
1989   loop_vec_info loop_vinfo;
1990   unsigned int vector_sizes;
1991
1992   /* Autodetect first vector size we try.  */
1993   current_vector_size = 0;
1994   vector_sizes = targetm.vectorize.autovectorize_vector_sizes ();
1995
1996   if (dump_enabled_p ())
1997     dump_printf_loc (MSG_NOTE, vect_location,
1998                      "===== analyze_loop_nest =====\n");
1999
2000   if (loop_outer (loop)
2001       && loop_vec_info_for_loop (loop_outer (loop))
2002       && LOOP_VINFO_VECTORIZABLE_P (loop_vec_info_for_loop (loop_outer (loop))))
2003     {
2004       if (dump_enabled_p ())
2005         dump_printf_loc (MSG_NOTE, vect_location,
2006                          "outer-loop already vectorized.\n");
2007       return NULL;
2008     }
2009
2010   while (1)
2011     {
2012       /* Check the CFG characteristics of the loop (nesting, entry/exit).  */
2013       loop_vinfo = vect_analyze_loop_form (loop);
2014       if (!loop_vinfo)
2015         {
2016           if (dump_enabled_p ())
2017             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2018                              "bad loop form.\n");
2019           return NULL;
2020         }
2021
2022       if (vect_analyze_loop_2 (loop_vinfo))
2023         {
2024           LOOP_VINFO_VECTORIZABLE_P (loop_vinfo) = 1;
2025
2026           return loop_vinfo;
2027         }
2028
2029       destroy_loop_vec_info (loop_vinfo, true);
2030
2031       vector_sizes &= ~current_vector_size;
2032       if (vector_sizes == 0
2033           || current_vector_size == 0)
2034         return NULL;
2035
2036       /* Try the next biggest vector size.  */
2037       current_vector_size = 1 << floor_log2 (vector_sizes);
2038       if (dump_enabled_p ())
2039         dump_printf_loc (MSG_NOTE, vect_location,
2040                          "***** Re-trying analysis with "
2041                          "vector size %d\n", current_vector_size);
2042     }
2043 }
2044
2045
2046 /* Function reduction_code_for_scalar_code
2047
2048    Input:
2049    CODE - tree_code of a reduction operations.
2050
2051    Output:
2052    REDUC_CODE - the corresponding tree-code to be used to reduce the
2053       vector of partial results into a single scalar result, or ERROR_MARK
2054       if the operation is a supported reduction operation, but does not have
2055       such a tree-code.
2056
2057    Return FALSE if CODE currently cannot be vectorized as reduction.  */
2058
2059 static bool
2060 reduction_code_for_scalar_code (enum tree_code code,
2061                                 enum tree_code *reduc_code)
2062 {
2063   switch (code)
2064     {
2065       case MAX_EXPR:
2066         *reduc_code = REDUC_MAX_EXPR;
2067         return true;
2068
2069       case MIN_EXPR:
2070         *reduc_code = REDUC_MIN_EXPR;
2071         return true;
2072
2073       case PLUS_EXPR:
2074         *reduc_code = REDUC_PLUS_EXPR;
2075         return true;
2076
2077       case MULT_EXPR:
2078       case MINUS_EXPR:
2079       case BIT_IOR_EXPR:
2080       case BIT_XOR_EXPR:
2081       case BIT_AND_EXPR:
2082         *reduc_code = ERROR_MARK;
2083         return true;
2084
2085       default:
2086        return false;
2087     }
2088 }
2089
2090
2091 /* Error reporting helper for vect_is_simple_reduction below.  GIMPLE statement
2092    STMT is printed with a message MSG. */
2093
2094 static void
2095 report_vect_op (int msg_type, gimple stmt, const char *msg)
2096 {
2097   dump_printf_loc (msg_type, vect_location, "%s", msg);
2098   dump_gimple_stmt (msg_type, TDF_SLIM, stmt, 0);
2099   dump_printf (msg_type, "\n");
2100 }
2101
2102
2103 /* Detect SLP reduction of the form:
2104
2105    #a1 = phi <a5, a0>
2106    a2 = operation (a1)
2107    a3 = operation (a2)
2108    a4 = operation (a3)
2109    a5 = operation (a4)
2110
2111    #a = phi <a5>
2112
2113    PHI is the reduction phi node (#a1 = phi <a5, a0> above)
2114    FIRST_STMT is the first reduction stmt in the chain
2115    (a2 = operation (a1)).
2116
2117    Return TRUE if a reduction chain was detected.  */
2118
2119 static bool
2120 vect_is_slp_reduction (loop_vec_info loop_info, gimple phi, gimple first_stmt)
2121 {
2122   struct loop *loop = (gimple_bb (phi))->loop_father;
2123   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2124   enum tree_code code;
2125   gimple current_stmt = NULL, loop_use_stmt = NULL, first, next_stmt;
2126   stmt_vec_info use_stmt_info, current_stmt_info;
2127   tree lhs;
2128   imm_use_iterator imm_iter;
2129   use_operand_p use_p;
2130   int nloop_uses, size = 0, n_out_of_loop_uses;
2131   bool found = false;
2132
2133   if (loop != vect_loop)
2134     return false;
2135
2136   lhs = PHI_RESULT (phi);
2137   code = gimple_assign_rhs_code (first_stmt);
2138   while (1)
2139     {
2140       nloop_uses = 0;
2141       n_out_of_loop_uses = 0;
2142       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
2143         {
2144           gimple use_stmt = USE_STMT (use_p);
2145           if (is_gimple_debug (use_stmt))
2146             continue;
2147
2148           /* Check if we got back to the reduction phi.  */
2149           if (use_stmt == phi)
2150             {
2151               loop_use_stmt = use_stmt;
2152               found = true;
2153               break;
2154             }
2155
2156           if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2157             {
2158               loop_use_stmt = use_stmt;
2159               nloop_uses++;
2160             }
2161            else
2162              n_out_of_loop_uses++;
2163
2164            /* There are can be either a single use in the loop or two uses in
2165               phi nodes.  */
2166            if (nloop_uses > 1 || (n_out_of_loop_uses && nloop_uses))
2167              return false;
2168         }
2169
2170       if (found)
2171         break;
2172
2173       /* We reached a statement with no loop uses.  */
2174       if (nloop_uses == 0)
2175         return false;
2176
2177       /* This is a loop exit phi, and we haven't reached the reduction phi.  */
2178       if (gimple_code (loop_use_stmt) == GIMPLE_PHI)
2179         return false;
2180
2181       if (!is_gimple_assign (loop_use_stmt)
2182           || code != gimple_assign_rhs_code (loop_use_stmt)
2183           || !flow_bb_inside_loop_p (loop, gimple_bb (loop_use_stmt)))
2184         return false;
2185
2186       /* Insert USE_STMT into reduction chain.  */
2187       use_stmt_info = vinfo_for_stmt (loop_use_stmt);
2188       if (current_stmt)
2189         {
2190           current_stmt_info = vinfo_for_stmt (current_stmt);
2191           GROUP_NEXT_ELEMENT (current_stmt_info) = loop_use_stmt;
2192           GROUP_FIRST_ELEMENT (use_stmt_info)
2193             = GROUP_FIRST_ELEMENT (current_stmt_info);
2194         }
2195       else
2196         GROUP_FIRST_ELEMENT (use_stmt_info) = loop_use_stmt;
2197
2198       lhs = gimple_assign_lhs (loop_use_stmt);
2199       current_stmt = loop_use_stmt;
2200       size++;
2201    }
2202
2203   if (!found || loop_use_stmt != phi || size < 2)
2204     return false;
2205
2206   /* Swap the operands, if needed, to make the reduction operand be the second
2207      operand.  */
2208   lhs = PHI_RESULT (phi);
2209   next_stmt = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2210   while (next_stmt)
2211     {
2212       if (gimple_assign_rhs2 (next_stmt) == lhs)
2213         {
2214           tree op = gimple_assign_rhs1 (next_stmt);
2215           gimple def_stmt = NULL;
2216
2217           if (TREE_CODE (op) == SSA_NAME)
2218             def_stmt = SSA_NAME_DEF_STMT (op);
2219
2220           /* Check that the other def is either defined in the loop
2221              ("vect_internal_def"), or it's an induction (defined by a
2222              loop-header phi-node).  */
2223           if (def_stmt
2224               && gimple_bb (def_stmt)
2225               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2226               && (is_gimple_assign (def_stmt)
2227                   || is_gimple_call (def_stmt)
2228                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2229                            == vect_induction_def
2230                   || (gimple_code (def_stmt) == GIMPLE_PHI
2231                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2232                                   == vect_internal_def
2233                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2234             {
2235               lhs = gimple_assign_lhs (next_stmt);
2236               next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2237               continue;
2238             }
2239
2240           return false;
2241         }
2242       else
2243         {
2244           tree op = gimple_assign_rhs2 (next_stmt);
2245           gimple def_stmt = NULL;
2246
2247           if (TREE_CODE (op) == SSA_NAME)
2248             def_stmt = SSA_NAME_DEF_STMT (op);
2249
2250           /* Check that the other def is either defined in the loop
2251             ("vect_internal_def"), or it's an induction (defined by a
2252             loop-header phi-node).  */
2253           if (def_stmt
2254               && gimple_bb (def_stmt)
2255               && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2256               && (is_gimple_assign (def_stmt)
2257                   || is_gimple_call (def_stmt)
2258                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2259                               == vect_induction_def
2260                   || (gimple_code (def_stmt) == GIMPLE_PHI
2261                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
2262                                   == vect_internal_def
2263                       && !is_loop_header_bb_p (gimple_bb (def_stmt)))))
2264             {
2265               if (dump_enabled_p ())
2266                 {
2267                   dump_printf_loc (MSG_NOTE, vect_location, "swapping oprnds: ");
2268                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, next_stmt, 0);
2269                   dump_printf (MSG_NOTE, "\n");
2270                 }
2271
2272               swap_ssa_operands (next_stmt,
2273                                  gimple_assign_rhs1_ptr (next_stmt),
2274                                  gimple_assign_rhs2_ptr (next_stmt));
2275               update_stmt (next_stmt);
2276
2277               if (CONSTANT_CLASS_P (gimple_assign_rhs1 (next_stmt)))
2278                 LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2279             }
2280           else
2281             return false;
2282         }
2283
2284       lhs = gimple_assign_lhs (next_stmt);
2285       next_stmt = GROUP_NEXT_ELEMENT (vinfo_for_stmt (next_stmt));
2286     }
2287
2288   /* Save the chain for further analysis in SLP detection.  */
2289   first = GROUP_FIRST_ELEMENT (vinfo_for_stmt (current_stmt));
2290   LOOP_VINFO_REDUCTION_CHAINS (loop_info).safe_push (first);
2291   GROUP_SIZE (vinfo_for_stmt (first)) = size;
2292
2293   return true;
2294 }
2295
2296
2297 /* Function vect_is_simple_reduction_1
2298
2299    (1) Detect a cross-iteration def-use cycle that represents a simple
2300    reduction computation.  We look for the following pattern:
2301
2302    loop_header:
2303      a1 = phi < a0, a2 >
2304      a3 = ...
2305      a2 = operation (a3, a1)
2306
2307    or
2308
2309    a3 = ...
2310    loop_header:
2311      a1 = phi < a0, a2 >
2312      a2 = operation (a3, a1)
2313
2314    such that:
2315    1. operation is commutative and associative and it is safe to
2316       change the order of the computation (if CHECK_REDUCTION is true)
2317    2. no uses for a2 in the loop (a2 is used out of the loop)
2318    3. no uses of a1 in the loop besides the reduction operation
2319    4. no uses of a1 outside the loop.
2320
2321    Conditions 1,4 are tested here.
2322    Conditions 2,3 are tested in vect_mark_stmts_to_be_vectorized.
2323
2324    (2) Detect a cross-iteration def-use cycle in nested loops, i.e.,
2325    nested cycles, if CHECK_REDUCTION is false.
2326
2327    (3) Detect cycles of phi nodes in outer-loop vectorization, i.e., double
2328    reductions:
2329
2330      a1 = phi < a0, a2 >
2331      inner loop (def of a3)
2332      a2 = phi < a3 >
2333
2334    If MODIFY is true it tries also to rework the code in-place to enable
2335    detection of more reduction patterns.  For the time being we rewrite
2336    "res -= RHS" into "rhs += -RHS" when it seems worthwhile.
2337 */
2338
2339 static gimple
2340 vect_is_simple_reduction_1 (loop_vec_info loop_info, gimple phi,
2341                             bool check_reduction, bool *double_reduc,
2342                             bool modify)
2343 {
2344   struct loop *loop = (gimple_bb (phi))->loop_father;
2345   struct loop *vect_loop = LOOP_VINFO_LOOP (loop_info);
2346   edge latch_e = loop_latch_edge (loop);
2347   tree loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
2348   gimple def_stmt, def1 = NULL, def2 = NULL;
2349   enum tree_code orig_code, code;
2350   tree op1, op2, op3 = NULL_TREE, op4 = NULL_TREE;
2351   tree type;
2352   int nloop_uses;
2353   tree name;
2354   imm_use_iterator imm_iter;
2355   use_operand_p use_p;
2356   bool phi_def;
2357
2358   *double_reduc = false;
2359
2360   /* If CHECK_REDUCTION is true, we assume inner-most loop vectorization,
2361      otherwise, we assume outer loop vectorization.  */
2362   gcc_assert ((check_reduction && loop == vect_loop)
2363               || (!check_reduction && flow_loop_nested_p (vect_loop, loop)));
2364
2365   name = PHI_RESULT (phi);
2366   /* ???  If there are no uses of the PHI result the inner loop reduction
2367      won't be detected as possibly double-reduction by vectorizable_reduction
2368      because that tries to walk the PHI arg from the preheader edge which
2369      can be constant.  See PR60382.  */
2370   if (has_zero_uses (name))
2371     return NULL;
2372   nloop_uses = 0;
2373   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2374     {
2375       gimple use_stmt = USE_STMT (use_p);
2376       if (is_gimple_debug (use_stmt))
2377         continue;
2378
2379       if (!flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2380         {
2381           if (dump_enabled_p ())
2382             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2383                              "intermediate value used outside loop.\n");
2384
2385           return NULL;
2386         }
2387
2388       nloop_uses++;
2389       if (nloop_uses > 1)
2390         {
2391           if (dump_enabled_p ())
2392             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2393                              "reduction used in loop.\n");
2394           return NULL;
2395         }
2396     }
2397
2398   if (TREE_CODE (loop_arg) != SSA_NAME)
2399     {
2400       if (dump_enabled_p ())
2401         {
2402           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2403                            "reduction: not ssa_name: ");
2404           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM, loop_arg);
2405           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
2406         }
2407       return NULL;
2408     }
2409
2410   def_stmt = SSA_NAME_DEF_STMT (loop_arg);
2411   if (!def_stmt)
2412     {
2413       if (dump_enabled_p ())
2414         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2415                          "reduction: no def_stmt.\n");
2416       return NULL;
2417     }
2418
2419   if (!is_gimple_assign (def_stmt) && gimple_code (def_stmt) != GIMPLE_PHI)
2420     {
2421       if (dump_enabled_p ())
2422         {
2423           dump_gimple_stmt (MSG_NOTE, TDF_SLIM, def_stmt, 0);
2424           dump_printf (MSG_NOTE, "\n");
2425         }
2426       return NULL;
2427     }
2428
2429   if (is_gimple_assign (def_stmt))
2430     {
2431       name = gimple_assign_lhs (def_stmt);
2432       phi_def = false;
2433     }
2434   else
2435     {
2436       name = PHI_RESULT (def_stmt);
2437       phi_def = true;
2438     }
2439
2440   nloop_uses = 0;
2441   FOR_EACH_IMM_USE_FAST (use_p, imm_iter, name)
2442     {
2443       gimple use_stmt = USE_STMT (use_p);
2444       if (is_gimple_debug (use_stmt))
2445         continue;
2446       if (flow_bb_inside_loop_p (loop, gimple_bb (use_stmt)))
2447         nloop_uses++;
2448       if (nloop_uses > 1)
2449         {
2450           if (dump_enabled_p ())
2451             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2452                              "reduction used in loop.\n");
2453           return NULL;
2454         }
2455     }
2456
2457   /* If DEF_STMT is a phi node itself, we expect it to have a single argument
2458      defined in the inner loop.  */
2459   if (phi_def)
2460     {
2461       op1 = PHI_ARG_DEF (def_stmt, 0);
2462
2463       if (gimple_phi_num_args (def_stmt) != 1
2464           || TREE_CODE (op1) != SSA_NAME)
2465         {
2466           if (dump_enabled_p ())
2467             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
2468                              "unsupported phi node definition.\n");
2469
2470           return NULL;
2471         }
2472
2473       def1 = SSA_NAME_DEF_STMT (op1);
2474       if (gimple_bb (def1)
2475           && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
2476           && loop->inner
2477           && flow_bb_inside_loop_p (loop->inner, gimple_bb (def1))
2478           && is_gimple_assign (def1))
2479         {
2480           if (dump_enabled_p ())
2481             report_vect_op (MSG_NOTE, def_stmt,
2482                             "detected double reduction: ");
2483
2484           *double_reduc = true;
2485           return def_stmt;
2486         }
2487
2488       return NULL;
2489     }
2490
2491   code = orig_code = gimple_assign_rhs_code (def_stmt);
2492
2493   /* We can handle "res -= x[i]", which is non-associative by
2494      simply rewriting this into "res += -x[i]".  Avoid changing
2495      gimple instruction for the first simple tests and only do this
2496      if we're allowed to change code at all.  */
2497   if (code == MINUS_EXPR
2498       && modify
2499       && (op1 = gimple_assign_rhs1 (def_stmt))
2500       && TREE_CODE (op1) == SSA_NAME
2501       && SSA_NAME_DEF_STMT (op1) == phi)
2502     code = PLUS_EXPR;
2503
2504   if (check_reduction
2505       && (!commutative_tree_code (code) || !associative_tree_code (code)))
2506     {
2507       if (dump_enabled_p ())
2508         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2509                         "reduction: not commutative/associative: ");
2510       return NULL;
2511     }
2512
2513   if (get_gimple_rhs_class (code) != GIMPLE_BINARY_RHS)
2514     {
2515       if (code != COND_EXPR)
2516         {
2517           if (dump_enabled_p ())
2518             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2519                             "reduction: not binary operation: ");
2520
2521           return NULL;
2522         }
2523
2524       op3 = gimple_assign_rhs1 (def_stmt);
2525       if (COMPARISON_CLASS_P (op3))
2526         {
2527           op4 = TREE_OPERAND (op3, 1);
2528           op3 = TREE_OPERAND (op3, 0);
2529         }
2530
2531       op1 = gimple_assign_rhs2 (def_stmt);
2532       op2 = gimple_assign_rhs3 (def_stmt);
2533
2534       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2535         {
2536           if (dump_enabled_p ())
2537             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2538                             "reduction: uses not ssa_names: ");
2539
2540           return NULL;
2541         }
2542     }
2543   else
2544     {
2545       op1 = gimple_assign_rhs1 (def_stmt);
2546       op2 = gimple_assign_rhs2 (def_stmt);
2547
2548       if (TREE_CODE (op1) != SSA_NAME && TREE_CODE (op2) != SSA_NAME)
2549         {
2550           if (dump_enabled_p ())
2551             report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2552                             "reduction: uses not ssa_names: ");
2553
2554           return NULL;
2555         }
2556    }
2557
2558   type = TREE_TYPE (gimple_assign_lhs (def_stmt));
2559   if ((TREE_CODE (op1) == SSA_NAME
2560        && !types_compatible_p (type,TREE_TYPE (op1)))
2561       || (TREE_CODE (op2) == SSA_NAME
2562           && !types_compatible_p (type, TREE_TYPE (op2)))
2563       || (op3 && TREE_CODE (op3) == SSA_NAME
2564           && !types_compatible_p (type, TREE_TYPE (op3)))
2565       || (op4 && TREE_CODE (op4) == SSA_NAME
2566           && !types_compatible_p (type, TREE_TYPE (op4))))
2567     {
2568       if (dump_enabled_p ())
2569         {
2570           dump_printf_loc (MSG_NOTE, vect_location,
2571                            "reduction: multiple types: operation type: ");
2572           dump_generic_expr (MSG_NOTE, TDF_SLIM, type);
2573           dump_printf (MSG_NOTE, ", operands types: ");
2574           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2575                              TREE_TYPE (op1));
2576           dump_printf (MSG_NOTE, ",");
2577           dump_generic_expr (MSG_NOTE, TDF_SLIM,
2578                              TREE_TYPE (op2));
2579           if (op3)
2580             {
2581               dump_printf (MSG_NOTE, ",");
2582               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2583                                  TREE_TYPE (op3));
2584             }
2585
2586           if (op4)
2587             {
2588               dump_printf (MSG_NOTE, ",");
2589               dump_generic_expr (MSG_NOTE, TDF_SLIM,
2590                                  TREE_TYPE (op4));
2591             }
2592           dump_printf (MSG_NOTE, "\n");
2593         }
2594
2595       return NULL;
2596     }
2597
2598   /* Check that it's ok to change the order of the computation.
2599      Generally, when vectorizing a reduction we change the order of the
2600      computation.  This may change the behavior of the program in some
2601      cases, so we need to check that this is ok.  One exception is when
2602      vectorizing an outer-loop: the inner-loop is executed sequentially,
2603      and therefore vectorizing reductions in the inner-loop during
2604      outer-loop vectorization is safe.  */
2605
2606   /* CHECKME: check for !flag_finite_math_only too?  */
2607   if (SCALAR_FLOAT_TYPE_P (type) && !flag_associative_math
2608       && check_reduction)
2609     {
2610       /* Changing the order of operations changes the semantics.  */
2611       if (dump_enabled_p ())
2612         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2613                         "reduction: unsafe fp math optimization: ");
2614       return NULL;
2615     }
2616   else if (INTEGRAL_TYPE_P (type) && TYPE_OVERFLOW_TRAPS (type)
2617            && check_reduction)
2618     {
2619       /* Changing the order of operations changes the semantics.  */
2620       if (dump_enabled_p ())
2621         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2622                         "reduction: unsafe int math optimization: ");
2623       return NULL;
2624     }
2625   else if (SAT_FIXED_POINT_TYPE_P (type) && check_reduction)
2626     {
2627       /* Changing the order of operations changes the semantics.  */
2628       if (dump_enabled_p ())
2629         report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2630                         "reduction: unsafe fixed-point math optimization: ");
2631       return NULL;
2632     }
2633
2634   /* If we detected "res -= x[i]" earlier, rewrite it into
2635      "res += -x[i]" now.  If this turns out to be useless reassoc
2636      will clean it up again.  */
2637   if (orig_code == MINUS_EXPR)
2638     {
2639       tree rhs = gimple_assign_rhs2 (def_stmt);
2640       tree negrhs = make_ssa_name (TREE_TYPE (rhs));
2641       gimple negate_stmt = gimple_build_assign (negrhs, NEGATE_EXPR, rhs);
2642       gimple_stmt_iterator gsi = gsi_for_stmt (def_stmt);
2643       set_vinfo_for_stmt (negate_stmt, new_stmt_vec_info (negate_stmt,
2644                                                           loop_info, NULL));
2645       gsi_insert_before (&gsi, negate_stmt, GSI_NEW_STMT);
2646       gimple_assign_set_rhs2 (def_stmt, negrhs);
2647       gimple_assign_set_rhs_code (def_stmt, PLUS_EXPR);
2648       update_stmt (def_stmt);
2649     }
2650
2651   /* Reduction is safe. We're dealing with one of the following:
2652      1) integer arithmetic and no trapv
2653      2) floating point arithmetic, and special flags permit this optimization
2654      3) nested cycle (i.e., outer loop vectorization).  */
2655   if (TREE_CODE (op1) == SSA_NAME)
2656     def1 = SSA_NAME_DEF_STMT (op1);
2657
2658   if (TREE_CODE (op2) == SSA_NAME)
2659     def2 = SSA_NAME_DEF_STMT (op2);
2660
2661   if (code != COND_EXPR
2662       && ((!def1 || gimple_nop_p (def1)) && (!def2 || gimple_nop_p (def2))))
2663     {
2664       if (dump_enabled_p ())
2665         report_vect_op (MSG_NOTE, def_stmt, "reduction: no defs for operands: ");
2666       return NULL;
2667     }
2668
2669   /* Check that one def is the reduction def, defined by PHI,
2670      the other def is either defined in the loop ("vect_internal_def"),
2671      or it's an induction (defined by a loop-header phi-node).  */
2672
2673   if (def2 && def2 == phi
2674       && (code == COND_EXPR
2675           || !def1 || gimple_nop_p (def1)
2676           || !flow_bb_inside_loop_p (loop, gimple_bb (def1))
2677           || (def1 && flow_bb_inside_loop_p (loop, gimple_bb (def1))
2678               && (is_gimple_assign (def1)
2679                   || is_gimple_call (def1)
2680                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2681                       == vect_induction_def
2682                   || (gimple_code (def1) == GIMPLE_PHI
2683                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def1))
2684                           == vect_internal_def
2685                       && !is_loop_header_bb_p (gimple_bb (def1)))))))
2686     {
2687       if (dump_enabled_p ())
2688         report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2689       return def_stmt;
2690     }
2691
2692   if (def1 && def1 == phi
2693       && (code == COND_EXPR
2694           || !def2 || gimple_nop_p (def2)
2695           || !flow_bb_inside_loop_p (loop, gimple_bb (def2))
2696           || (def2 && flow_bb_inside_loop_p (loop, gimple_bb (def2))
2697               && (is_gimple_assign (def2)
2698                   || is_gimple_call (def2)
2699                   || STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2700                       == vect_induction_def
2701                   || (gimple_code (def2) == GIMPLE_PHI
2702                       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def2))
2703                           == vect_internal_def
2704                       && !is_loop_header_bb_p (gimple_bb (def2)))))))
2705     {
2706       if (check_reduction)
2707         {
2708           /* Swap operands (just for simplicity - so that the rest of the code
2709              can assume that the reduction variable is always the last (second)
2710              argument).  */
2711           if (dump_enabled_p ())
2712             report_vect_op (MSG_NOTE, def_stmt,
2713                             "detected reduction: need to swap operands: ");
2714
2715           swap_ssa_operands (def_stmt, gimple_assign_rhs1_ptr (def_stmt),
2716                              gimple_assign_rhs2_ptr (def_stmt));
2717
2718           if (CONSTANT_CLASS_P (gimple_assign_rhs1 (def_stmt)))
2719             LOOP_VINFO_OPERANDS_SWAPPED (loop_info) = true;
2720         }
2721       else
2722         {
2723           if (dump_enabled_p ())
2724             report_vect_op (MSG_NOTE, def_stmt, "detected reduction: ");
2725         }
2726
2727       return def_stmt;
2728     }
2729
2730   /* Try to find SLP reduction chain.  */
2731   if (check_reduction && vect_is_slp_reduction (loop_info, phi, def_stmt))
2732     {
2733       if (dump_enabled_p ())
2734         report_vect_op (MSG_NOTE, def_stmt,
2735                         "reduction: detected reduction chain: ");
2736
2737       return def_stmt;
2738     }
2739
2740   if (dump_enabled_p ())
2741     report_vect_op (MSG_MISSED_OPTIMIZATION, def_stmt,
2742                     "reduction: unknown pattern: ");
2743
2744   return NULL;
2745 }
2746
2747 /* Wrapper around vect_is_simple_reduction_1, that won't modify code
2748    in-place.  Arguments as there.  */
2749
2750 static gimple
2751 vect_is_simple_reduction (loop_vec_info loop_info, gimple phi,
2752                           bool check_reduction, bool *double_reduc)
2753 {
2754   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2755                                      double_reduc, false);
2756 }
2757
2758 /* Wrapper around vect_is_simple_reduction_1, which will modify code
2759    in-place if it enables detection of more reductions.  Arguments
2760    as there.  */
2761
2762 gimple
2763 vect_force_simple_reduction (loop_vec_info loop_info, gimple phi,
2764                           bool check_reduction, bool *double_reduc)
2765 {
2766   return vect_is_simple_reduction_1 (loop_info, phi, check_reduction,
2767                                      double_reduc, true);
2768 }
2769
2770 /* Calculate cost of peeling the loop PEEL_ITERS_PROLOGUE times.  */
2771 int
2772 vect_get_known_peeling_cost (loop_vec_info loop_vinfo, int peel_iters_prologue,
2773                              int *peel_iters_epilogue,
2774                              stmt_vector_for_cost *scalar_cost_vec,
2775                              stmt_vector_for_cost *prologue_cost_vec,
2776                              stmt_vector_for_cost *epilogue_cost_vec)
2777 {
2778   int retval = 0;
2779   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2780
2781   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
2782     {
2783       *peel_iters_epilogue = vf/2;
2784       if (dump_enabled_p ())
2785         dump_printf_loc (MSG_NOTE, vect_location,
2786                          "cost model: epilogue peel iters set to vf/2 "
2787                          "because loop iterations are unknown .\n");
2788
2789       /* If peeled iterations are known but number of scalar loop
2790          iterations are unknown, count a taken branch per peeled loop.  */
2791       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2792                                  NULL, 0, vect_prologue);
2793       retval = record_stmt_cost (prologue_cost_vec, 1, cond_branch_taken,
2794                                  NULL, 0, vect_epilogue);
2795     }
2796   else
2797     {
2798       int niters = LOOP_VINFO_INT_NITERS (loop_vinfo);
2799       peel_iters_prologue = niters < peel_iters_prologue ?
2800                             niters : peel_iters_prologue;
2801       *peel_iters_epilogue = (niters - peel_iters_prologue) % vf;
2802       /* If we need to peel for gaps, but no peeling is required, we have to
2803          peel VF iterations.  */
2804       if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo) && !*peel_iters_epilogue)
2805         *peel_iters_epilogue = vf;
2806     }
2807
2808   stmt_info_for_cost *si;
2809   int j;
2810   if (peel_iters_prologue)
2811     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2812       retval += record_stmt_cost (prologue_cost_vec,
2813                                   si->count * peel_iters_prologue,
2814                                   si->kind, NULL, si->misalign,
2815                                   vect_prologue);
2816   if (*peel_iters_epilogue)
2817     FOR_EACH_VEC_ELT (*scalar_cost_vec, j, si)
2818       retval += record_stmt_cost (epilogue_cost_vec,
2819                                   si->count * *peel_iters_epilogue,
2820                                   si->kind, NULL, si->misalign,
2821                                   vect_epilogue);
2822
2823   return retval;
2824 }
2825
2826 /* Function vect_estimate_min_profitable_iters
2827
2828    Return the number of iterations required for the vector version of the
2829    loop to be profitable relative to the cost of the scalar version of the
2830    loop.  */
2831
2832 static void
2833 vect_estimate_min_profitable_iters (loop_vec_info loop_vinfo,
2834                                     int *ret_min_profitable_niters,
2835                                     int *ret_min_profitable_estimate)
2836 {
2837   int min_profitable_iters;
2838   int min_profitable_estimate;
2839   int peel_iters_prologue;
2840   int peel_iters_epilogue;
2841   unsigned vec_inside_cost = 0;
2842   int vec_outside_cost = 0;
2843   unsigned vec_prologue_cost = 0;
2844   unsigned vec_epilogue_cost = 0;
2845   int scalar_single_iter_cost = 0;
2846   int scalar_outside_cost = 0;
2847   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
2848   int npeel = LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo);
2849   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2850
2851   /* Cost model disabled.  */
2852   if (unlimited_cost_model (LOOP_VINFO_LOOP (loop_vinfo)))
2853     {
2854       dump_printf_loc (MSG_NOTE, vect_location, "cost model disabled.\n");
2855       *ret_min_profitable_niters = 0;
2856       *ret_min_profitable_estimate = 0;
2857       return;
2858     }
2859
2860   /* Requires loop versioning tests to handle misalignment.  */
2861   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo))
2862     {
2863       /*  FIXME: Make cost depend on complexity of individual check.  */
2864       unsigned len = LOOP_VINFO_MAY_MISALIGN_STMTS (loop_vinfo).length ();
2865       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2866                             vect_prologue);
2867       dump_printf (MSG_NOTE,
2868                    "cost model: Adding cost of checks for loop "
2869                    "versioning to treat misalignment.\n");
2870     }
2871
2872   /* Requires loop versioning with alias checks.  */
2873   if (LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2874     {
2875       /*  FIXME: Make cost depend on complexity of individual check.  */
2876       unsigned len = LOOP_VINFO_MAY_ALIAS_DDRS (loop_vinfo).length ();
2877       (void) add_stmt_cost (target_cost_data, len, vector_stmt, NULL, 0,
2878                             vect_prologue);
2879       dump_printf (MSG_NOTE,
2880                    "cost model: Adding cost of checks for loop "
2881                    "versioning aliasing.\n");
2882     }
2883
2884   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
2885       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
2886     (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken, NULL, 0,
2887                           vect_prologue);
2888
2889   /* Count statements in scalar loop.  Using this as scalar cost for a single
2890      iteration for now.
2891
2892      TODO: Add outer loop support.
2893
2894      TODO: Consider assigning different costs to different scalar
2895      statements.  */
2896
2897   scalar_single_iter_cost
2898     = LOOP_VINFO_SINGLE_SCALAR_ITERATION_COST (loop_vinfo);
2899
2900   /* Add additional cost for the peeled instructions in prologue and epilogue
2901      loop.
2902
2903      FORNOW: If we don't know the value of peel_iters for prologue or epilogue
2904      at compile-time - we assume it's vf/2 (the worst would be vf-1).
2905
2906      TODO: Build an expression that represents peel_iters for prologue and
2907      epilogue to be used in a run-time test.  */
2908
2909   if (npeel  < 0)
2910     {
2911       peel_iters_prologue = vf/2;
2912       dump_printf (MSG_NOTE, "cost model: "
2913                    "prologue peel iters set to vf/2.\n");
2914
2915       /* If peeling for alignment is unknown, loop bound of main loop becomes
2916          unknown.  */
2917       peel_iters_epilogue = vf/2;
2918       dump_printf (MSG_NOTE, "cost model: "
2919                    "epilogue peel iters set to vf/2 because "
2920                    "peeling for alignment is unknown.\n");
2921
2922       /* If peeled iterations are unknown, count a taken branch and a not taken
2923          branch per peeled loop. Even if scalar loop iterations are known,
2924          vector iterations are not known since peeled prologue iterations are
2925          not known. Hence guards remain the same.  */
2926       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2927                             NULL, 0, vect_prologue);
2928       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2929                             NULL, 0, vect_prologue);
2930       (void) add_stmt_cost (target_cost_data, 1, cond_branch_taken,
2931                             NULL, 0, vect_epilogue);
2932       (void) add_stmt_cost (target_cost_data, 1, cond_branch_not_taken,
2933                             NULL, 0, vect_epilogue);
2934       stmt_info_for_cost *si;
2935       int j;
2936       FOR_EACH_VEC_ELT (LOOP_VINFO_SCALAR_ITERATION_COST (loop_vinfo), j, si)
2937         {
2938           struct _stmt_vec_info *stmt_info
2939             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2940           (void) add_stmt_cost (target_cost_data,
2941                                 si->count * peel_iters_prologue,
2942                                 si->kind, stmt_info, si->misalign,
2943                                 vect_prologue);
2944           (void) add_stmt_cost (target_cost_data,
2945                                 si->count * peel_iters_epilogue,
2946                                 si->kind, stmt_info, si->misalign,
2947                                 vect_epilogue);
2948         }
2949     }
2950   else
2951     {
2952       stmt_vector_for_cost prologue_cost_vec, epilogue_cost_vec;
2953       stmt_info_for_cost *si;
2954       int j;
2955       void *data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
2956
2957       prologue_cost_vec.create (2);
2958       epilogue_cost_vec.create (2);
2959       peel_iters_prologue = npeel;
2960
2961       (void) vect_get_known_peeling_cost (loop_vinfo, peel_iters_prologue,
2962                                           &peel_iters_epilogue,
2963                                           &LOOP_VINFO_SCALAR_ITERATION_COST
2964                                             (loop_vinfo),
2965                                           &prologue_cost_vec,
2966                                           &epilogue_cost_vec);
2967
2968       FOR_EACH_VEC_ELT (prologue_cost_vec, j, si)
2969         {
2970           struct _stmt_vec_info *stmt_info
2971             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2972           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2973                                 si->misalign, vect_prologue);
2974         }
2975
2976       FOR_EACH_VEC_ELT (epilogue_cost_vec, j, si)
2977         {
2978           struct _stmt_vec_info *stmt_info
2979             = si->stmt ? vinfo_for_stmt (si->stmt) : NULL;
2980           (void) add_stmt_cost (data, si->count, si->kind, stmt_info,
2981                                 si->misalign, vect_epilogue);
2982         }
2983
2984       prologue_cost_vec.release ();
2985       epilogue_cost_vec.release ();
2986     }
2987
2988   /* FORNOW: The scalar outside cost is incremented in one of the
2989      following ways:
2990
2991      1. The vectorizer checks for alignment and aliasing and generates
2992      a condition that allows dynamic vectorization.  A cost model
2993      check is ANDED with the versioning condition.  Hence scalar code
2994      path now has the added cost of the versioning check.
2995
2996        if (cost > th & versioning_check)
2997          jmp to vector code
2998
2999      Hence run-time scalar is incremented by not-taken branch cost.
3000
3001      2. The vectorizer then checks if a prologue is required.  If the
3002      cost model check was not done before during versioning, it has to
3003      be done before the prologue check.
3004
3005        if (cost <= th)
3006          prologue = scalar_iters
3007        if (prologue == 0)
3008          jmp to vector code
3009        else
3010          execute prologue
3011        if (prologue == num_iters)
3012          go to exit
3013
3014      Hence the run-time scalar cost is incremented by a taken branch,
3015      plus a not-taken branch, plus a taken branch cost.
3016
3017      3. The vectorizer then checks if an epilogue is required.  If the
3018      cost model check was not done before during prologue check, it
3019      has to be done with the epilogue check.
3020
3021        if (prologue == 0)
3022          jmp to vector code
3023        else
3024          execute prologue
3025        if (prologue == num_iters)
3026          go to exit
3027        vector code:
3028          if ((cost <= th) | (scalar_iters-prologue-epilogue == 0))
3029            jmp to epilogue
3030
3031      Hence the run-time scalar cost should be incremented by 2 taken
3032      branches.
3033
3034      TODO: The back end may reorder the BBS's differently and reverse
3035      conditions/branch directions.  Change the estimates below to
3036      something more reasonable.  */
3037
3038   /* If the number of iterations is known and we do not do versioning, we can
3039      decide whether to vectorize at compile time.  Hence the scalar version
3040      do not carry cost model guard costs.  */
3041   if (!LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo)
3042       || LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3043       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3044     {
3045       /* Cost model check occurs at versioning.  */
3046       if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
3047           || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
3048         scalar_outside_cost += vect_get_stmt_cost (cond_branch_not_taken);
3049       else
3050         {
3051           /* Cost model check occurs at prologue generation.  */
3052           if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo) < 0)
3053             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken)
3054               + vect_get_stmt_cost (cond_branch_not_taken);
3055           /* Cost model check occurs at epilogue generation.  */
3056           else
3057             scalar_outside_cost += 2 * vect_get_stmt_cost (cond_branch_taken);
3058         }
3059     }
3060
3061   /* Complete the target-specific cost calculations.  */
3062   finish_cost (LOOP_VINFO_TARGET_COST_DATA (loop_vinfo), &vec_prologue_cost,
3063                &vec_inside_cost, &vec_epilogue_cost);
3064
3065   vec_outside_cost = (int)(vec_prologue_cost + vec_epilogue_cost);
3066
3067   if (dump_enabled_p ())
3068     {
3069       dump_printf_loc (MSG_NOTE, vect_location, "Cost model analysis: \n");
3070       dump_printf (MSG_NOTE, "  Vector inside of loop cost: %d\n",
3071                    vec_inside_cost);
3072       dump_printf (MSG_NOTE, "  Vector prologue cost: %d\n",
3073                    vec_prologue_cost);
3074       dump_printf (MSG_NOTE, "  Vector epilogue cost: %d\n",
3075                    vec_epilogue_cost);
3076       dump_printf (MSG_NOTE, "  Scalar iteration cost: %d\n",
3077                    scalar_single_iter_cost);
3078       dump_printf (MSG_NOTE, "  Scalar outside cost: %d\n",
3079                    scalar_outside_cost);
3080       dump_printf (MSG_NOTE, "  Vector outside cost: %d\n",
3081                    vec_outside_cost);
3082       dump_printf (MSG_NOTE, "  prologue iterations: %d\n",
3083                    peel_iters_prologue);
3084       dump_printf (MSG_NOTE, "  epilogue iterations: %d\n",
3085                    peel_iters_epilogue);
3086     }
3087
3088   /* Calculate number of iterations required to make the vector version
3089      profitable, relative to the loop bodies only.  The following condition
3090      must hold true:
3091      SIC * niters + SOC > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC
3092      where
3093      SIC = scalar iteration cost, VIC = vector iteration cost,
3094      VOC = vector outside cost, VF = vectorization factor,
3095      PL_ITERS = prologue iterations, EP_ITERS= epilogue iterations
3096      SOC = scalar outside cost for run time cost model check.  */
3097
3098   if ((scalar_single_iter_cost * vf) > (int) vec_inside_cost)
3099     {
3100       if (vec_outside_cost <= 0)
3101         min_profitable_iters = 1;
3102       else
3103         {
3104           min_profitable_iters = ((vec_outside_cost - scalar_outside_cost) * vf
3105                                   - vec_inside_cost * peel_iters_prologue
3106                                   - vec_inside_cost * peel_iters_epilogue)
3107                                  / ((scalar_single_iter_cost * vf)
3108                                     - vec_inside_cost);
3109
3110           if ((scalar_single_iter_cost * vf * min_profitable_iters)
3111               <= (((int) vec_inside_cost * min_profitable_iters)
3112                   + (((int) vec_outside_cost - scalar_outside_cost) * vf)))
3113             min_profitable_iters++;
3114         }
3115     }
3116   /* vector version will never be profitable.  */
3117   else
3118     {
3119       if (LOOP_VINFO_LOOP (loop_vinfo)->force_vectorize)
3120         warning_at (vect_location, OPT_Wopenmp_simd, "vectorization "
3121                     "did not happen for a simd loop");
3122
3123       if (dump_enabled_p ())
3124         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3125                          "cost model: the vector iteration cost = %d "
3126                          "divided by the scalar iteration cost = %d "
3127                          "is greater or equal to the vectorization factor = %d"
3128                          ".\n",
3129                          vec_inside_cost, scalar_single_iter_cost, vf);
3130       *ret_min_profitable_niters = -1;
3131       *ret_min_profitable_estimate = -1;
3132       return;
3133     }
3134
3135   dump_printf (MSG_NOTE,
3136                "  Calculated minimum iters for profitability: %d\n",
3137                min_profitable_iters);
3138
3139   min_profitable_iters =
3140         min_profitable_iters < vf ? vf : min_profitable_iters;
3141
3142   /* Because the condition we create is:
3143      if (niters <= min_profitable_iters)
3144        then skip the vectorized loop.  */
3145   min_profitable_iters--;
3146
3147   if (dump_enabled_p ())
3148     dump_printf_loc (MSG_NOTE, vect_location,
3149                      "  Runtime profitability threshold = %d\n",
3150                      min_profitable_iters);
3151
3152   *ret_min_profitable_niters = min_profitable_iters;
3153
3154   /* Calculate number of iterations required to make the vector version
3155      profitable, relative to the loop bodies only.
3156
3157      Non-vectorized variant is SIC * niters and it must win over vector
3158      variant on the expected loop trip count.  The following condition must hold true:
3159      SIC * niters > VIC * ((niters-PL_ITERS-EP_ITERS)/VF) + VOC + SOC  */
3160
3161   if (vec_outside_cost <= 0)
3162     min_profitable_estimate = 1;
3163   else
3164     {
3165       min_profitable_estimate = ((vec_outside_cost + scalar_outside_cost) * vf
3166                                  - vec_inside_cost * peel_iters_prologue
3167                                  - vec_inside_cost * peel_iters_epilogue)
3168                                  / ((scalar_single_iter_cost * vf)
3169                                    - vec_inside_cost);
3170     }
3171   min_profitable_estimate --;
3172   min_profitable_estimate = MAX (min_profitable_estimate, min_profitable_iters);
3173   if (dump_enabled_p ())
3174     dump_printf_loc (MSG_NOTE, vect_location,
3175                      "  Static estimate profitability threshold = %d\n",
3176                       min_profitable_iters);
3177
3178   *ret_min_profitable_estimate = min_profitable_estimate;
3179 }
3180
3181 /* Writes into SEL a mask for a vec_perm, equivalent to a vec_shr by OFFSET
3182    vector elements (not bits) for a vector of mode MODE.  */
3183 static void
3184 calc_vec_perm_mask_for_shift (enum machine_mode mode, unsigned int offset,
3185                               unsigned char *sel)
3186 {
3187   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3188
3189   for (i = 0; i < nelt; i++)
3190     sel[i] = (i + offset) & (2*nelt - 1);
3191 }
3192
3193 /* Checks whether the target supports whole-vector shifts for vectors of mode
3194    MODE.  This is the case if _either_ the platform handles vec_shr_optab, _or_
3195    it supports vec_perm_const with masks for all necessary shift amounts.  */
3196 static bool
3197 have_whole_vector_shift (enum machine_mode mode)
3198 {
3199   if (optab_handler (vec_shr_optab, mode) != CODE_FOR_nothing)
3200     return true;
3201
3202   if (direct_optab_handler (vec_perm_const_optab, mode) == CODE_FOR_nothing)
3203     return false;
3204
3205   unsigned int i, nelt = GET_MODE_NUNITS (mode);
3206   unsigned char *sel = XALLOCAVEC (unsigned char, nelt);
3207
3208   for (i = nelt/2; i >= 1; i/=2)
3209     {
3210       calc_vec_perm_mask_for_shift (mode, i, sel);
3211       if (!can_vec_perm_p (mode, false, sel))
3212         return false;
3213     }
3214   return true;
3215 }
3216
3217 /* Return the reduction operand (with index REDUC_INDEX) of STMT.  */
3218
3219 static tree
3220 get_reduction_op (gimple stmt, int reduc_index)
3221 {
3222   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
3223     {
3224     case GIMPLE_SINGLE_RHS:
3225       gcc_assert (TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt))
3226                   == ternary_op);
3227       return TREE_OPERAND (gimple_assign_rhs1 (stmt), reduc_index);
3228     case GIMPLE_UNARY_RHS:
3229       return gimple_assign_rhs1 (stmt);
3230     case GIMPLE_BINARY_RHS:
3231       return (reduc_index
3232               ? gimple_assign_rhs2 (stmt) : gimple_assign_rhs1 (stmt));
3233     case GIMPLE_TERNARY_RHS:
3234       return gimple_op (stmt, reduc_index + 1);
3235     default:
3236       gcc_unreachable ();
3237     }
3238 }
3239
3240 /* TODO: Close dependency between vect_model_*_cost and vectorizable_*
3241    functions. Design better to avoid maintenance issues.  */
3242
3243 /* Function vect_model_reduction_cost.
3244
3245    Models cost for a reduction operation, including the vector ops
3246    generated within the strip-mine loop, the initial definition before
3247    the loop, and the epilogue code that must be generated.  */
3248
3249 static bool
3250 vect_model_reduction_cost (stmt_vec_info stmt_info, enum tree_code reduc_code,
3251                            int ncopies, int reduc_index)
3252 {
3253   int prologue_cost = 0, epilogue_cost = 0;
3254   enum tree_code code;
3255   optab optab;
3256   tree vectype;
3257   gimple stmt, orig_stmt;
3258   tree reduction_op;
3259   machine_mode mode;
3260   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3261   struct loop *loop = NULL;
3262   void *target_cost_data;
3263
3264   if (loop_vinfo)
3265     {
3266       loop = LOOP_VINFO_LOOP (loop_vinfo);
3267       target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3268     }
3269   else
3270     target_cost_data = BB_VINFO_TARGET_COST_DATA (STMT_VINFO_BB_VINFO (stmt_info));
3271
3272   /* Cost of reduction op inside loop.  */
3273   unsigned inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3274                                         stmt_info, 0, vect_body);
3275   stmt = STMT_VINFO_STMT (stmt_info);
3276
3277   reduction_op = get_reduction_op (stmt, reduc_index);
3278
3279   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
3280   if (!vectype)
3281     {
3282       if (dump_enabled_p ())
3283         {
3284           dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
3285                            "unsupported data-type ");
3286           dump_generic_expr (MSG_MISSED_OPTIMIZATION, TDF_SLIM,
3287                              TREE_TYPE (reduction_op));
3288           dump_printf (MSG_MISSED_OPTIMIZATION, "\n");
3289         }
3290       return false;
3291    }
3292
3293   mode = TYPE_MODE (vectype);
3294   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
3295
3296   if (!orig_stmt)
3297     orig_stmt = STMT_VINFO_STMT (stmt_info);
3298
3299   code = gimple_assign_rhs_code (orig_stmt);
3300
3301   /* Add in cost for initial definition.  */
3302   prologue_cost += add_stmt_cost (target_cost_data, 1, scalar_to_vec,
3303                                   stmt_info, 0, vect_prologue);
3304
3305   /* Determine cost of epilogue code.
3306
3307      We have a reduction operator that will reduce the vector in one statement.
3308      Also requires scalar extract.  */
3309
3310   if (!loop || !nested_in_vect_loop_p (loop, orig_stmt))
3311     {
3312       if (reduc_code != ERROR_MARK)
3313         {
3314           epilogue_cost += add_stmt_cost (target_cost_data, 1, vector_stmt,
3315                                           stmt_info, 0, vect_epilogue);
3316           epilogue_cost += add_stmt_cost (target_cost_data, 1, vec_to_scalar,
3317                                           stmt_info, 0, vect_epilogue);
3318         }
3319       else
3320         {
3321           int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
3322           tree bitsize =
3323             TYPE_SIZE (TREE_TYPE (gimple_assign_lhs (orig_stmt)));
3324           int element_bitsize = tree_to_uhwi (bitsize);
3325           int nelements = vec_size_in_bits / element_bitsize;
3326
3327           optab = optab_for_tree_code (code, vectype, optab_default);
3328
3329           /* We have a whole vector shift available.  */
3330           if (VECTOR_MODE_P (mode)
3331               && optab_handler (optab, mode) != CODE_FOR_nothing
3332               && have_whole_vector_shift (mode))
3333             {
3334               /* Final reduction via vector shifts and the reduction operator.
3335                  Also requires scalar extract.  */
3336               epilogue_cost += add_stmt_cost (target_cost_data,
3337                                               exact_log2 (nelements) * 2,
3338                                               vector_stmt, stmt_info, 0,
3339                                               vect_epilogue);
3340               epilogue_cost += add_stmt_cost (target_cost_data, 1,
3341                                               vec_to_scalar, stmt_info, 0,
3342                                               vect_epilogue);
3343             }
3344           else
3345             /* Use extracts and reduction op for final reduction.  For N
3346                elements, we have N extracts and N-1 reduction ops.  */
3347             epilogue_cost += add_stmt_cost (target_cost_data,
3348                                             nelements + nelements - 1,
3349                                             vector_stmt, stmt_info, 0,
3350                                             vect_epilogue);
3351         }
3352     }
3353
3354   if (dump_enabled_p ())
3355     dump_printf (MSG_NOTE,
3356                  "vect_model_reduction_cost: inside_cost = %d, "
3357                  "prologue_cost = %d, epilogue_cost = %d .\n", inside_cost,
3358                  prologue_cost, epilogue_cost);
3359
3360   return true;
3361 }
3362
3363
3364 /* Function vect_model_induction_cost.
3365
3366    Models cost for induction operations.  */
3367
3368 static void
3369 vect_model_induction_cost (stmt_vec_info stmt_info, int ncopies)
3370 {
3371   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
3372   void *target_cost_data = LOOP_VINFO_TARGET_COST_DATA (loop_vinfo);
3373   unsigned inside_cost, prologue_cost;
3374
3375   /* loop cost for vec_loop.  */
3376   inside_cost = add_stmt_cost (target_cost_data, ncopies, vector_stmt,
3377                                stmt_info, 0, vect_body);
3378
3379   /* prologue cost for vec_init and vec_step.  */
3380   prologue_cost = add_stmt_cost (target_cost_data, 2, scalar_to_vec,
3381                                  stmt_info, 0, vect_prologue);
3382
3383   if (dump_enabled_p ())
3384     dump_printf_loc (MSG_NOTE, vect_location,
3385                      "vect_model_induction_cost: inside_cost = %d, "
3386                      "prologue_cost = %d .\n", inside_cost, prologue_cost);
3387 }
3388
3389
3390 /* Function get_initial_def_for_induction
3391
3392    Input:
3393    STMT - a stmt that performs an induction operation in the loop.
3394    IV_PHI - the initial value of the induction variable
3395
3396    Output:
3397    Return a vector variable, initialized with the first VF values of
3398    the induction variable.  E.g., for an iv with IV_PHI='X' and
3399    evolution S, for a vector of 4 units, we want to return:
3400    [X, X + S, X + 2*S, X + 3*S].  */
3401
3402 static tree
3403 get_initial_def_for_induction (gimple iv_phi)
3404 {
3405   stmt_vec_info stmt_vinfo = vinfo_for_stmt (iv_phi);
3406   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3407   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3408   tree vectype;
3409   int nunits;
3410   edge pe = loop_preheader_edge (loop);
3411   struct loop *iv_loop;
3412   basic_block new_bb;
3413   tree new_vec, vec_init, vec_step, t;
3414   tree new_var;
3415   tree new_name;
3416   gimple init_stmt, new_stmt;
3417   gphi *induction_phi;
3418   tree induc_def, vec_def, vec_dest;
3419   tree init_expr, step_expr;
3420   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
3421   int i;
3422   int ncopies;
3423   tree expr;
3424   stmt_vec_info phi_info = vinfo_for_stmt (iv_phi);
3425   bool nested_in_vect_loop = false;
3426   gimple_seq stmts = NULL;
3427   imm_use_iterator imm_iter;
3428   use_operand_p use_p;
3429   gimple exit_phi;
3430   edge latch_e;
3431   tree loop_arg;
3432   gimple_stmt_iterator si;
3433   basic_block bb = gimple_bb (iv_phi);
3434   tree stepvectype;
3435   tree resvectype;
3436
3437   /* Is phi in an inner-loop, while vectorizing an enclosing outer-loop?  */
3438   if (nested_in_vect_loop_p (loop, iv_phi))
3439     {
3440       nested_in_vect_loop = true;
3441       iv_loop = loop->inner;
3442     }
3443   else
3444     iv_loop = loop;
3445   gcc_assert (iv_loop == (gimple_bb (iv_phi))->loop_father);
3446
3447   latch_e = loop_latch_edge (iv_loop);
3448   loop_arg = PHI_ARG_DEF_FROM_EDGE (iv_phi, latch_e);
3449
3450   step_expr = STMT_VINFO_LOOP_PHI_EVOLUTION_PART (phi_info);
3451   gcc_assert (step_expr != NULL_TREE);
3452
3453   pe = loop_preheader_edge (iv_loop);
3454   init_expr = PHI_ARG_DEF_FROM_EDGE (iv_phi,
3455                                      loop_preheader_edge (iv_loop));
3456
3457   vectype = get_vectype_for_scalar_type (TREE_TYPE (init_expr));
3458   resvectype = get_vectype_for_scalar_type (TREE_TYPE (PHI_RESULT (iv_phi)));
3459   gcc_assert (vectype);
3460   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3461   ncopies = vf / nunits;
3462
3463   gcc_assert (phi_info);
3464   gcc_assert (ncopies >= 1);
3465
3466   /* Convert the step to the desired type.  */
3467   step_expr = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3468                                                   step_expr),
3469                                     &stmts, true, NULL_TREE);
3470   if (stmts)
3471     {
3472       new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3473       gcc_assert (!new_bb);
3474     }
3475
3476   /* Find the first insertion point in the BB.  */
3477   si = gsi_after_labels (bb);
3478
3479   /* Create the vector that holds the initial_value of the induction.  */
3480   if (nested_in_vect_loop)
3481     {
3482       /* iv_loop is nested in the loop to be vectorized.  init_expr had already
3483          been created during vectorization of previous stmts.  We obtain it
3484          from the STMT_VINFO_VEC_STMT of the defining stmt.  */
3485       vec_init = vect_get_vec_def_for_operand (init_expr, iv_phi, NULL);
3486       /* If the initial value is not of proper type, convert it.  */
3487       if (!useless_type_conversion_p (vectype, TREE_TYPE (vec_init)))
3488         {
3489           new_stmt
3490             = gimple_build_assign (vect_get_new_vect_var (vectype,
3491                                                           vect_simple_var,
3492                                                           "vec_iv_"),
3493                                    VIEW_CONVERT_EXPR,
3494                                    build1 (VIEW_CONVERT_EXPR, vectype,
3495                                            vec_init));
3496           vec_init = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3497           gimple_assign_set_lhs (new_stmt, vec_init);
3498           new_bb = gsi_insert_on_edge_immediate (loop_preheader_edge (iv_loop),
3499                                                  new_stmt);
3500           gcc_assert (!new_bb);
3501           set_vinfo_for_stmt (new_stmt,
3502                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3503         }
3504     }
3505   else
3506     {
3507       vec<constructor_elt, va_gc> *v;
3508
3509       /* iv_loop is the loop to be vectorized. Create:
3510          vec_init = [X, X+S, X+2*S, X+3*S] (S = step_expr, X = init_expr)  */
3511       new_var = vect_get_new_vect_var (TREE_TYPE (vectype),
3512                                        vect_scalar_var, "var_");
3513       new_name = force_gimple_operand (fold_convert (TREE_TYPE (vectype),
3514                                                      init_expr),
3515                                        &stmts, false, new_var);
3516       if (stmts)
3517         {
3518           new_bb = gsi_insert_seq_on_edge_immediate (pe, stmts);
3519           gcc_assert (!new_bb);
3520         }
3521
3522       vec_alloc (v, nunits);
3523       bool constant_p = is_gimple_min_invariant (new_name);
3524       CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3525       for (i = 1; i < nunits; i++)
3526         {
3527           /* Create: new_name_i = new_name + step_expr  */
3528           new_name = fold_build2 (PLUS_EXPR, TREE_TYPE (new_name),
3529                                   new_name, step_expr);
3530           if (!is_gimple_min_invariant (new_name))
3531             {
3532               init_stmt = gimple_build_assign (new_var, new_name);
3533               new_name = make_ssa_name (new_var, init_stmt);
3534               gimple_assign_set_lhs (init_stmt, new_name);
3535               new_bb = gsi_insert_on_edge_immediate (pe, init_stmt);
3536               gcc_assert (!new_bb);
3537               if (dump_enabled_p ())
3538                 {
3539                   dump_printf_loc (MSG_NOTE, vect_location,
3540                                    "created new init_stmt: ");
3541                   dump_gimple_stmt (MSG_NOTE, TDF_SLIM, init_stmt, 0);
3542                   dump_printf (MSG_NOTE, "\n");
3543                 }
3544               constant_p = false;
3545             }
3546           CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, new_name);
3547         }
3548       /* Create a vector from [new_name_0, new_name_1, ..., new_name_nunits-1]  */
3549       if (constant_p)
3550         new_vec = build_vector_from_ctor (vectype, v);
3551       else
3552         new_vec = build_constructor (vectype, v);
3553       vec_init = vect_init_vector (iv_phi, new_vec, vectype, NULL);
3554     }
3555
3556
3557   /* Create the vector that holds the step of the induction.  */
3558   if (nested_in_vect_loop)
3559     /* iv_loop is nested in the loop to be vectorized. Generate:
3560        vec_step = [S, S, S, S]  */
3561     new_name = step_expr;
3562   else
3563     {
3564       /* iv_loop is the loop to be vectorized. Generate:
3565           vec_step = [VF*S, VF*S, VF*S, VF*S]  */
3566       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3567         {
3568           expr = build_int_cst (integer_type_node, vf);
3569           expr = fold_convert (TREE_TYPE (step_expr), expr);
3570         }
3571       else
3572         expr = build_int_cst (TREE_TYPE (step_expr), vf);
3573       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3574                               expr, step_expr);
3575       if (TREE_CODE (step_expr) == SSA_NAME)
3576         new_name = vect_init_vector (iv_phi, new_name,
3577                                      TREE_TYPE (step_expr), NULL);
3578     }
3579
3580   t = unshare_expr (new_name);
3581   gcc_assert (CONSTANT_CLASS_P (new_name)
3582               || TREE_CODE (new_name) == SSA_NAME);
3583   stepvectype = get_vectype_for_scalar_type (TREE_TYPE (new_name));
3584   gcc_assert (stepvectype);
3585   new_vec = build_vector_from_val (stepvectype, t);
3586   vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3587
3588
3589   /* Create the following def-use cycle:
3590      loop prolog:
3591          vec_init = ...
3592          vec_step = ...
3593      loop:
3594          vec_iv = PHI <vec_init, vec_loop>
3595          ...
3596          STMT
3597          ...
3598          vec_loop = vec_iv + vec_step;  */
3599
3600   /* Create the induction-phi that defines the induction-operand.  */
3601   vec_dest = vect_get_new_vect_var (vectype, vect_simple_var, "vec_iv_");
3602   induction_phi = create_phi_node (vec_dest, iv_loop->header);
3603   set_vinfo_for_stmt (induction_phi,
3604                       new_stmt_vec_info (induction_phi, loop_vinfo, NULL));
3605   induc_def = PHI_RESULT (induction_phi);
3606
3607   /* Create the iv update inside the loop  */
3608   new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR, induc_def, vec_step);
3609   vec_def = make_ssa_name (vec_dest, new_stmt);
3610   gimple_assign_set_lhs (new_stmt, vec_def);
3611   gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3612   set_vinfo_for_stmt (new_stmt, new_stmt_vec_info (new_stmt, loop_vinfo,
3613                                                    NULL));
3614
3615   /* Set the arguments of the phi node:  */
3616   add_phi_arg (induction_phi, vec_init, pe, UNKNOWN_LOCATION);
3617   add_phi_arg (induction_phi, vec_def, loop_latch_edge (iv_loop),
3618                UNKNOWN_LOCATION);
3619
3620
3621   /* In case that vectorization factor (VF) is bigger than the number
3622      of elements that we can fit in a vectype (nunits), we have to generate
3623      more than one vector stmt - i.e - we need to "unroll" the
3624      vector stmt by a factor VF/nunits.  For more details see documentation
3625      in vectorizable_operation.  */
3626
3627   if (ncopies > 1)
3628     {
3629       stmt_vec_info prev_stmt_vinfo;
3630       /* FORNOW. This restriction should be relaxed.  */
3631       gcc_assert (!nested_in_vect_loop);
3632
3633       /* Create the vector that holds the step of the induction.  */
3634       if (SCALAR_FLOAT_TYPE_P (TREE_TYPE (step_expr)))
3635         {
3636           expr = build_int_cst (integer_type_node, nunits);
3637           expr = fold_convert (TREE_TYPE (step_expr), expr);
3638         }
3639       else
3640         expr = build_int_cst (TREE_TYPE (step_expr), nunits);
3641       new_name = fold_build2 (MULT_EXPR, TREE_TYPE (step_expr),
3642                               expr, step_expr);
3643       if (TREE_CODE (step_expr) == SSA_NAME)
3644         new_name = vect_init_vector (iv_phi, new_name,
3645                                      TREE_TYPE (step_expr), NULL);
3646       t = unshare_expr (new_name);
3647       gcc_assert (CONSTANT_CLASS_P (new_name)
3648                   || TREE_CODE (new_name) == SSA_NAME);
3649       new_vec = build_vector_from_val (stepvectype, t);
3650       vec_step = vect_init_vector (iv_phi, new_vec, stepvectype, NULL);
3651
3652       vec_def = induc_def;
3653       prev_stmt_vinfo = vinfo_for_stmt (induction_phi);
3654       for (i = 1; i < ncopies; i++)
3655         {
3656           /* vec_i = vec_prev + vec_step  */
3657           new_stmt = gimple_build_assign (vec_dest, PLUS_EXPR,
3658                                           vec_def, vec_step);
3659           vec_def = make_ssa_name (vec_dest, new_stmt);
3660           gimple_assign_set_lhs (new_stmt, vec_def);
3661
3662           gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3663           if (!useless_type_conversion_p (resvectype, vectype))
3664             {
3665               new_stmt
3666                 = gimple_build_assign
3667                         (vect_get_new_vect_var (resvectype, vect_simple_var,
3668                                                 "vec_iv_"),
3669                          VIEW_CONVERT_EXPR,
3670                          build1 (VIEW_CONVERT_EXPR, resvectype,
3671                                  gimple_assign_lhs (new_stmt)));
3672               gimple_assign_set_lhs (new_stmt,
3673                                      make_ssa_name
3674                                        (gimple_assign_lhs (new_stmt), new_stmt));
3675               gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3676             }
3677           set_vinfo_for_stmt (new_stmt,
3678                               new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3679           STMT_VINFO_RELATED_STMT (prev_stmt_vinfo) = new_stmt;
3680           prev_stmt_vinfo = vinfo_for_stmt (new_stmt);
3681         }
3682     }
3683
3684   if (nested_in_vect_loop)
3685     {
3686       /* Find the loop-closed exit-phi of the induction, and record
3687          the final vector of induction results:  */
3688       exit_phi = NULL;
3689       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
3690         {
3691           gimple use_stmt = USE_STMT (use_p);
3692           if (is_gimple_debug (use_stmt))
3693             continue;
3694
3695           if (!flow_bb_inside_loop_p (iv_loop, gimple_bb (use_stmt)))
3696             {
3697               exit_phi = use_stmt;
3698               break;
3699             }
3700         }
3701       if (exit_phi)
3702         {
3703           stmt_vec_info stmt_vinfo = vinfo_for_stmt (exit_phi);
3704           /* FORNOW. Currently not supporting the case that an inner-loop induction
3705              is not used in the outer-loop (i.e. only outside the outer-loop).  */
3706           gcc_assert (STMT_VINFO_RELEVANT_P (stmt_vinfo)
3707                       && !STMT_VINFO_LIVE_P (stmt_vinfo));
3708
3709           STMT_VINFO_VEC_STMT (stmt_vinfo) = new_stmt;
3710           if (dump_enabled_p ())
3711             {
3712               dump_printf_loc (MSG_NOTE, vect_location,
3713                                "vector of inductions after inner-loop:");
3714               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, new_stmt, 0);
3715               dump_printf (MSG_NOTE, "\n");
3716             }
3717         }
3718     }
3719
3720
3721   if (dump_enabled_p ())
3722     {
3723       dump_printf_loc (MSG_NOTE, vect_location,
3724                        "transform induction: created def-use cycle: ");
3725       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, induction_phi, 0);
3726       dump_printf (MSG_NOTE, "\n");
3727       dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
3728                         SSA_NAME_DEF_STMT (vec_def), 0);
3729       dump_printf (MSG_NOTE, "\n");
3730     }
3731
3732   STMT_VINFO_VEC_STMT (phi_info) = induction_phi;
3733   if (!useless_type_conversion_p (resvectype, vectype))
3734     {
3735       new_stmt = gimple_build_assign (vect_get_new_vect_var (resvectype,
3736                                                              vect_simple_var,
3737                                                              "vec_iv_"),
3738                                       VIEW_CONVERT_EXPR,
3739                                       build1 (VIEW_CONVERT_EXPR, resvectype,
3740                                               induc_def));
3741       induc_def = make_ssa_name (gimple_assign_lhs (new_stmt), new_stmt);
3742       gimple_assign_set_lhs (new_stmt, induc_def);
3743       si = gsi_after_labels (bb);
3744       gsi_insert_before (&si, new_stmt, GSI_SAME_STMT);
3745       set_vinfo_for_stmt (new_stmt,
3746                           new_stmt_vec_info (new_stmt, loop_vinfo, NULL));
3747       STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_stmt))
3748         = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (induction_phi));
3749     }
3750
3751   return induc_def;
3752 }
3753
3754
3755 /* Function get_initial_def_for_reduction
3756
3757    Input:
3758    STMT - a stmt that performs a reduction operation in the loop.
3759    INIT_VAL - the initial value of the reduction variable
3760
3761    Output:
3762    ADJUSTMENT_DEF - a tree that holds a value to be added to the final result
3763         of the reduction (used for adjusting the epilog - see below).
3764    Return a vector variable, initialized according to the operation that STMT
3765         performs. This vector will be used as the initial value of the
3766         vector of partial results.
3767
3768    Option1 (adjust in epilog): Initialize the vector as follows:
3769      add/bit or/xor:    [0,0,...,0,0]
3770      mult/bit and:      [1,1,...,1,1]
3771      min/max/cond_expr: [init_val,init_val,..,init_val,init_val]
3772    and when necessary (e.g. add/mult case) let the caller know
3773    that it needs to adjust the result by init_val.
3774
3775    Option2: Initialize the vector as follows:
3776      add/bit or/xor:    [init_val,0,0,...,0]
3777      mult/bit and:      [init_val,1,1,...,1]
3778      min/max/cond_expr: [init_val,init_val,...,init_val]
3779    and no adjustments are needed.
3780
3781    For example, for the following code:
3782
3783    s = init_val;
3784    for (i=0;i<n;i++)
3785      s = s + a[i];
3786
3787    STMT is 's = s + a[i]', and the reduction variable is 's'.
3788    For a vector of 4 units, we want to return either [0,0,0,init_val],
3789    or [0,0,0,0] and let the caller know that it needs to adjust
3790    the result at the end by 'init_val'.
3791
3792    FORNOW, we are using the 'adjust in epilog' scheme, because this way the
3793    initialization vector is simpler (same element in all entries), if
3794    ADJUSTMENT_DEF is not NULL, and Option2 otherwise.
3795
3796    A cost model should help decide between these two schemes.  */
3797
3798 tree
3799 get_initial_def_for_reduction (gimple stmt, tree init_val,
3800                                tree *adjustment_def)
3801 {
3802   stmt_vec_info stmt_vinfo = vinfo_for_stmt (stmt);
3803   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_vinfo);
3804   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
3805   tree scalar_type = TREE_TYPE (init_val);
3806   tree vectype = get_vectype_for_scalar_type (scalar_type);
3807   int nunits;
3808   enum tree_code code = gimple_assign_rhs_code (stmt);
3809   tree def_for_init;
3810   tree init_def;
3811   tree *elts;
3812   int i;
3813   bool nested_in_vect_loop = false;
3814   tree init_value;
3815   REAL_VALUE_TYPE real_init_val = dconst0;
3816   int int_init_val = 0;
3817   gimple def_stmt = NULL;
3818
3819   gcc_assert (vectype);
3820   nunits = TYPE_VECTOR_SUBPARTS (vectype);
3821
3822   gcc_assert (POINTER_TYPE_P (scalar_type) || INTEGRAL_TYPE_P (scalar_type)
3823               || SCALAR_FLOAT_TYPE_P (scalar_type));
3824
3825   if (nested_in_vect_loop_p (loop, stmt))
3826     nested_in_vect_loop = true;
3827   else
3828     gcc_assert (loop == (gimple_bb (stmt))->loop_father);
3829
3830   /* In case of double reduction we only create a vector variable to be put
3831      in the reduction phi node.  The actual statement creation is done in
3832      vect_create_epilog_for_reduction.  */
3833   if (adjustment_def && nested_in_vect_loop
3834       && TREE_CODE (init_val) == SSA_NAME
3835       && (def_stmt = SSA_NAME_DEF_STMT (init_val))
3836       && gimple_code (def_stmt) == GIMPLE_PHI
3837       && flow_bb_inside_loop_p (loop, gimple_bb (def_stmt))
3838       && vinfo_for_stmt (def_stmt)
3839       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_stmt))
3840           == vect_double_reduction_def)
3841     {
3842       *adjustment_def = NULL;
3843       return vect_create_destination_var (init_val, vectype);
3844     }
3845
3846   if (TREE_CONSTANT (init_val))
3847     {
3848       if (SCALAR_FLOAT_TYPE_P (scalar_type))
3849         init_value = build_real (scalar_type, TREE_REAL_CST (init_val));
3850       else
3851         init_value = build_int_cst (scalar_type, TREE_INT_CST_LOW (init_val));
3852     }
3853   else
3854     init_value = init_val;
3855
3856   switch (code)
3857     {
3858       case WIDEN_SUM_EXPR:
3859       case DOT_PROD_EXPR:
3860       case SAD_EXPR:
3861       case PLUS_EXPR:
3862       case MINUS_EXPR:
3863       case BIT_IOR_EXPR:
3864       case BIT_XOR_EXPR:
3865       case MULT_EXPR:
3866       case BIT_AND_EXPR:
3867         /* ADJUSMENT_DEF is NULL when called from
3868            vect_create_epilog_for_reduction to vectorize double reduction.  */
3869         if (adjustment_def)
3870           {
3871             if (nested_in_vect_loop)
3872               *adjustment_def = vect_get_vec_def_for_operand (init_val, stmt,
3873                                                               NULL);
3874             else
3875               *adjustment_def = init_val;
3876           }
3877
3878         if (code == MULT_EXPR)
3879           {
3880             real_init_val = dconst1;
3881             int_init_val = 1;
3882           }
3883
3884         if (code == BIT_AND_EXPR)
3885           int_init_val = -1;
3886
3887         if (SCALAR_FLOAT_TYPE_P (scalar_type))
3888           def_for_init = build_real (scalar_type, real_init_val);
3889         else
3890           def_for_init = build_int_cst (scalar_type, int_init_val);
3891
3892         /* Create a vector of '0' or '1' except the first element.  */
3893         elts = XALLOCAVEC (tree, nunits);
3894         for (i = nunits - 2; i >= 0; --i)
3895           elts[i + 1] = def_for_init;
3896
3897         /* Option1: the first element is '0' or '1' as well.  */
3898         if (adjustment_def)
3899           {
3900             elts[0] = def_for_init;
3901             init_def = build_vector (vectype, elts);
3902             break;
3903           }
3904
3905         /* Option2: the first element is INIT_VAL.  */
3906         elts[0] = init_val;
3907         if (TREE_CONSTANT (init_val))
3908           init_def = build_vector (vectype, elts);
3909         else
3910           {
3911             vec<constructor_elt, va_gc> *v;
3912             vec_alloc (v, nunits);
3913             CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, init_val);
3914             for (i = 1; i < nunits; ++i)
3915               CONSTRUCTOR_APPEND_ELT (v, NULL_TREE, elts[i]);
3916             init_def = build_constructor (vectype, v);
3917           }
3918
3919         break;
3920
3921       case MIN_EXPR:
3922       case MAX_EXPR:
3923       case COND_EXPR:
3924         if (adjustment_def)
3925           {
3926             *adjustment_def = NULL_TREE;
3927             init_def = vect_get_vec_def_for_operand (init_val, stmt, NULL);
3928             break;
3929           }
3930
3931         init_def = build_vector_from_val (vectype, init_value);
3932         break;
3933
3934       default:
3935         gcc_unreachable ();
3936     }
3937
3938   return init_def;
3939 }
3940
3941 /* Function vect_create_epilog_for_reduction
3942
3943    Create code at the loop-epilog to finalize the result of a reduction
3944    computation.
3945
3946    VECT_DEFS is list of vector of partial results, i.e., the lhs's of vector
3947      reduction statements.
3948    STMT is the scalar reduction stmt that is being vectorized.
3949    NCOPIES is > 1 in case the vectorization factor (VF) is bigger than the
3950      number of elements that we can fit in a vectype (nunits).  In this case
3951      we have to generate more than one vector stmt - i.e - we need to "unroll"
3952      the vector stmt by a factor VF/nunits.  For more details see documentation
3953      in vectorizable_operation.
3954    REDUC_CODE is the tree-code for the epilog reduction.
3955    REDUCTION_PHIS is a list of the phi-nodes that carry the reduction
3956      computation.
3957    REDUC_INDEX is the index of the operand in the right hand side of the
3958      statement that is defined by REDUCTION_PHI.
3959    DOUBLE_REDUC is TRUE if double reduction phi nodes should be handled.
3960    SLP_NODE is an SLP node containing a group of reduction statements. The
3961      first one in this group is STMT.
3962
3963    This function:
3964    1. Creates the reduction def-use cycles: sets the arguments for
3965       REDUCTION_PHIS:
3966       The loop-entry argument is the vectorized initial-value of the reduction.
3967       The loop-latch argument is taken from VECT_DEFS - the vector of partial
3968       sums.
3969    2. "Reduces" each vector of partial results VECT_DEFS into a single result,
3970       by applying the operation specified by REDUC_CODE if available, or by
3971       other means (whole-vector shifts or a scalar loop).
3972       The function also creates a new phi node at the loop exit to preserve
3973       loop-closed form, as illustrated below.
3974
3975      The flow at the entry to this function:
3976
3977         loop:
3978           vec_def = phi <null, null>            # REDUCTION_PHI
3979           VECT_DEF = vector_stmt                # vectorized form of STMT
3980           s_loop = scalar_stmt                  # (scalar) STMT
3981         loop_exit:
3982           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3983           use <s_out0>
3984           use <s_out0>
3985
3986      The above is transformed by this function into:
3987
3988         loop:
3989           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
3990           VECT_DEF = vector_stmt                # vectorized form of STMT
3991           s_loop = scalar_stmt                  # (scalar) STMT
3992         loop_exit:
3993           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
3994           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
3995           v_out2 = reduce <v_out1>
3996           s_out3 = extract_field <v_out2, 0>
3997           s_out4 = adjust_result <s_out3>
3998           use <s_out4>
3999           use <s_out4>
4000 */
4001
4002 static void
4003 vect_create_epilog_for_reduction (vec<tree> vect_defs, gimple stmt,
4004                                   int ncopies, enum tree_code reduc_code,
4005                                   vec<gimple> reduction_phis,
4006                                   int reduc_index, bool double_reduc,
4007                                   slp_tree slp_node)
4008 {
4009   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4010   stmt_vec_info prev_phi_info;
4011   tree vectype;
4012   machine_mode mode;
4013   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4014   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo), *outer_loop = NULL;
4015   basic_block exit_bb;
4016   tree scalar_dest;
4017   tree scalar_type;
4018   gimple new_phi = NULL, phi;
4019   gimple_stmt_iterator exit_gsi;
4020   tree vec_dest;
4021   tree new_temp = NULL_TREE, new_dest, new_name, new_scalar_dest;
4022   gimple epilog_stmt = NULL;
4023   enum tree_code code = gimple_assign_rhs_code (stmt);
4024   gimple exit_phi;
4025   tree bitsize;
4026   tree adjustment_def = NULL;
4027   tree vec_initial_def = NULL;
4028   tree reduction_op, expr, def;
4029   tree orig_name, scalar_result;
4030   imm_use_iterator imm_iter, phi_imm_iter;
4031   use_operand_p use_p, phi_use_p;
4032   gimple use_stmt, orig_stmt, reduction_phi = NULL;
4033   bool nested_in_vect_loop = false;
4034   auto_vec<gimple> new_phis;
4035   auto_vec<gimple> inner_phis;
4036   enum vect_def_type dt = vect_unknown_def_type;
4037   int j, i;
4038   auto_vec<tree> scalar_results;
4039   unsigned int group_size = 1, k, ratio;
4040   auto_vec<tree> vec_initial_defs;
4041   auto_vec<gimple> phis;
4042   bool slp_reduc = false;
4043   tree new_phi_result;
4044   gimple inner_phi = NULL;
4045
4046   if (slp_node)
4047     group_size = SLP_TREE_SCALAR_STMTS (slp_node).length ();
4048
4049   if (nested_in_vect_loop_p (loop, stmt))
4050     {
4051       outer_loop = loop;
4052       loop = loop->inner;
4053       nested_in_vect_loop = true;
4054       gcc_assert (!slp_node);
4055     }
4056
4057   reduction_op = get_reduction_op (stmt, reduc_index);
4058
4059   vectype = get_vectype_for_scalar_type (TREE_TYPE (reduction_op));
4060   gcc_assert (vectype);
4061   mode = TYPE_MODE (vectype);
4062
4063   /* 1. Create the reduction def-use cycle:
4064      Set the arguments of REDUCTION_PHIS, i.e., transform
4065
4066         loop:
4067           vec_def = phi <null, null>            # REDUCTION_PHI
4068           VECT_DEF = vector_stmt                # vectorized form of STMT
4069           ...
4070
4071      into:
4072
4073         loop:
4074           vec_def = phi <vec_init, VECT_DEF>    # REDUCTION_PHI
4075           VECT_DEF = vector_stmt                # vectorized form of STMT
4076           ...
4077
4078      (in case of SLP, do it for all the phis). */
4079
4080   /* Get the loop-entry arguments.  */
4081   if (slp_node)
4082     vect_get_vec_defs (reduction_op, NULL_TREE, stmt, &vec_initial_defs,
4083                        NULL, slp_node, reduc_index);
4084   else
4085     {
4086       vec_initial_defs.create (1);
4087      /* For the case of reduction, vect_get_vec_def_for_operand returns
4088         the scalar def before the loop, that defines the initial value
4089         of the reduction variable.  */
4090       vec_initial_def = vect_get_vec_def_for_operand (reduction_op, stmt,
4091                                                       &adjustment_def);
4092       vec_initial_defs.quick_push (vec_initial_def);
4093     }
4094
4095   /* Set phi nodes arguments.  */
4096   FOR_EACH_VEC_ELT (reduction_phis, i, phi)
4097     {
4098       tree vec_init_def, def;
4099       gimple_seq stmts;
4100       vec_init_def = force_gimple_operand (vec_initial_defs[i], &stmts,
4101                                            true, NULL_TREE);
4102       gsi_insert_seq_on_edge_immediate (loop_preheader_edge (loop), stmts);
4103       def = vect_defs[i];
4104       for (j = 0; j < ncopies; j++)
4105         {
4106           /* Set the loop-entry arg of the reduction-phi.  */
4107           add_phi_arg (as_a <gphi *> (phi), vec_init_def,
4108                        loop_preheader_edge (loop), UNKNOWN_LOCATION);
4109
4110           /* Set the loop-latch arg for the reduction-phi.  */
4111           if (j > 0)
4112             def = vect_get_vec_def_for_stmt_copy (vect_unknown_def_type, def);
4113
4114           add_phi_arg (as_a <gphi *> (phi), def, loop_latch_edge (loop),
4115                        UNKNOWN_LOCATION);
4116
4117           if (dump_enabled_p ())
4118             {
4119               dump_printf_loc (MSG_NOTE, vect_location,
4120                                "transform reduction: created def-use cycle: ");
4121               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
4122               dump_printf (MSG_NOTE, "\n");
4123               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, SSA_NAME_DEF_STMT (def), 0);
4124               dump_printf (MSG_NOTE, "\n");
4125             }
4126
4127           phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4128         }
4129     }
4130
4131   /* 2. Create epilog code.
4132         The reduction epilog code operates across the elements of the vector
4133         of partial results computed by the vectorized loop.
4134         The reduction epilog code consists of:
4135
4136         step 1: compute the scalar result in a vector (v_out2)
4137         step 2: extract the scalar result (s_out3) from the vector (v_out2)
4138         step 3: adjust the scalar result (s_out3) if needed.
4139
4140         Step 1 can be accomplished using one the following three schemes:
4141           (scheme 1) using reduc_code, if available.
4142           (scheme 2) using whole-vector shifts, if available.
4143           (scheme 3) using a scalar loop. In this case steps 1+2 above are
4144                      combined.
4145
4146           The overall epilog code looks like this:
4147
4148           s_out0 = phi <s_loop>         # original EXIT_PHI
4149           v_out1 = phi <VECT_DEF>       # NEW_EXIT_PHI
4150           v_out2 = reduce <v_out1>              # step 1
4151           s_out3 = extract_field <v_out2, 0>    # step 2
4152           s_out4 = adjust_result <s_out3>       # step 3
4153
4154           (step 3 is optional, and steps 1 and 2 may be combined).
4155           Lastly, the uses of s_out0 are replaced by s_out4.  */
4156
4157
4158   /* 2.1 Create new loop-exit-phis to preserve loop-closed form:
4159          v_out1 = phi <VECT_DEF>
4160          Store them in NEW_PHIS.  */
4161
4162   exit_bb = single_exit (loop)->dest;
4163   prev_phi_info = NULL;
4164   new_phis.create (vect_defs.length ());
4165   FOR_EACH_VEC_ELT (vect_defs, i, def)
4166     {
4167       for (j = 0; j < ncopies; j++)
4168         {
4169           tree new_def = copy_ssa_name (def);
4170           phi = create_phi_node (new_def, exit_bb);
4171           set_vinfo_for_stmt (phi, new_stmt_vec_info (phi, loop_vinfo, NULL));
4172           if (j == 0)
4173             new_phis.quick_push (phi);
4174           else
4175             {
4176               def = vect_get_vec_def_for_stmt_copy (dt, def);
4177               STMT_VINFO_RELATED_STMT (prev_phi_info) = phi;
4178             }
4179
4180           SET_PHI_ARG_DEF (phi, single_exit (loop)->dest_idx, def);
4181           prev_phi_info = vinfo_for_stmt (phi);
4182         }
4183     }
4184
4185   /* The epilogue is created for the outer-loop, i.e., for the loop being
4186      vectorized.  Create exit phis for the outer loop.  */
4187   if (double_reduc)
4188     {
4189       loop = outer_loop;
4190       exit_bb = single_exit (loop)->dest;
4191       inner_phis.create (vect_defs.length ());
4192       FOR_EACH_VEC_ELT (new_phis, i, phi)
4193         {
4194           tree new_result = copy_ssa_name (PHI_RESULT (phi));
4195           gphi *outer_phi = create_phi_node (new_result, exit_bb);
4196           SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4197                            PHI_RESULT (phi));
4198           set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4199                                                             loop_vinfo, NULL));
4200           inner_phis.quick_push (phi);
4201           new_phis[i] = outer_phi;
4202           prev_phi_info = vinfo_for_stmt (outer_phi);
4203           while (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi)))
4204             {
4205               phi = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (phi));
4206               new_result = copy_ssa_name (PHI_RESULT (phi));
4207               outer_phi = create_phi_node (new_result, exit_bb);
4208               SET_PHI_ARG_DEF (outer_phi, single_exit (loop)->dest_idx,
4209                                PHI_RESULT (phi));
4210               set_vinfo_for_stmt (outer_phi, new_stmt_vec_info (outer_phi,
4211                                                         loop_vinfo, NULL));
4212               STMT_VINFO_RELATED_STMT (prev_phi_info) = outer_phi;
4213               prev_phi_info = vinfo_for_stmt (outer_phi);
4214             }
4215         }
4216     }
4217
4218   exit_gsi = gsi_after_labels (exit_bb);
4219
4220   /* 2.2 Get the relevant tree-code to use in the epilog for schemes 2,3
4221          (i.e. when reduc_code is not available) and in the final adjustment
4222          code (if needed).  Also get the original scalar reduction variable as
4223          defined in the loop.  In case STMT is a "pattern-stmt" (i.e. - it
4224          represents a reduction pattern), the tree-code and scalar-def are
4225          taken from the original stmt that the pattern-stmt (STMT) replaces.
4226          Otherwise (it is a regular reduction) - the tree-code and scalar-def
4227          are taken from STMT.  */
4228
4229   orig_stmt = STMT_VINFO_RELATED_STMT (stmt_info);
4230   if (!orig_stmt)
4231     {
4232       /* Regular reduction  */
4233       orig_stmt = stmt;
4234     }
4235   else
4236     {
4237       /* Reduction pattern  */
4238       stmt_vec_info stmt_vinfo = vinfo_for_stmt (orig_stmt);
4239       gcc_assert (STMT_VINFO_IN_PATTERN_P (stmt_vinfo));
4240       gcc_assert (STMT_VINFO_RELATED_STMT (stmt_vinfo) == stmt);
4241     }
4242
4243   code = gimple_assign_rhs_code (orig_stmt);
4244   /* For MINUS_EXPR the initial vector is [init_val,0,...,0], therefore,
4245      partial results are added and not subtracted.  */
4246   if (code == MINUS_EXPR)
4247     code = PLUS_EXPR;
4248
4249   scalar_dest = gimple_assign_lhs (orig_stmt);
4250   scalar_type = TREE_TYPE (scalar_dest);
4251   scalar_results.create (group_size);
4252   new_scalar_dest = vect_create_destination_var (scalar_dest, NULL);
4253   bitsize = TYPE_SIZE (scalar_type);
4254
4255   /* In case this is a reduction in an inner-loop while vectorizing an outer
4256      loop - we don't need to extract a single scalar result at the end of the
4257      inner-loop (unless it is double reduction, i.e., the use of reduction is
4258      outside the outer-loop).  The final vector of partial results will be used
4259      in the vectorized outer-loop, or reduced to a scalar result at the end of
4260      the outer-loop.  */
4261   if (nested_in_vect_loop && !double_reduc)
4262     goto vect_finalize_reduction;
4263
4264   /* SLP reduction without reduction chain, e.g.,
4265      # a1 = phi <a2, a0>
4266      # b1 = phi <b2, b0>
4267      a2 = operation (a1)
4268      b2 = operation (b1)  */
4269   slp_reduc = (slp_node && !GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)));
4270
4271   /* In case of reduction chain, e.g.,
4272      # a1 = phi <a3, a0>
4273      a2 = operation (a1)
4274      a3 = operation (a2),
4275
4276      we may end up with more than one vector result.  Here we reduce them to
4277      one vector.  */
4278   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4279     {
4280       tree first_vect = PHI_RESULT (new_phis[0]);
4281       tree tmp;
4282       gassign *new_vec_stmt = NULL;
4283
4284       vec_dest = vect_create_destination_var (scalar_dest, vectype);
4285       for (k = 1; k < new_phis.length (); k++)
4286         {
4287           gimple next_phi = new_phis[k];
4288           tree second_vect = PHI_RESULT (next_phi);
4289
4290           tmp = build2 (code, vectype,  first_vect, second_vect);
4291           new_vec_stmt = gimple_build_assign (vec_dest, tmp);
4292           first_vect = make_ssa_name (vec_dest, new_vec_stmt);
4293           gimple_assign_set_lhs (new_vec_stmt, first_vect);
4294           gsi_insert_before (&exit_gsi, new_vec_stmt, GSI_SAME_STMT);
4295         }
4296
4297       new_phi_result = first_vect;
4298       if (new_vec_stmt)
4299         {
4300           new_phis.truncate (0);
4301           new_phis.safe_push (new_vec_stmt);
4302         }
4303     }
4304   else
4305     new_phi_result = PHI_RESULT (new_phis[0]);
4306
4307   /* 2.3 Create the reduction code, using one of the three schemes described
4308          above. In SLP we simply need to extract all the elements from the
4309          vector (without reducing them), so we use scalar shifts.  */
4310   if (reduc_code != ERROR_MARK && !slp_reduc)
4311     {
4312       tree tmp;
4313       tree vec_elem_type;
4314
4315       /*** Case 1:  Create:
4316            v_out2 = reduc_expr <v_out1>  */
4317
4318       if (dump_enabled_p ())
4319         dump_printf_loc (MSG_NOTE, vect_location,
4320                          "Reduce using direct vector reduction.\n");
4321
4322       vec_elem_type = TREE_TYPE (TREE_TYPE (new_phi_result));
4323       if (!useless_type_conversion_p (scalar_type, vec_elem_type))
4324         {
4325           tree tmp_dest =
4326               vect_create_destination_var (scalar_dest, vec_elem_type);
4327           tmp = build1 (reduc_code, vec_elem_type, new_phi_result);
4328           epilog_stmt = gimple_build_assign (tmp_dest, tmp);
4329           new_temp = make_ssa_name (tmp_dest, epilog_stmt);
4330           gimple_assign_set_lhs (epilog_stmt, new_temp);
4331           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4332
4333           tmp = build1 (NOP_EXPR, scalar_type, new_temp);
4334         }
4335       else
4336         tmp = build1 (reduc_code, scalar_type, new_phi_result);
4337       epilog_stmt = gimple_build_assign (new_scalar_dest, tmp);
4338       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4339       gimple_assign_set_lhs (epilog_stmt, new_temp);
4340       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4341       scalar_results.safe_push (new_temp);
4342     }
4343   else
4344     {
4345       bool reduce_with_shift = have_whole_vector_shift (mode);
4346       int element_bitsize = tree_to_uhwi (bitsize);
4347       int vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4348       tree vec_temp;
4349
4350       /* Regardless of whether we have a whole vector shift, if we're
4351          emulating the operation via tree-vect-generic, we don't want
4352          to use it.  Only the first round of the reduction is likely
4353          to still be profitable via emulation.  */
4354       /* ??? It might be better to emit a reduction tree code here, so that
4355          tree-vect-generic can expand the first round via bit tricks.  */
4356       if (!VECTOR_MODE_P (mode))
4357         reduce_with_shift = false;
4358       else
4359         {
4360           optab optab = optab_for_tree_code (code, vectype, optab_default);
4361           if (optab_handler (optab, mode) == CODE_FOR_nothing)
4362             reduce_with_shift = false;
4363         }
4364
4365       if (reduce_with_shift && !slp_reduc)
4366         {
4367           int nelements = vec_size_in_bits / element_bitsize;
4368           unsigned char *sel = XALLOCAVEC (unsigned char, nelements);
4369
4370           int elt_offset;
4371
4372           tree zero_vec = build_zero_cst (vectype);
4373           /*** Case 2: Create:
4374              for (offset = nelements/2; offset >= 1; offset/=2)
4375                 {
4376                   Create:  va' = vec_shift <va, offset>
4377                   Create:  va = vop <va, va'>
4378                 }  */
4379
4380           tree rhs;
4381
4382           if (dump_enabled_p ())
4383             dump_printf_loc (MSG_NOTE, vect_location,
4384                              "Reduce using vector shifts\n");
4385
4386           vec_dest = vect_create_destination_var (scalar_dest, vectype);
4387           new_temp = new_phi_result;
4388           for (elt_offset = nelements / 2;
4389                elt_offset >= 1;
4390                elt_offset /= 2)
4391             {
4392               calc_vec_perm_mask_for_shift (mode, elt_offset, sel);
4393               tree mask = vect_gen_perm_mask_any (vectype, sel);
4394               epilog_stmt = gimple_build_assign (vec_dest, VEC_PERM_EXPR,
4395                                                  new_temp, zero_vec, mask);
4396               new_name = make_ssa_name (vec_dest, epilog_stmt);
4397               gimple_assign_set_lhs (epilog_stmt, new_name);
4398               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4399
4400               epilog_stmt = gimple_build_assign (vec_dest, code, new_name,
4401                                                  new_temp);
4402               new_temp = make_ssa_name (vec_dest, epilog_stmt);
4403               gimple_assign_set_lhs (epilog_stmt, new_temp);
4404               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4405             }
4406
4407           /* 2.4  Extract the final scalar result.  Create:
4408              s_out3 = extract_field <v_out2, bitpos>  */
4409
4410           if (dump_enabled_p ())
4411             dump_printf_loc (MSG_NOTE, vect_location,
4412                              "extract scalar result\n");
4413
4414           rhs = build3 (BIT_FIELD_REF, scalar_type, new_temp,
4415                         bitsize, bitsize_zero_node);
4416           epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4417           new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4418           gimple_assign_set_lhs (epilog_stmt, new_temp);
4419           gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4420           scalar_results.safe_push (new_temp);
4421         }
4422       else
4423         {
4424           /*** Case 3: Create:
4425              s = extract_field <v_out2, 0>
4426              for (offset = element_size;
4427                   offset < vector_size;
4428                   offset += element_size;)
4429                {
4430                  Create:  s' = extract_field <v_out2, offset>
4431                  Create:  s = op <s, s'>  // For non SLP cases
4432                }  */
4433
4434           if (dump_enabled_p ())
4435             dump_printf_loc (MSG_NOTE, vect_location,
4436                              "Reduce using scalar code.\n");
4437
4438           vec_size_in_bits = tree_to_uhwi (TYPE_SIZE (vectype));
4439           FOR_EACH_VEC_ELT (new_phis, i, new_phi)
4440             {
4441               int bit_offset;
4442               if (gimple_code (new_phi) == GIMPLE_PHI)
4443                 vec_temp = PHI_RESULT (new_phi);
4444               else
4445                 vec_temp = gimple_assign_lhs (new_phi);
4446               tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp, bitsize,
4447                             bitsize_zero_node);
4448               epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4449               new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4450               gimple_assign_set_lhs (epilog_stmt, new_temp);
4451               gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4452
4453               /* In SLP we don't need to apply reduction operation, so we just
4454                  collect s' values in SCALAR_RESULTS.  */
4455               if (slp_reduc)
4456                 scalar_results.safe_push (new_temp);
4457
4458               for (bit_offset = element_bitsize;
4459                    bit_offset < vec_size_in_bits;
4460                    bit_offset += element_bitsize)
4461                 {
4462                   tree bitpos = bitsize_int (bit_offset);
4463                   tree rhs = build3 (BIT_FIELD_REF, scalar_type, vec_temp,
4464                                      bitsize, bitpos);
4465
4466                   epilog_stmt = gimple_build_assign (new_scalar_dest, rhs);
4467                   new_name = make_ssa_name (new_scalar_dest, epilog_stmt);
4468                   gimple_assign_set_lhs (epilog_stmt, new_name);
4469                   gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4470
4471                   if (slp_reduc)
4472                     {
4473                       /* In SLP we don't need to apply reduction operation, so
4474                          we just collect s' values in SCALAR_RESULTS.  */
4475                       new_temp = new_name;
4476                       scalar_results.safe_push (new_name);
4477                     }
4478                   else
4479                     {
4480                       epilog_stmt = gimple_build_assign (new_scalar_dest, code,
4481                                                          new_name, new_temp);
4482                       new_temp = make_ssa_name (new_scalar_dest, epilog_stmt);
4483                       gimple_assign_set_lhs (epilog_stmt, new_temp);
4484                       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4485                     }
4486                 }
4487             }
4488
4489           /* The only case where we need to reduce scalar results in SLP, is
4490              unrolling.  If the size of SCALAR_RESULTS is greater than
4491              GROUP_SIZE, we reduce them combining elements modulo
4492              GROUP_SIZE.  */
4493           if (slp_reduc)
4494             {
4495               tree res, first_res, new_res;
4496               gimple new_stmt;
4497
4498               /* Reduce multiple scalar results in case of SLP unrolling.  */
4499               for (j = group_size; scalar_results.iterate (j, &res);
4500                    j++)
4501                 {
4502                   first_res = scalar_results[j % group_size];
4503                   new_stmt = gimple_build_assign (new_scalar_dest, code,
4504                                                   first_res, res);
4505                   new_res = make_ssa_name (new_scalar_dest, new_stmt);
4506                   gimple_assign_set_lhs (new_stmt, new_res);
4507                   gsi_insert_before (&exit_gsi, new_stmt, GSI_SAME_STMT);
4508                   scalar_results[j % group_size] = new_res;
4509                 }
4510             }
4511           else
4512             /* Not SLP - we have one scalar to keep in SCALAR_RESULTS.  */
4513             scalar_results.safe_push (new_temp);
4514         }
4515     }
4516
4517 vect_finalize_reduction:
4518
4519   if (double_reduc)
4520     loop = loop->inner;
4521
4522   /* 2.5 Adjust the final result by the initial value of the reduction
4523          variable. (When such adjustment is not needed, then
4524          'adjustment_def' is zero).  For example, if code is PLUS we create:
4525          new_temp = loop_exit_def + adjustment_def  */
4526
4527   if (adjustment_def)
4528     {
4529       gcc_assert (!slp_reduc);
4530       if (nested_in_vect_loop)
4531         {
4532           new_phi = new_phis[0];
4533           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) == VECTOR_TYPE);
4534           expr = build2 (code, vectype, PHI_RESULT (new_phi), adjustment_def);
4535           new_dest = vect_create_destination_var (scalar_dest, vectype);
4536         }
4537       else
4538         {
4539           new_temp = scalar_results[0];
4540           gcc_assert (TREE_CODE (TREE_TYPE (adjustment_def)) != VECTOR_TYPE);
4541           expr = build2 (code, scalar_type, new_temp, adjustment_def);
4542           new_dest = vect_create_destination_var (scalar_dest, scalar_type);
4543         }
4544
4545       epilog_stmt = gimple_build_assign (new_dest, expr);
4546       new_temp = make_ssa_name (new_dest, epilog_stmt);
4547       gimple_assign_set_lhs (epilog_stmt, new_temp);
4548       gsi_insert_before (&exit_gsi, epilog_stmt, GSI_SAME_STMT);
4549       if (nested_in_vect_loop)
4550         {
4551           set_vinfo_for_stmt (epilog_stmt,
4552                               new_stmt_vec_info (epilog_stmt, loop_vinfo,
4553                                                  NULL));
4554           STMT_VINFO_RELATED_STMT (vinfo_for_stmt (epilog_stmt)) =
4555                 STMT_VINFO_RELATED_STMT (vinfo_for_stmt (new_phi));
4556
4557           if (!double_reduc)
4558             scalar_results.quick_push (new_temp);
4559           else
4560             scalar_results[0] = new_temp;
4561         }
4562       else
4563         scalar_results[0] = new_temp;
4564
4565       new_phis[0] = epilog_stmt;
4566     }
4567
4568   /* 2.6  Handle the loop-exit phis.  Replace the uses of scalar loop-exit
4569           phis with new adjusted scalar results, i.e., replace use <s_out0>
4570           with use <s_out4>.
4571
4572      Transform:
4573         loop_exit:
4574           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4575           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4576           v_out2 = reduce <v_out1>
4577           s_out3 = extract_field <v_out2, 0>
4578           s_out4 = adjust_result <s_out3>
4579           use <s_out0>
4580           use <s_out0>
4581
4582      into:
4583
4584         loop_exit:
4585           s_out0 = phi <s_loop>                 # (scalar) EXIT_PHI
4586           v_out1 = phi <VECT_DEF>               # NEW_EXIT_PHI
4587           v_out2 = reduce <v_out1>
4588           s_out3 = extract_field <v_out2, 0>
4589           s_out4 = adjust_result <s_out3>
4590           use <s_out4>
4591           use <s_out4> */
4592
4593
4594   /* In SLP reduction chain we reduce vector results into one vector if
4595      necessary, hence we set here GROUP_SIZE to 1.  SCALAR_DEST is the LHS of
4596      the last stmt in the reduction chain, since we are looking for the loop
4597      exit phi node.  */
4598   if (GROUP_FIRST_ELEMENT (vinfo_for_stmt (stmt)))
4599     {
4600       gimple dest_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[group_size - 1];
4601       /* Handle reduction patterns.  */
4602       if (STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt)))
4603         dest_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (dest_stmt));
4604
4605       scalar_dest = gimple_assign_lhs (dest_stmt);
4606       group_size = 1;
4607     }
4608
4609   /* In SLP we may have several statements in NEW_PHIS and REDUCTION_PHIS (in
4610      case that GROUP_SIZE is greater than vectorization factor).  Therefore, we
4611      need to match SCALAR_RESULTS with corresponding statements.  The first
4612      (GROUP_SIZE / number of new vector stmts) scalar results correspond to
4613      the first vector stmt, etc.
4614      (RATIO is equal to (GROUP_SIZE / number of new vector stmts)).  */
4615   if (group_size > new_phis.length ())
4616     {
4617       ratio = group_size / new_phis.length ();
4618       gcc_assert (!(group_size % new_phis.length ()));
4619     }
4620   else
4621     ratio = 1;
4622
4623   for (k = 0; k < group_size; k++)
4624     {
4625       if (k % ratio == 0)
4626         {
4627           epilog_stmt = new_phis[k / ratio];
4628           reduction_phi = reduction_phis[k / ratio];
4629           if (double_reduc)
4630             inner_phi = inner_phis[k / ratio];
4631         }
4632
4633       if (slp_reduc)
4634         {
4635           gimple current_stmt = SLP_TREE_SCALAR_STMTS (slp_node)[k];
4636
4637           orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (current_stmt));
4638           /* SLP statements can't participate in patterns.  */
4639           gcc_assert (!orig_stmt);
4640           scalar_dest = gimple_assign_lhs (current_stmt);
4641         }
4642
4643       phis.create (3);
4644       /* Find the loop-closed-use at the loop exit of the original scalar
4645          result.  (The reduction result is expected to have two immediate uses -
4646          one at the latch block, and one at the loop exit).  */
4647       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4648         if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p)))
4649             && !is_gimple_debug (USE_STMT (use_p)))
4650           phis.safe_push (USE_STMT (use_p));
4651
4652       /* While we expect to have found an exit_phi because of loop-closed-ssa
4653          form we can end up without one if the scalar cycle is dead.  */
4654
4655       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4656         {
4657           if (outer_loop)
4658             {
4659               stmt_vec_info exit_phi_vinfo = vinfo_for_stmt (exit_phi);
4660               gphi *vect_phi;
4661
4662               /* FORNOW. Currently not supporting the case that an inner-loop
4663                  reduction is not used in the outer-loop (but only outside the
4664                  outer-loop), unless it is double reduction.  */
4665               gcc_assert ((STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
4666                            && !STMT_VINFO_LIVE_P (exit_phi_vinfo))
4667                           || double_reduc);
4668
4669               if (double_reduc)
4670                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = inner_phi;
4671               else
4672                 STMT_VINFO_VEC_STMT (exit_phi_vinfo) = epilog_stmt;
4673               if (!double_reduc
4674                   || STMT_VINFO_DEF_TYPE (exit_phi_vinfo)
4675                       != vect_double_reduction_def)
4676                 continue;
4677
4678               /* Handle double reduction:
4679
4680                  stmt1: s1 = phi <s0, s2>  - double reduction phi (outer loop)
4681                  stmt2:   s3 = phi <s1, s4> - (regular) reduc phi (inner loop)
4682                  stmt3:   s4 = use (s3)     - (regular) reduc stmt (inner loop)
4683                  stmt4: s2 = phi <s4>      - double reduction stmt (outer loop)
4684
4685                  At that point the regular reduction (stmt2 and stmt3) is
4686                  already vectorized, as well as the exit phi node, stmt4.
4687                  Here we vectorize the phi node of double reduction, stmt1, and
4688                  update all relevant statements.  */
4689
4690               /* Go through all the uses of s2 to find double reduction phi
4691                  node, i.e., stmt1 above.  */
4692               orig_name = PHI_RESULT (exit_phi);
4693               FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4694                 {
4695                   stmt_vec_info use_stmt_vinfo;
4696                   stmt_vec_info new_phi_vinfo;
4697                   tree vect_phi_init, preheader_arg, vect_phi_res, init_def;
4698                   basic_block bb = gimple_bb (use_stmt);
4699                   gimple use;
4700
4701                   /* Check that USE_STMT is really double reduction phi
4702                      node.  */
4703                   if (gimple_code (use_stmt) != GIMPLE_PHI
4704                       || gimple_phi_num_args (use_stmt) != 2
4705                       || bb->loop_father != outer_loop)
4706                     continue;
4707                   use_stmt_vinfo = vinfo_for_stmt (use_stmt);
4708                   if (!use_stmt_vinfo
4709                       || STMT_VINFO_DEF_TYPE (use_stmt_vinfo)
4710                           != vect_double_reduction_def)
4711                     continue;
4712
4713                   /* Create vector phi node for double reduction:
4714                      vs1 = phi <vs0, vs2>
4715                      vs1 was created previously in this function by a call to
4716                        vect_get_vec_def_for_operand and is stored in
4717                        vec_initial_def;
4718                      vs2 is defined by INNER_PHI, the vectorized EXIT_PHI;
4719                      vs0 is created here.  */
4720
4721                   /* Create vector phi node.  */
4722                   vect_phi = create_phi_node (vec_initial_def, bb);
4723                   new_phi_vinfo = new_stmt_vec_info (vect_phi,
4724                                     loop_vec_info_for_loop (outer_loop), NULL);
4725                   set_vinfo_for_stmt (vect_phi, new_phi_vinfo);
4726
4727                   /* Create vs0 - initial def of the double reduction phi.  */
4728                   preheader_arg = PHI_ARG_DEF_FROM_EDGE (use_stmt,
4729                                              loop_preheader_edge (outer_loop));
4730                   init_def = get_initial_def_for_reduction (stmt,
4731                                                           preheader_arg, NULL);
4732                   vect_phi_init = vect_init_vector (use_stmt, init_def,
4733                                                     vectype, NULL);
4734
4735                   /* Update phi node arguments with vs0 and vs2.  */
4736                   add_phi_arg (vect_phi, vect_phi_init,
4737                                loop_preheader_edge (outer_loop),
4738                                UNKNOWN_LOCATION);
4739                   add_phi_arg (vect_phi, PHI_RESULT (inner_phi),
4740                                loop_latch_edge (outer_loop), UNKNOWN_LOCATION);
4741                   if (dump_enabled_p ())
4742                     {
4743                       dump_printf_loc (MSG_NOTE, vect_location,
4744                                        "created double reduction phi node: ");
4745                       dump_gimple_stmt (MSG_NOTE, TDF_SLIM, vect_phi, 0);
4746                       dump_printf (MSG_NOTE, "\n");
4747                     }
4748
4749                   vect_phi_res = PHI_RESULT (vect_phi);
4750
4751                   /* Replace the use, i.e., set the correct vs1 in the regular
4752                      reduction phi node.  FORNOW, NCOPIES is always 1, so the
4753                      loop is redundant.  */
4754                   use = reduction_phi;
4755                   for (j = 0; j < ncopies; j++)
4756                     {
4757                       edge pr_edge = loop_preheader_edge (loop);
4758                       SET_PHI_ARG_DEF (use, pr_edge->dest_idx, vect_phi_res);
4759                       use = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (use));
4760                     }
4761                 }
4762             }
4763         }
4764
4765       phis.release ();
4766       if (nested_in_vect_loop)
4767         {
4768           if (double_reduc)
4769             loop = outer_loop;
4770           else
4771             continue;
4772         }
4773
4774       phis.create (3);
4775       /* Find the loop-closed-use at the loop exit of the original scalar
4776          result.  (The reduction result is expected to have two immediate uses,
4777          one at the latch block, and one at the loop exit).  For double
4778          reductions we are looking for exit phis of the outer loop.  */
4779       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, scalar_dest)
4780         {
4781           if (!flow_bb_inside_loop_p (loop, gimple_bb (USE_STMT (use_p))))
4782             {
4783               if (!is_gimple_debug (USE_STMT (use_p)))
4784                 phis.safe_push (USE_STMT (use_p));
4785             }
4786           else
4787             {
4788               if (double_reduc && gimple_code (USE_STMT (use_p)) == GIMPLE_PHI)
4789                 {
4790                   tree phi_res = PHI_RESULT (USE_STMT (use_p));
4791
4792                   FOR_EACH_IMM_USE_FAST (phi_use_p, phi_imm_iter, phi_res)
4793                     {
4794                       if (!flow_bb_inside_loop_p (loop,
4795                                              gimple_bb (USE_STMT (phi_use_p)))
4796                           && !is_gimple_debug (USE_STMT (phi_use_p)))
4797                         phis.safe_push (USE_STMT (phi_use_p));
4798                     }
4799                 }
4800             }
4801         }
4802
4803       FOR_EACH_VEC_ELT (phis, i, exit_phi)
4804         {
4805           /* Replace the uses:  */
4806           orig_name = PHI_RESULT (exit_phi);
4807           scalar_result = scalar_results[k];
4808           FOR_EACH_IMM_USE_STMT (use_stmt, imm_iter, orig_name)
4809             FOR_EACH_IMM_USE_ON_STMT (use_p, imm_iter)
4810               SET_USE (use_p, scalar_result);
4811         }
4812
4813       phis.release ();
4814     }
4815 }
4816
4817
4818 /* Function vectorizable_reduction.
4819
4820    Check if STMT performs a reduction operation that can be vectorized.
4821    If VEC_STMT is also passed, vectorize the STMT: create a vectorized
4822    stmt to replace it, put it in VEC_STMT, and insert it at GSI.
4823    Return FALSE if not a vectorizable STMT, TRUE otherwise.
4824
4825    This function also handles reduction idioms (patterns) that have been
4826    recognized in advance during vect_pattern_recog.  In this case, STMT may be
4827    of this form:
4828      X = pattern_expr (arg0, arg1, ..., X)
4829    and it's STMT_VINFO_RELATED_STMT points to the last stmt in the original
4830    sequence that had been detected and replaced by the pattern-stmt (STMT).
4831
4832    In some cases of reduction patterns, the type of the reduction variable X is
4833    different than the type of the other arguments of STMT.
4834    In such cases, the vectype that is used when transforming STMT into a vector
4835    stmt is different than the vectype that is used to determine the
4836    vectorization factor, because it consists of a different number of elements
4837    than the actual number of elements that are being operated upon in parallel.
4838
4839    For example, consider an accumulation of shorts into an int accumulator.
4840    On some targets it's possible to vectorize this pattern operating on 8
4841    shorts at a time (hence, the vectype for purposes of determining the
4842    vectorization factor should be V8HI); on the other hand, the vectype that
4843    is used to create the vector form is actually V4SI (the type of the result).
4844
4845    Upon entry to this function, STMT_VINFO_VECTYPE records the vectype that
4846    indicates what is the actual level of parallelism (V8HI in the example), so
4847    that the right vectorization factor would be derived.  This vectype
4848    corresponds to the type of arguments to the reduction stmt, and should *NOT*
4849    be used to create the vectorized stmt.  The right vectype for the vectorized
4850    stmt is obtained from the type of the result X:
4851         get_vectype_for_scalar_type (TREE_TYPE (X))
4852
4853    This means that, contrary to "regular" reductions (or "regular" stmts in
4854    general), the following equation:
4855       STMT_VINFO_VECTYPE == get_vectype_for_scalar_type (TREE_TYPE (X))
4856    does *NOT* necessarily hold for reduction patterns.  */
4857
4858 bool
4859 vectorizable_reduction (gimple stmt, gimple_stmt_iterator *gsi,
4860                         gimple *vec_stmt, slp_tree slp_node)
4861 {
4862   tree vec_dest;
4863   tree scalar_dest;
4864   tree loop_vec_def0 = NULL_TREE, loop_vec_def1 = NULL_TREE;
4865   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
4866   tree vectype_out = STMT_VINFO_VECTYPE (stmt_info);
4867   tree vectype_in = NULL_TREE;
4868   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
4869   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
4870   enum tree_code code, orig_code, epilog_reduc_code;
4871   machine_mode vec_mode;
4872   int op_type;
4873   optab optab, reduc_optab;
4874   tree new_temp = NULL_TREE;
4875   tree def;
4876   gimple def_stmt;
4877   enum vect_def_type dt;
4878   gphi *new_phi = NULL;
4879   tree scalar_type;
4880   bool is_simple_use;
4881   gimple orig_stmt;
4882   stmt_vec_info orig_stmt_info;
4883   tree expr = NULL_TREE;
4884   int i;
4885   int ncopies;
4886   int epilog_copies;
4887   stmt_vec_info prev_stmt_info, prev_phi_info;
4888   bool single_defuse_cycle = false;
4889   tree reduc_def = NULL_TREE;
4890   gimple new_stmt = NULL;
4891   int j;
4892   tree ops[3];
4893   bool nested_cycle = false, found_nested_cycle_def = false;
4894   gimple reduc_def_stmt = NULL;
4895   bool double_reduc = false, dummy;
4896   basic_block def_bb;
4897   struct loop * def_stmt_loop, *outer_loop = NULL;
4898   tree def_arg;
4899   gimple def_arg_stmt;
4900   auto_vec<tree> vec_oprnds0;
4901   auto_vec<tree> vec_oprnds1;
4902   auto_vec<tree> vect_defs;
4903   auto_vec<gimple> phis;
4904   int vec_num;
4905   tree def0, def1, tem, op0, op1 = NULL_TREE;
4906   bool first_p = true;
4907
4908   /* In case of reduction chain we switch to the first stmt in the chain, but
4909      we don't update STMT_INFO, since only the last stmt is marked as reduction
4910      and has reduction properties.  */
4911   if (GROUP_FIRST_ELEMENT (stmt_info)
4912       && GROUP_FIRST_ELEMENT (stmt_info) != stmt)
4913     {
4914       stmt = GROUP_FIRST_ELEMENT (stmt_info);
4915       first_p = false;
4916     }
4917
4918   if (nested_in_vect_loop_p (loop, stmt))
4919     {
4920       outer_loop = loop;
4921       loop = loop->inner;
4922       nested_cycle = true;
4923     }
4924
4925   /* 1. Is vectorizable reduction?  */
4926   /* Not supportable if the reduction variable is used in the loop, unless
4927      it's a reduction chain.  */
4928   if (STMT_VINFO_RELEVANT (stmt_info) > vect_used_in_outer
4929       && !GROUP_FIRST_ELEMENT (stmt_info))
4930     return false;
4931
4932   /* Reductions that are not used even in an enclosing outer-loop,
4933      are expected to be "live" (used out of the loop).  */
4934   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope
4935       && !STMT_VINFO_LIVE_P (stmt_info))
4936     return false;
4937
4938   /* Make sure it was already recognized as a reduction computation.  */
4939   if (STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_reduction_def
4940       && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (stmt)) != vect_nested_cycle)
4941     return false;
4942
4943   /* 2. Has this been recognized as a reduction pattern?
4944
4945      Check if STMT represents a pattern that has been recognized
4946      in earlier analysis stages.  For stmts that represent a pattern,
4947      the STMT_VINFO_RELATED_STMT field records the last stmt in
4948      the original sequence that constitutes the pattern.  */
4949
4950   orig_stmt = STMT_VINFO_RELATED_STMT (vinfo_for_stmt (stmt));
4951   if (orig_stmt)
4952     {
4953       orig_stmt_info = vinfo_for_stmt (orig_stmt);
4954       gcc_assert (STMT_VINFO_IN_PATTERN_P (orig_stmt_info));
4955       gcc_assert (!STMT_VINFO_IN_PATTERN_P (stmt_info));
4956     }
4957
4958   /* 3. Check the operands of the operation.  The first operands are defined
4959         inside the loop body. The last operand is the reduction variable,
4960         which is defined by the loop-header-phi.  */
4961
4962   gcc_assert (is_gimple_assign (stmt));
4963
4964   /* Flatten RHS.  */
4965   switch (get_gimple_rhs_class (gimple_assign_rhs_code (stmt)))
4966     {
4967     case GIMPLE_SINGLE_RHS:
4968       op_type = TREE_OPERAND_LENGTH (gimple_assign_rhs1 (stmt));
4969       if (op_type == ternary_op)
4970         {
4971           tree rhs = gimple_assign_rhs1 (stmt);
4972           ops[0] = TREE_OPERAND (rhs, 0);
4973           ops[1] = TREE_OPERAND (rhs, 1);
4974           ops[2] = TREE_OPERAND (rhs, 2);
4975           code = TREE_CODE (rhs);
4976         }
4977       else
4978         return false;
4979       break;
4980
4981     case GIMPLE_BINARY_RHS:
4982       code = gimple_assign_rhs_code (stmt);
4983       op_type = TREE_CODE_LENGTH (code);
4984       gcc_assert (op_type == binary_op);
4985       ops[0] = gimple_assign_rhs1 (stmt);
4986       ops[1] = gimple_assign_rhs2 (stmt);
4987       break;
4988
4989     case GIMPLE_TERNARY_RHS:
4990       code = gimple_assign_rhs_code (stmt);
4991       op_type = TREE_CODE_LENGTH (code);
4992       gcc_assert (op_type == ternary_op);
4993       ops[0] = gimple_assign_rhs1 (stmt);
4994       ops[1] = gimple_assign_rhs2 (stmt);
4995       ops[2] = gimple_assign_rhs3 (stmt);
4996       break;
4997
4998     case GIMPLE_UNARY_RHS:
4999       return false;
5000
5001     default:
5002       gcc_unreachable ();
5003     }
5004   /* The default is that the reduction variable is the last in statement.  */
5005   int reduc_index = op_type - 1;
5006
5007   if (code == COND_EXPR && slp_node)
5008     return false;
5009
5010   scalar_dest = gimple_assign_lhs (stmt);
5011   scalar_type = TREE_TYPE (scalar_dest);
5012   if (!POINTER_TYPE_P (scalar_type) && !INTEGRAL_TYPE_P (scalar_type)
5013       && !SCALAR_FLOAT_TYPE_P (scalar_type))
5014     return false;
5015
5016   /* Do not try to vectorize bit-precision reductions.  */
5017   if ((TYPE_PRECISION (scalar_type)
5018        != GET_MODE_PRECISION (TYPE_MODE (scalar_type))))
5019     return false;
5020
5021   /* All uses but the last are expected to be defined in the loop.
5022      The last use is the reduction variable.  In case of nested cycle this
5023      assumption is not true: we use reduc_index to record the index of the
5024      reduction variable.  */
5025   for (i = 0; i < op_type - 1; i++)
5026     {
5027       /* The condition of COND_EXPR is checked in vectorizable_condition().  */
5028       if (i == 0 && code == COND_EXPR)
5029         continue;
5030
5031       is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5032                                             &def_stmt, &def, &dt, &tem);
5033       if (!vectype_in)
5034         vectype_in = tem;
5035       gcc_assert (is_simple_use);
5036
5037       if (dt != vect_internal_def
5038           && dt != vect_external_def
5039           && dt != vect_constant_def
5040           && dt != vect_induction_def
5041           && !(dt == vect_nested_cycle && nested_cycle))
5042         return false;
5043
5044       if (dt == vect_nested_cycle)
5045         {
5046           found_nested_cycle_def = true;
5047           reduc_def_stmt = def_stmt;
5048           reduc_index = i;
5049         }
5050     }
5051
5052   is_simple_use = vect_is_simple_use_1 (ops[i], stmt, loop_vinfo, NULL,
5053                                         &def_stmt, &def, &dt, &tem);
5054   if (!vectype_in)
5055     vectype_in = tem;
5056   gcc_assert (is_simple_use);
5057   if (!found_nested_cycle_def)
5058     reduc_def_stmt = def_stmt;
5059
5060   if (reduc_def_stmt && gimple_code (reduc_def_stmt) != GIMPLE_PHI)
5061     return false;
5062
5063   if (!(dt == vect_reduction_def
5064         || dt == vect_nested_cycle
5065         || ((dt == vect_internal_def || dt == vect_external_def
5066              || dt == vect_constant_def || dt == vect_induction_def)
5067             && nested_cycle && found_nested_cycle_def)))
5068     {
5069       /* For pattern recognized stmts, orig_stmt might be a reduction,
5070          but some helper statements for the pattern might not, or
5071          might be COND_EXPRs with reduction uses in the condition.  */
5072       gcc_assert (orig_stmt);
5073       return false;
5074     }
5075
5076   gimple tmp = vect_is_simple_reduction (loop_vinfo, reduc_def_stmt,
5077                                          !nested_cycle, &dummy);
5078   if (orig_stmt)
5079     gcc_assert (tmp == orig_stmt
5080                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == orig_stmt);
5081   else
5082     /* We changed STMT to be the first stmt in reduction chain, hence we
5083        check that in this case the first element in the chain is STMT.  */
5084     gcc_assert (stmt == tmp
5085                 || GROUP_FIRST_ELEMENT (vinfo_for_stmt (tmp)) == stmt);
5086
5087   if (STMT_VINFO_LIVE_P (vinfo_for_stmt (reduc_def_stmt)))
5088     return false;
5089
5090   if (slp_node || PURE_SLP_STMT (stmt_info))
5091     ncopies = 1;
5092   else
5093     ncopies = (LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5094                / TYPE_VECTOR_SUBPARTS (vectype_in));
5095
5096   gcc_assert (ncopies >= 1);
5097
5098   vec_mode = TYPE_MODE (vectype_in);
5099
5100   if (code == COND_EXPR)
5101     {
5102       if (!vectorizable_condition (stmt, gsi, NULL, ops[reduc_index], 0, NULL))
5103         {
5104           if (dump_enabled_p ())
5105             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5106                              "unsupported condition in reduction\n");
5107
5108           return false;
5109         }
5110     }
5111   else
5112     {
5113       /* 4. Supportable by target?  */
5114
5115       if (code == LSHIFT_EXPR || code == RSHIFT_EXPR
5116           || code == LROTATE_EXPR || code == RROTATE_EXPR)
5117         {
5118           /* Shifts and rotates are only supported by vectorizable_shifts,
5119              not vectorizable_reduction.  */
5120           if (dump_enabled_p ())
5121             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5122                              "unsupported shift or rotation.\n");
5123           return false;
5124         }
5125
5126       /* 4.1. check support for the operation in the loop  */
5127       optab = optab_for_tree_code (code, vectype_in, optab_default);
5128       if (!optab)
5129         {
5130           if (dump_enabled_p ())
5131             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5132                              "no optab.\n");
5133
5134           return false;
5135         }
5136
5137       if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5138         {
5139           if (dump_enabled_p ())
5140             dump_printf (MSG_NOTE, "op not supported by target.\n");
5141
5142           if (GET_MODE_SIZE (vec_mode) != UNITS_PER_WORD
5143               || LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5144                   < vect_min_worthwhile_factor (code))
5145             return false;
5146
5147           if (dump_enabled_p ())
5148             dump_printf (MSG_NOTE, "proceeding using word mode.\n");
5149         }
5150
5151       /* Worthwhile without SIMD support?  */
5152       if (!VECTOR_MODE_P (TYPE_MODE (vectype_in))
5153           && LOOP_VINFO_VECT_FACTOR (loop_vinfo)
5154              < vect_min_worthwhile_factor (code))
5155         {
5156           if (dump_enabled_p ())
5157             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5158                              "not worthwhile without SIMD support.\n");
5159
5160           return false;
5161         }
5162     }
5163
5164   /* 4.2. Check support for the epilog operation.
5165
5166           If STMT represents a reduction pattern, then the type of the
5167           reduction variable may be different than the type of the rest
5168           of the arguments.  For example, consider the case of accumulation
5169           of shorts into an int accumulator; The original code:
5170                         S1: int_a = (int) short_a;
5171           orig_stmt->   S2: int_acc = plus <int_a ,int_acc>;
5172
5173           was replaced with:
5174                         STMT: int_acc = widen_sum <short_a, int_acc>
5175
5176           This means that:
5177           1. The tree-code that is used to create the vector operation in the
5178              epilog code (that reduces the partial results) is not the
5179              tree-code of STMT, but is rather the tree-code of the original
5180              stmt from the pattern that STMT is replacing.  I.e, in the example
5181              above we want to use 'widen_sum' in the loop, but 'plus' in the
5182              epilog.
5183           2. The type (mode) we use to check available target support
5184              for the vector operation to be created in the *epilog*, is
5185              determined by the type of the reduction variable (in the example
5186              above we'd check this: optab_handler (plus_optab, vect_int_mode])).
5187              However the type (mode) we use to check available target support
5188              for the vector operation to be created *inside the loop*, is
5189              determined by the type of the other arguments to STMT (in the
5190              example we'd check this: optab_handler (widen_sum_optab,
5191              vect_short_mode)).
5192
5193           This is contrary to "regular" reductions, in which the types of all
5194           the arguments are the same as the type of the reduction variable.
5195           For "regular" reductions we can therefore use the same vector type
5196           (and also the same tree-code) when generating the epilog code and
5197           when generating the code inside the loop.  */
5198
5199   if (orig_stmt)
5200     {
5201       /* This is a reduction pattern: get the vectype from the type of the
5202          reduction variable, and get the tree-code from orig_stmt.  */
5203       orig_code = gimple_assign_rhs_code (orig_stmt);
5204       gcc_assert (vectype_out);
5205       vec_mode = TYPE_MODE (vectype_out);
5206     }
5207   else
5208     {
5209       /* Regular reduction: use the same vectype and tree-code as used for
5210          the vector code inside the loop can be used for the epilog code. */
5211       orig_code = code;
5212     }
5213
5214   if (nested_cycle)
5215     {
5216       def_bb = gimple_bb (reduc_def_stmt);
5217       def_stmt_loop = def_bb->loop_father;
5218       def_arg = PHI_ARG_DEF_FROM_EDGE (reduc_def_stmt,
5219                                        loop_preheader_edge (def_stmt_loop));
5220       if (TREE_CODE (def_arg) == SSA_NAME
5221           && (def_arg_stmt = SSA_NAME_DEF_STMT (def_arg))
5222           && gimple_code (def_arg_stmt) == GIMPLE_PHI
5223           && flow_bb_inside_loop_p (outer_loop, gimple_bb (def_arg_stmt))
5224           && vinfo_for_stmt (def_arg_stmt)
5225           && STMT_VINFO_DEF_TYPE (vinfo_for_stmt (def_arg_stmt))
5226               == vect_double_reduction_def)
5227         double_reduc = true;
5228     }
5229
5230   epilog_reduc_code = ERROR_MARK;
5231   if (reduction_code_for_scalar_code (orig_code, &epilog_reduc_code))
5232     {
5233       reduc_optab = optab_for_tree_code (epilog_reduc_code, vectype_out,
5234                                          optab_default);
5235       if (!reduc_optab)
5236         {
5237           if (dump_enabled_p ())
5238             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5239                              "no optab for reduction.\n");
5240
5241           epilog_reduc_code = ERROR_MARK;
5242         }
5243       else if (optab_handler (reduc_optab, vec_mode) == CODE_FOR_nothing)
5244         {
5245           optab = scalar_reduc_to_vector (reduc_optab, vectype_out);
5246           if (optab_handler (optab, vec_mode) == CODE_FOR_nothing)
5247             {
5248               if (dump_enabled_p ())
5249                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5250                                  "reduc op not supported by target.\n");
5251
5252               epilog_reduc_code = ERROR_MARK;
5253             }
5254         }
5255     }
5256   else
5257     {
5258       if (!nested_cycle || double_reduc)
5259         {
5260           if (dump_enabled_p ())
5261             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5262                              "no reduc code for scalar code.\n");
5263
5264           return false;
5265         }
5266     }
5267
5268   if (double_reduc && ncopies > 1)
5269     {
5270       if (dump_enabled_p ())
5271         dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5272                          "multiple types in double reduction\n");
5273
5274       return false;
5275     }
5276
5277   /* In case of widenning multiplication by a constant, we update the type
5278      of the constant to be the type of the other operand.  We check that the
5279      constant fits the type in the pattern recognition pass.  */
5280   if (code == DOT_PROD_EXPR
5281       && !types_compatible_p (TREE_TYPE (ops[0]), TREE_TYPE (ops[1])))
5282     {
5283       if (TREE_CODE (ops[0]) == INTEGER_CST)
5284         ops[0] = fold_convert (TREE_TYPE (ops[1]), ops[0]);
5285       else if (TREE_CODE (ops[1]) == INTEGER_CST)
5286         ops[1] = fold_convert (TREE_TYPE (ops[0]), ops[1]);
5287       else
5288         {
5289           if (dump_enabled_p ())
5290             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5291                              "invalid types in dot-prod\n");
5292
5293           return false;
5294         }
5295     }
5296
5297   if (!vec_stmt) /* transformation not required.  */
5298     {
5299       if (first_p
5300           && !vect_model_reduction_cost (stmt_info, epilog_reduc_code, ncopies,
5301                                          reduc_index))
5302         return false;
5303       STMT_VINFO_TYPE (stmt_info) = reduc_vec_info_type;
5304       return true;
5305     }
5306
5307   /** Transform.  **/
5308
5309   if (dump_enabled_p ())
5310     dump_printf_loc (MSG_NOTE, vect_location, "transform reduction.\n");
5311
5312   /* FORNOW: Multiple types are not supported for condition.  */
5313   if (code == COND_EXPR)
5314     gcc_assert (ncopies == 1);
5315
5316   /* Create the destination vector  */
5317   vec_dest = vect_create_destination_var (scalar_dest, vectype_out);
5318
5319   /* In case the vectorization factor (VF) is bigger than the number
5320      of elements that we can fit in a vectype (nunits), we have to generate
5321      more than one vector stmt - i.e - we need to "unroll" the
5322      vector stmt by a factor VF/nunits.  For more details see documentation
5323      in vectorizable_operation.  */
5324
5325   /* If the reduction is used in an outer loop we need to generate
5326      VF intermediate results, like so (e.g. for ncopies=2):
5327         r0 = phi (init, r0)
5328         r1 = phi (init, r1)
5329         r0 = x0 + r0;
5330         r1 = x1 + r1;
5331     (i.e. we generate VF results in 2 registers).
5332     In this case we have a separate def-use cycle for each copy, and therefore
5333     for each copy we get the vector def for the reduction variable from the
5334     respective phi node created for this copy.
5335
5336     Otherwise (the reduction is unused in the loop nest), we can combine
5337     together intermediate results, like so (e.g. for ncopies=2):
5338         r = phi (init, r)
5339         r = x0 + r;
5340         r = x1 + r;
5341    (i.e. we generate VF/2 results in a single register).
5342    In this case for each copy we get the vector def for the reduction variable
5343    from the vectorized reduction operation generated in the previous iteration.
5344   */
5345
5346   if (STMT_VINFO_RELEVANT (stmt_info) == vect_unused_in_scope)
5347     {
5348       single_defuse_cycle = true;
5349       epilog_copies = 1;
5350     }
5351   else
5352     epilog_copies = ncopies;
5353
5354   prev_stmt_info = NULL;
5355   prev_phi_info = NULL;
5356   if (slp_node)
5357     vec_num = SLP_TREE_NUMBER_OF_VEC_STMTS (slp_node);
5358   else
5359     {
5360       vec_num = 1;
5361       vec_oprnds0.create (1);
5362       if (op_type == ternary_op)
5363         vec_oprnds1.create (1);
5364     }
5365
5366   phis.create (vec_num);
5367   vect_defs.create (vec_num);
5368   if (!slp_node)
5369     vect_defs.quick_push (NULL_TREE);
5370
5371   for (j = 0; j < ncopies; j++)
5372     {
5373       if (j == 0 || !single_defuse_cycle)
5374         {
5375           for (i = 0; i < vec_num; i++)
5376             {
5377               /* Create the reduction-phi that defines the reduction
5378                  operand.  */
5379               new_phi = create_phi_node (vec_dest, loop->header);
5380               set_vinfo_for_stmt (new_phi,
5381                                   new_stmt_vec_info (new_phi, loop_vinfo,
5382                                                      NULL));
5383                if (j == 0 || slp_node)
5384                  phis.quick_push (new_phi);
5385             }
5386         }
5387
5388       if (code == COND_EXPR)
5389         {
5390           gcc_assert (!slp_node);
5391           vectorizable_condition (stmt, gsi, vec_stmt,
5392                                   PHI_RESULT (phis[0]),
5393                                   reduc_index, NULL);
5394           /* Multiple types are not supported for condition.  */
5395           break;
5396         }
5397
5398       /* Handle uses.  */
5399       if (j == 0)
5400         {
5401           op0 = ops[!reduc_index];
5402           if (op_type == ternary_op)
5403             {
5404               if (reduc_index == 0)
5405                 op1 = ops[2];
5406               else
5407                 op1 = ops[1];
5408             }
5409
5410           if (slp_node)
5411             vect_get_vec_defs (op0, op1, stmt, &vec_oprnds0, &vec_oprnds1,
5412                                slp_node, -1);
5413           else
5414             {
5415               loop_vec_def0 = vect_get_vec_def_for_operand (ops[!reduc_index],
5416                                                             stmt, NULL);
5417               vec_oprnds0.quick_push (loop_vec_def0);
5418               if (op_type == ternary_op)
5419                {
5420                  loop_vec_def1 = vect_get_vec_def_for_operand (op1, stmt,
5421                                                                NULL);
5422                  vec_oprnds1.quick_push (loop_vec_def1);
5423                }
5424             }
5425         }
5426       else
5427         {
5428           if (!slp_node)
5429             {
5430               enum vect_def_type dt;
5431               gimple dummy_stmt;
5432               tree dummy;
5433
5434               vect_is_simple_use (ops[!reduc_index], stmt, loop_vinfo, NULL,
5435                                   &dummy_stmt, &dummy, &dt);
5436               loop_vec_def0 = vect_get_vec_def_for_stmt_copy (dt,
5437                                                               loop_vec_def0);
5438               vec_oprnds0[0] = loop_vec_def0;
5439               if (op_type == ternary_op)
5440                 {
5441                   vect_is_simple_use (op1, stmt, loop_vinfo, NULL, &dummy_stmt,
5442                                       &dummy, &dt);
5443                   loop_vec_def1 = vect_get_vec_def_for_stmt_copy (dt,
5444                                                                 loop_vec_def1);
5445                   vec_oprnds1[0] = loop_vec_def1;
5446                 }
5447             }
5448
5449           if (single_defuse_cycle)
5450             reduc_def = gimple_assign_lhs (new_stmt);
5451
5452           STMT_VINFO_RELATED_STMT (prev_phi_info) = new_phi;
5453         }
5454
5455       FOR_EACH_VEC_ELT (vec_oprnds0, i, def0)
5456         {
5457           if (slp_node)
5458             reduc_def = PHI_RESULT (phis[i]);
5459           else
5460             {
5461               if (!single_defuse_cycle || j == 0)
5462                 reduc_def = PHI_RESULT (new_phi);
5463             }
5464
5465           def1 = ((op_type == ternary_op)
5466                   ? vec_oprnds1[i] : NULL);
5467           if (op_type == binary_op)
5468             {
5469               if (reduc_index == 0)
5470                 expr = build2 (code, vectype_out, reduc_def, def0);
5471               else
5472                 expr = build2 (code, vectype_out, def0, reduc_def);
5473             }
5474           else
5475             {
5476               if (reduc_index == 0)
5477                 expr = build3 (code, vectype_out, reduc_def, def0, def1);
5478               else
5479                 {
5480                   if (reduc_index == 1)
5481                     expr = build3 (code, vectype_out, def0, reduc_def, def1);
5482                   else
5483                     expr = build3 (code, vectype_out, def0, def1, reduc_def);
5484                 }
5485             }
5486
5487           new_stmt = gimple_build_assign (vec_dest, expr);
5488           new_temp = make_ssa_name (vec_dest, new_stmt);
5489           gimple_assign_set_lhs (new_stmt, new_temp);
5490           vect_finish_stmt_generation (stmt, new_stmt, gsi);
5491
5492           if (slp_node)
5493             {
5494               SLP_TREE_VEC_STMTS (slp_node).quick_push (new_stmt);
5495               vect_defs.quick_push (new_temp);
5496             }
5497           else
5498             vect_defs[0] = new_temp;
5499         }
5500
5501       if (slp_node)
5502         continue;
5503
5504       if (j == 0)
5505         STMT_VINFO_VEC_STMT (stmt_info) = *vec_stmt = new_stmt;
5506       else
5507         STMT_VINFO_RELATED_STMT (prev_stmt_info) = new_stmt;
5508
5509       prev_stmt_info = vinfo_for_stmt (new_stmt);
5510       prev_phi_info = vinfo_for_stmt (new_phi);
5511     }
5512
5513   /* Finalize the reduction-phi (set its arguments) and create the
5514      epilog reduction code.  */
5515   if ((!single_defuse_cycle || code == COND_EXPR) && !slp_node)
5516     {
5517       new_temp = gimple_assign_lhs (*vec_stmt);
5518       vect_defs[0] = new_temp;
5519     }
5520
5521   vect_create_epilog_for_reduction (vect_defs, stmt, epilog_copies,
5522                                     epilog_reduc_code, phis, reduc_index,
5523                                     double_reduc, slp_node);
5524
5525   return true;
5526 }
5527
5528 /* Function vect_min_worthwhile_factor.
5529
5530    For a loop where we could vectorize the operation indicated by CODE,
5531    return the minimum vectorization factor that makes it worthwhile
5532    to use generic vectors.  */
5533 int
5534 vect_min_worthwhile_factor (enum tree_code code)
5535 {
5536   switch (code)
5537     {
5538     case PLUS_EXPR:
5539     case MINUS_EXPR:
5540     case NEGATE_EXPR:
5541       return 4;
5542
5543     case BIT_AND_EXPR:
5544     case BIT_IOR_EXPR:
5545     case BIT_XOR_EXPR:
5546     case BIT_NOT_EXPR:
5547       return 2;
5548
5549     default:
5550       return INT_MAX;
5551     }
5552 }
5553
5554
5555 /* Function vectorizable_induction
5556
5557    Check if PHI performs an induction computation that can be vectorized.
5558    If VEC_STMT is also passed, vectorize the induction PHI: create a vectorized
5559    phi to replace it, put it in VEC_STMT, and add it to the same basic block.
5560    Return FALSE if not a vectorizable STMT, TRUE otherwise.  */
5561
5562 bool
5563 vectorizable_induction (gimple phi, gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5564                         gimple *vec_stmt)
5565 {
5566   stmt_vec_info stmt_info = vinfo_for_stmt (phi);
5567   tree vectype = STMT_VINFO_VECTYPE (stmt_info);
5568   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5569   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5570   int nunits = TYPE_VECTOR_SUBPARTS (vectype);
5571   int ncopies = LOOP_VINFO_VECT_FACTOR (loop_vinfo) / nunits;
5572   tree vec_def;
5573
5574   gcc_assert (ncopies >= 1);
5575   /* FORNOW. These restrictions should be relaxed.  */
5576   if (nested_in_vect_loop_p (loop, phi))
5577     {
5578       imm_use_iterator imm_iter;
5579       use_operand_p use_p;
5580       gimple exit_phi;
5581       edge latch_e;
5582       tree loop_arg;
5583
5584       if (ncopies > 1)
5585         {
5586           if (dump_enabled_p ())
5587             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5588                              "multiple types in nested loop.\n");
5589           return false;
5590         }
5591
5592       exit_phi = NULL;
5593       latch_e = loop_latch_edge (loop->inner);
5594       loop_arg = PHI_ARG_DEF_FROM_EDGE (phi, latch_e);
5595       FOR_EACH_IMM_USE_FAST (use_p, imm_iter, loop_arg)
5596         {
5597           gimple use_stmt = USE_STMT (use_p);
5598           if (is_gimple_debug (use_stmt))
5599             continue;
5600
5601           if (!flow_bb_inside_loop_p (loop->inner, gimple_bb (use_stmt)))
5602             {
5603               exit_phi = use_stmt;
5604               break;
5605             }
5606         }
5607       if (exit_phi)
5608         {
5609           stmt_vec_info exit_phi_vinfo  = vinfo_for_stmt (exit_phi);
5610           if (!(STMT_VINFO_RELEVANT_P (exit_phi_vinfo)
5611                 && !STMT_VINFO_LIVE_P (exit_phi_vinfo)))
5612             {
5613               if (dump_enabled_p ())
5614                 dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5615                                  "inner-loop induction only used outside "
5616                                  "of the outer vectorized loop.\n");
5617               return false;
5618             }
5619         }
5620     }
5621
5622   if (!STMT_VINFO_RELEVANT_P (stmt_info))
5623     return false;
5624
5625   /* FORNOW: SLP not supported.  */
5626   if (STMT_SLP_TYPE (stmt_info))
5627     return false;
5628
5629   gcc_assert (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def);
5630
5631   if (gimple_code (phi) != GIMPLE_PHI)
5632     return false;
5633
5634   if (!vec_stmt) /* transformation not required.  */
5635     {
5636       STMT_VINFO_TYPE (stmt_info) = induc_vec_info_type;
5637       if (dump_enabled_p ())
5638         dump_printf_loc (MSG_NOTE, vect_location,
5639                          "=== vectorizable_induction ===\n");
5640       vect_model_induction_cost (stmt_info, ncopies);
5641       return true;
5642     }
5643
5644   /** Transform.  **/
5645
5646   if (dump_enabled_p ())
5647     dump_printf_loc (MSG_NOTE, vect_location, "transform induction phi.\n");
5648
5649   vec_def = get_initial_def_for_induction (phi);
5650   *vec_stmt = SSA_NAME_DEF_STMT (vec_def);
5651   return true;
5652 }
5653
5654 /* Function vectorizable_live_operation.
5655
5656    STMT computes a value that is used outside the loop.  Check if
5657    it can be supported.  */
5658
5659 bool
5660 vectorizable_live_operation (gimple stmt,
5661                              gimple_stmt_iterator *gsi ATTRIBUTE_UNUSED,
5662                              gimple *vec_stmt)
5663 {
5664   stmt_vec_info stmt_info = vinfo_for_stmt (stmt);
5665   loop_vec_info loop_vinfo = STMT_VINFO_LOOP_VINFO (stmt_info);
5666   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5667   int i;
5668   int op_type;
5669   tree op;
5670   tree def;
5671   gimple def_stmt;
5672   enum vect_def_type dt;
5673   enum tree_code code;
5674   enum gimple_rhs_class rhs_class;
5675
5676   gcc_assert (STMT_VINFO_LIVE_P (stmt_info));
5677
5678   if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_reduction_def)
5679     return false;
5680
5681   if (!is_gimple_assign (stmt))
5682     {
5683       if (gimple_call_internal_p (stmt)
5684           && gimple_call_internal_fn (stmt) == IFN_GOMP_SIMD_LANE
5685           && gimple_call_lhs (stmt)
5686           && loop->simduid
5687           && TREE_CODE (gimple_call_arg (stmt, 0)) == SSA_NAME
5688           && loop->simduid
5689              == SSA_NAME_VAR (gimple_call_arg (stmt, 0)))
5690         {
5691           edge e = single_exit (loop);
5692           basic_block merge_bb = e->dest;
5693           imm_use_iterator imm_iter;
5694           use_operand_p use_p;
5695           tree lhs = gimple_call_lhs (stmt);
5696
5697           FOR_EACH_IMM_USE_FAST (use_p, imm_iter, lhs)
5698             {
5699               gimple use_stmt = USE_STMT (use_p);
5700               if (gimple_code (use_stmt) == GIMPLE_PHI
5701                   && gimple_bb (use_stmt) == merge_bb)
5702                 {
5703                   if (vec_stmt)
5704                     {
5705                       tree vfm1
5706                         = build_int_cst (unsigned_type_node,
5707                                          loop_vinfo->vectorization_factor - 1);
5708                       SET_PHI_ARG_DEF (use_stmt, e->dest_idx, vfm1);
5709                     }
5710                   return true;
5711                 }
5712             }
5713         }
5714
5715       return false;
5716     }
5717
5718   if (TREE_CODE (gimple_assign_lhs (stmt)) != SSA_NAME)
5719     return false;
5720
5721   /* FORNOW. CHECKME. */
5722   if (nested_in_vect_loop_p (loop, stmt))
5723     return false;
5724
5725   code = gimple_assign_rhs_code (stmt);
5726   op_type = TREE_CODE_LENGTH (code);
5727   rhs_class = get_gimple_rhs_class (code);
5728   gcc_assert (rhs_class != GIMPLE_UNARY_RHS || op_type == unary_op);
5729   gcc_assert (rhs_class != GIMPLE_BINARY_RHS || op_type == binary_op);
5730
5731   /* FORNOW: support only if all uses are invariant.  This means
5732      that the scalar operations can remain in place, unvectorized.
5733      The original last scalar value that they compute will be used.  */
5734
5735   for (i = 0; i < op_type; i++)
5736     {
5737       if (rhs_class == GIMPLE_SINGLE_RHS)
5738         op = TREE_OPERAND (gimple_op (stmt, 1), i);
5739       else
5740         op = gimple_op (stmt, i + 1);
5741       if (op
5742           && !vect_is_simple_use (op, stmt, loop_vinfo, NULL, &def_stmt, &def,
5743                                   &dt))
5744         {
5745           if (dump_enabled_p ())
5746             dump_printf_loc (MSG_MISSED_OPTIMIZATION, vect_location,
5747                              "use not simple.\n");
5748           return false;
5749         }
5750
5751       if (dt != vect_external_def && dt != vect_constant_def)
5752         return false;
5753     }
5754
5755   /* No transformation is required for the cases we currently support.  */
5756   return true;
5757 }
5758
5759 /* Kill any debug uses outside LOOP of SSA names defined in STMT.  */
5760
5761 static void
5762 vect_loop_kill_debug_uses (struct loop *loop, gimple stmt)
5763 {
5764   ssa_op_iter op_iter;
5765   imm_use_iterator imm_iter;
5766   def_operand_p def_p;
5767   gimple ustmt;
5768
5769   FOR_EACH_PHI_OR_STMT_DEF (def_p, stmt, op_iter, SSA_OP_DEF)
5770     {
5771       FOR_EACH_IMM_USE_STMT (ustmt, imm_iter, DEF_FROM_PTR (def_p))
5772         {
5773           basic_block bb;
5774
5775           if (!is_gimple_debug (ustmt))
5776             continue;
5777
5778           bb = gimple_bb (ustmt);
5779
5780           if (!flow_bb_inside_loop_p (loop, bb))
5781             {
5782               if (gimple_debug_bind_p (ustmt))
5783                 {
5784                   if (dump_enabled_p ())
5785                     dump_printf_loc (MSG_NOTE, vect_location,
5786                                      "killing debug use\n");
5787
5788                   gimple_debug_bind_reset_value (ustmt);
5789                   update_stmt (ustmt);
5790                 }
5791               else
5792                 gcc_unreachable ();
5793             }
5794         }
5795     }
5796 }
5797
5798
5799 /* This function builds ni_name = number of iterations.  Statements
5800    are emitted on the loop preheader edge.  */
5801
5802 static tree
5803 vect_build_loop_niters (loop_vec_info loop_vinfo)
5804 {
5805   tree ni = unshare_expr (LOOP_VINFO_NITERS (loop_vinfo));
5806   if (TREE_CODE (ni) == INTEGER_CST)
5807     return ni;
5808   else
5809     {
5810       tree ni_name, var;
5811       gimple_seq stmts = NULL;
5812       edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5813
5814       var = create_tmp_var (TREE_TYPE (ni), "niters");
5815       ni_name = force_gimple_operand (ni, &stmts, false, var);
5816       if (stmts)
5817         gsi_insert_seq_on_edge_immediate (pe, stmts);
5818
5819       return ni_name;
5820     }
5821 }
5822
5823
5824 /* This function generates the following statements:
5825
5826    ni_name = number of iterations loop executes
5827    ratio = ni_name / vf
5828    ratio_mult_vf_name = ratio * vf
5829
5830    and places them on the loop preheader edge.  */
5831
5832 static void
5833 vect_generate_tmps_on_preheader (loop_vec_info loop_vinfo,
5834                                  tree ni_name,
5835                                  tree *ratio_mult_vf_name_ptr,
5836                                  tree *ratio_name_ptr)
5837 {
5838   tree ni_minus_gap_name;
5839   tree var;
5840   tree ratio_name;
5841   tree ratio_mult_vf_name;
5842   int vf = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5843   edge pe = loop_preheader_edge (LOOP_VINFO_LOOP (loop_vinfo));
5844   tree log_vf;
5845
5846   log_vf = build_int_cst (TREE_TYPE (ni_name), exact_log2 (vf));
5847
5848   /* If epilogue loop is required because of data accesses with gaps, we
5849      subtract one iteration from the total number of iterations here for
5850      correct calculation of RATIO.  */
5851   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5852     {
5853       ni_minus_gap_name = fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5854                                        ni_name,
5855                                        build_one_cst (TREE_TYPE (ni_name)));
5856       if (!is_gimple_val (ni_minus_gap_name))
5857         {
5858           var = create_tmp_var (TREE_TYPE (ni_name), "ni_gap");
5859           gimple stmts = NULL;
5860           ni_minus_gap_name = force_gimple_operand (ni_minus_gap_name, &stmts,
5861                                                     true, var);
5862           gsi_insert_seq_on_edge_immediate (pe, stmts);
5863         }
5864     }
5865   else
5866     ni_minus_gap_name = ni_name;
5867
5868   /* Create: ratio = ni >> log2(vf) */
5869   /* ???  As we have ni == number of latch executions + 1, ni could
5870      have overflown to zero.  So avoid computing ratio based on ni
5871      but compute it using the fact that we know ratio will be at least
5872      one, thus via (ni - vf) >> log2(vf) + 1.  */
5873   ratio_name
5874     = fold_build2 (PLUS_EXPR, TREE_TYPE (ni_name),
5875                    fold_build2 (RSHIFT_EXPR, TREE_TYPE (ni_name),
5876                                 fold_build2 (MINUS_EXPR, TREE_TYPE (ni_name),
5877                                              ni_minus_gap_name,
5878                                              build_int_cst
5879                                                (TREE_TYPE (ni_name), vf)),
5880                                 log_vf),
5881                    build_int_cst (TREE_TYPE (ni_name), 1));
5882   if (!is_gimple_val (ratio_name))
5883     {
5884       var = create_tmp_var (TREE_TYPE (ni_name), "bnd");
5885       gimple stmts = NULL;
5886       ratio_name = force_gimple_operand (ratio_name, &stmts, true, var);
5887       gsi_insert_seq_on_edge_immediate (pe, stmts);
5888     }
5889   *ratio_name_ptr = ratio_name;
5890
5891   /* Create: ratio_mult_vf = ratio << log2 (vf).  */
5892
5893   if (ratio_mult_vf_name_ptr)
5894     {
5895       ratio_mult_vf_name = fold_build2 (LSHIFT_EXPR, TREE_TYPE (ratio_name),
5896                                         ratio_name, log_vf);
5897       if (!is_gimple_val (ratio_mult_vf_name))
5898         {
5899           var = create_tmp_var (TREE_TYPE (ni_name), "ratio_mult_vf");
5900           gimple stmts = NULL;
5901           ratio_mult_vf_name = force_gimple_operand (ratio_mult_vf_name, &stmts,
5902                                                      true, var);
5903           gsi_insert_seq_on_edge_immediate (pe, stmts);
5904         }
5905       *ratio_mult_vf_name_ptr = ratio_mult_vf_name;
5906     }
5907
5908   return;
5909 }
5910
5911
5912 /* Function vect_transform_loop.
5913
5914    The analysis phase has determined that the loop is vectorizable.
5915    Vectorize the loop - created vectorized stmts to replace the scalar
5916    stmts in the loop, and update the loop exit condition.  */
5917
5918 void
5919 vect_transform_loop (loop_vec_info loop_vinfo)
5920 {
5921   struct loop *loop = LOOP_VINFO_LOOP (loop_vinfo);
5922   basic_block *bbs = LOOP_VINFO_BBS (loop_vinfo);
5923   int nbbs = loop->num_nodes;
5924   int i;
5925   tree ratio = NULL;
5926   int vectorization_factor = LOOP_VINFO_VECT_FACTOR (loop_vinfo);
5927   bool grouped_store;
5928   bool slp_scheduled = false;
5929   gimple stmt, pattern_stmt;
5930   gimple_seq pattern_def_seq = NULL;
5931   gimple_stmt_iterator pattern_def_si = gsi_none ();
5932   bool transform_pattern_stmt = false;
5933   bool check_profitability = false;
5934   int th;
5935   /* Record number of iterations before we started tampering with the profile. */
5936   gcov_type expected_iterations = expected_loop_iterations_unbounded (loop);
5937
5938   if (dump_enabled_p ())
5939     dump_printf_loc (MSG_NOTE, vect_location, "=== vec_transform_loop ===\n");
5940
5941   /* If profile is inprecise, we have chance to fix it up.  */
5942   if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5943     expected_iterations = LOOP_VINFO_INT_NITERS (loop_vinfo);
5944
5945   /* Use the more conservative vectorization threshold.  If the number
5946      of iterations is constant assume the cost check has been performed
5947      by our caller.  If the threshold makes all loops profitable that
5948      run at least the vectorization factor number of times checking
5949      is pointless, too.  */
5950   th = LOOP_VINFO_COST_MODEL_THRESHOLD (loop_vinfo);
5951   if (th >= LOOP_VINFO_VECT_FACTOR (loop_vinfo) - 1
5952       && !LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
5953     {
5954       if (dump_enabled_p ())
5955         dump_printf_loc (MSG_NOTE, vect_location,
5956                          "Profitability threshold is %d loop iterations.\n",
5957                          th);
5958       check_profitability = true;
5959     }
5960
5961   /* Version the loop first, if required, so the profitability check
5962      comes first.  */
5963
5964   if (LOOP_REQUIRES_VERSIONING_FOR_ALIGNMENT (loop_vinfo)
5965       || LOOP_REQUIRES_VERSIONING_FOR_ALIAS (loop_vinfo))
5966     {
5967       vect_loop_versioning (loop_vinfo, th, check_profitability);
5968       check_profitability = false;
5969     }
5970
5971   tree ni_name = vect_build_loop_niters (loop_vinfo);
5972   LOOP_VINFO_NITERS_UNCHANGED (loop_vinfo) = ni_name;
5973
5974   /* Peel the loop if there are data refs with unknown alignment.
5975      Only one data ref with unknown store is allowed.  */
5976
5977   if (LOOP_VINFO_PEELING_FOR_ALIGNMENT (loop_vinfo))
5978     {
5979       vect_do_peeling_for_alignment (loop_vinfo, ni_name,
5980                                      th, check_profitability);
5981       check_profitability = false;
5982       /* The above adjusts LOOP_VINFO_NITERS, so cause ni_name to
5983          be re-computed.  */
5984       ni_name = NULL_TREE;
5985     }
5986
5987   /* If the loop has a symbolic number of iterations 'n' (i.e. it's not a
5988      compile time constant), or it is a constant that doesn't divide by the
5989      vectorization factor, then an epilog loop needs to be created.
5990      We therefore duplicate the loop: the original loop will be vectorized,
5991      and will compute the first (n/VF) iterations.  The second copy of the loop
5992      will remain scalar and will compute the remaining (n%VF) iterations.
5993      (VF is the vectorization factor).  */
5994
5995   if (LOOP_VINFO_PEELING_FOR_NITER (loop_vinfo)
5996       || LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo))
5997     {
5998       tree ratio_mult_vf;
5999       if (!ni_name)
6000         ni_name = vect_build_loop_niters (loop_vinfo);
6001       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, &ratio_mult_vf,
6002                                        &ratio);
6003       vect_do_peeling_for_loop_bound (loop_vinfo, ni_name, ratio_mult_vf,
6004                                       th, check_profitability);
6005     }
6006   else if (LOOP_VINFO_NITERS_KNOWN_P (loop_vinfo))
6007     ratio = build_int_cst (TREE_TYPE (LOOP_VINFO_NITERS (loop_vinfo)),
6008                 LOOP_VINFO_INT_NITERS (loop_vinfo) / vectorization_factor);
6009   else
6010     {
6011       if (!ni_name)
6012         ni_name = vect_build_loop_niters (loop_vinfo);
6013       vect_generate_tmps_on_preheader (loop_vinfo, ni_name, NULL, &ratio);
6014     }
6015
6016   /* 1) Make sure the loop header has exactly two entries
6017      2) Make sure we have a preheader basic block.  */
6018
6019   gcc_assert (EDGE_COUNT (loop->header->preds) == 2);
6020
6021   split_edge (loop_preheader_edge (loop));
6022
6023   /* FORNOW: the vectorizer supports only loops which body consist
6024      of one basic block (header + empty latch). When the vectorizer will
6025      support more involved loop forms, the order by which the BBs are
6026      traversed need to be reconsidered.  */
6027
6028   for (i = 0; i < nbbs; i++)
6029     {
6030       basic_block bb = bbs[i];
6031       stmt_vec_info stmt_info;
6032
6033       for (gphi_iterator si = gsi_start_phis (bb); !gsi_end_p (si);
6034            gsi_next (&si))
6035         {
6036           gphi *phi = si.phi ();
6037           if (dump_enabled_p ())
6038             {
6039               dump_printf_loc (MSG_NOTE, vect_location,
6040                                "------>vectorizing phi: ");
6041               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, phi, 0);
6042               dump_printf (MSG_NOTE, "\n");
6043             }
6044           stmt_info = vinfo_for_stmt (phi);
6045           if (!stmt_info)
6046             continue;
6047
6048           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6049             vect_loop_kill_debug_uses (loop, phi);
6050
6051           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6052               && !STMT_VINFO_LIVE_P (stmt_info))
6053             continue;
6054
6055           if (STMT_VINFO_VECTYPE (stmt_info)
6056               && (TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info))
6057                   != (unsigned HOST_WIDE_INT) vectorization_factor)
6058               && dump_enabled_p ())
6059             dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6060
6061           if (STMT_VINFO_DEF_TYPE (stmt_info) == vect_induction_def)
6062             {
6063               if (dump_enabled_p ())
6064                 dump_printf_loc (MSG_NOTE, vect_location, "transform phi.\n");
6065               vect_transform_stmt (phi, NULL, NULL, NULL, NULL);
6066             }
6067         }
6068
6069       pattern_stmt = NULL;
6070       for (gimple_stmt_iterator si = gsi_start_bb (bb);
6071            !gsi_end_p (si) || transform_pattern_stmt;)
6072         {
6073           bool is_store;
6074
6075           if (transform_pattern_stmt)
6076             stmt = pattern_stmt;
6077           else
6078             {
6079               stmt = gsi_stmt (si);
6080               /* During vectorization remove existing clobber stmts.  */
6081               if (gimple_clobber_p (stmt))
6082                 {
6083                   unlink_stmt_vdef (stmt);
6084                   gsi_remove (&si, true);
6085                   release_defs (stmt);
6086                   continue;
6087                 }
6088             }
6089
6090           if (dump_enabled_p ())
6091             {
6092               dump_printf_loc (MSG_NOTE, vect_location,
6093                                "------>vectorizing statement: ");
6094               dump_gimple_stmt (MSG_NOTE, TDF_SLIM, stmt, 0);
6095               dump_printf (MSG_NOTE, "\n");
6096             }
6097
6098           stmt_info = vinfo_for_stmt (stmt);
6099
6100           /* vector stmts created in the outer-loop during vectorization of
6101              stmts in an inner-loop may not have a stmt_info, and do not
6102              need to be vectorized.  */
6103           if (!stmt_info)
6104             {
6105               gsi_next (&si);
6106               continue;
6107             }
6108
6109           if (MAY_HAVE_DEBUG_STMTS && !STMT_VINFO_LIVE_P (stmt_info))
6110             vect_loop_kill_debug_uses (loop, stmt);
6111
6112           if (!STMT_VINFO_RELEVANT_P (stmt_info)
6113               && !STMT_VINFO_LIVE_P (stmt_info))
6114             {
6115               if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6116                   && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6117                   && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6118                       || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6119                 {
6120                   stmt = pattern_stmt;
6121                   stmt_info = vinfo_for_stmt (stmt);
6122                 }
6123               else
6124                 {
6125                   gsi_next (&si);
6126                   continue;
6127                 }
6128             }
6129           else if (STMT_VINFO_IN_PATTERN_P (stmt_info)
6130                    && (pattern_stmt = STMT_VINFO_RELATED_STMT (stmt_info))
6131                    && (STMT_VINFO_RELEVANT_P (vinfo_for_stmt (pattern_stmt))
6132                        || STMT_VINFO_LIVE_P (vinfo_for_stmt (pattern_stmt))))
6133             transform_pattern_stmt = true;
6134
6135           /* If pattern statement has def stmts, vectorize them too.  */
6136           if (is_pattern_stmt_p (stmt_info))
6137             {
6138               if (pattern_def_seq == NULL)
6139                 {
6140                   pattern_def_seq = STMT_VINFO_PATTERN_DEF_SEQ (stmt_info);
6141                   pattern_def_si = gsi_start (pattern_def_seq);
6142                 }
6143               else if (!gsi_end_p (pattern_def_si))
6144                 gsi_next (&pattern_def_si);
6145               if (pattern_def_seq != NULL)
6146                 {
6147                   gimple pattern_def_stmt = NULL;
6148                   stmt_vec_info pattern_def_stmt_info = NULL;
6149
6150                   while (!gsi_end_p (pattern_def_si))
6151                     {
6152                       pattern_def_stmt = gsi_stmt (pattern_def_si);
6153                       pattern_def_stmt_info
6154                         = vinfo_for_stmt (pattern_def_stmt);
6155                       if (STMT_VINFO_RELEVANT_P (pattern_def_stmt_info)
6156                           || STMT_VINFO_LIVE_P (pattern_def_stmt_info))
6157                         break;
6158                       gsi_next (&pattern_def_si);
6159                     }
6160
6161                   if (!gsi_end_p (pattern_def_si))
6162                     {
6163                       if (dump_enabled_p ())
6164                         {
6165                           dump_printf_loc (MSG_NOTE, vect_location,
6166                                            "==> vectorizing pattern def "
6167                                            "stmt: ");
6168                           dump_gimple_stmt (MSG_NOTE, TDF_SLIM,
6169                                             pattern_def_stmt, 0);
6170                           dump_printf (MSG_NOTE, "\n");
6171                         }
6172
6173                       stmt = pattern_def_stmt;
6174                       stmt_info = pattern_def_stmt_info;
6175                     }
6176                   else
6177                     {
6178                       pattern_def_si = gsi_none ();
6179                       transform_pattern_stmt = false;
6180                     }
6181                 }
6182               else
6183                 transform_pattern_stmt = false;
6184             }
6185
6186           if (STMT_VINFO_VECTYPE (stmt_info))
6187             {
6188               unsigned int nunits
6189                 = (unsigned int)
6190                   TYPE_VECTOR_SUBPARTS (STMT_VINFO_VECTYPE (stmt_info));
6191               if (!STMT_SLP_TYPE (stmt_info)
6192                   && nunits != (unsigned int) vectorization_factor
6193                   && dump_enabled_p ())
6194                   /* For SLP VF is set according to unrolling factor, and not
6195                      to vector size, hence for SLP this print is not valid.  */
6196                 dump_printf_loc (MSG_NOTE, vect_location, "multiple-types.\n");
6197             }
6198
6199           /* SLP. Schedule all the SLP instances when the first SLP stmt is
6200              reached.  */
6201           if (STMT_SLP_TYPE (stmt_info))
6202             {
6203               if (!slp_scheduled)
6204                 {
6205                   slp_scheduled = true;
6206
6207                   if (dump_enabled_p ())
6208                     dump_printf_loc (MSG_NOTE, vect_location,
6209                                      "=== scheduling SLP instances ===\n");
6210
6211                   vect_schedule_slp (loop_vinfo, NULL);
6212                 }
6213
6214               /* Hybrid SLP stmts must be vectorized in addition to SLP.  */
6215               if (!vinfo_for_stmt (stmt) || PURE_SLP_STMT (stmt_info))
6216                 {
6217                   if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6218                     {
6219                       pattern_def_seq = NULL;
6220                       gsi_next (&si);
6221                     }
6222                   continue;
6223                 }
6224             }
6225
6226           /* -------- vectorize statement ------------ */
6227           if (dump_enabled_p ())
6228             dump_printf_loc (MSG_NOTE, vect_location, "transform statement.\n");
6229
6230           grouped_store = false;
6231           is_store = vect_transform_stmt (stmt, &si, &grouped_store, NULL, NULL);
6232           if (is_store)
6233             {
6234               if (STMT_VINFO_GROUPED_ACCESS (stmt_info))
6235                 {
6236                   /* Interleaving. If IS_STORE is TRUE, the vectorization of the
6237                      interleaving chain was completed - free all the stores in
6238                      the chain.  */
6239                   gsi_next (&si);
6240                   vect_remove_stores (GROUP_FIRST_ELEMENT (stmt_info));
6241                 }
6242               else
6243                 {
6244                   /* Free the attached stmt_vec_info and remove the stmt.  */
6245                   gimple store = gsi_stmt (si);
6246                   free_stmt_vec_info (store);
6247                   unlink_stmt_vdef (store);
6248                   gsi_remove (&si, true);
6249                   release_defs (store);
6250                 }
6251
6252               /* Stores can only appear at the end of pattern statements.  */
6253               gcc_assert (!transform_pattern_stmt);
6254               pattern_def_seq = NULL;
6255             }
6256           else if (!transform_pattern_stmt && gsi_end_p (pattern_def_si))
6257             {
6258               pattern_def_seq = NULL;
6259               gsi_next (&si);
6260             }
6261         }                       /* stmts in BB */
6262     }                           /* BBs in loop */
6263
6264   slpeel_make_loop_iterate_ntimes (loop, ratio);
6265
6266   /* Reduce loop iterations by the vectorization factor.  */
6267   scale_loop_profile (loop, GCOV_COMPUTE_SCALE (1, vectorization_factor),
6268                       expected_iterations / vectorization_factor);
6269   loop->nb_iterations_upper_bound
6270     = wi::udiv_floor (loop->nb_iterations_upper_bound, vectorization_factor);
6271   if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6272       && loop->nb_iterations_upper_bound != 0)
6273     loop->nb_iterations_upper_bound = loop->nb_iterations_upper_bound - 1;
6274   if (loop->any_estimate)
6275     {
6276       loop->nb_iterations_estimate
6277         = wi::udiv_floor (loop->nb_iterations_estimate, vectorization_factor);
6278        if (LOOP_VINFO_PEELING_FOR_GAPS (loop_vinfo)
6279            && loop->nb_iterations_estimate != 0)
6280          loop->nb_iterations_estimate = loop->nb_iterations_estimate - 1;
6281     }
6282
6283   if (dump_enabled_p ())
6284     {
6285       dump_printf_loc (MSG_NOTE, vect_location,
6286                        "LOOP VECTORIZED\n");
6287       if (loop->inner)
6288         dump_printf_loc (MSG_NOTE, vect_location,
6289                          "OUTER LOOP VECTORIZED\n");
6290       dump_printf (MSG_NOTE, "\n");
6291     }
6292 }